|
@ -80,3 +80,57 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix |
|
|
new_shift.elements[row * new_shift.width + col] - |
|
|
new_shift.elements[row * new_shift.width + col] - |
|
|
shifted_points.elements[row * new_shift.width + col]; |
|
|
shifted_points.elements[row * new_shift.width + col]; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/*
 * Computes the Frobenius (L2) norm of mean_shift_vector, i.e. the square root
 * of the sum of the squares of all width * height elements, and stores the
 * result in *current_norm.
 *
 * Parameters:
 *   mean_shift_vector  matrix whose elements are read as one contiguous
 *                      row-major array of width * height values
 *   current_norm       output location written by the kernel; receives the
 *                      single scalar result
 *
 * Launch configuration:
 *   Any grid/block shape is accepted. Only block (0, 0, 0) performs work, all
 *   other blocks exit immediately. A single block is used so that the result
 *   is complete and deterministic when the kernel finishes, without requiring
 *   the caller to zero-initialise or post-process *current_norm.
 *
 * Shared memory:
 *   Uses 1024 * sizeof(double) bytes of static shared memory (one slot per
 *   thread; 1024 is the hardware limit on threads per block).
 */
__global__ void norm(Matrix mean_shift_vector, double *current_norm) {
    // whole blocks leave here, so the barriers below are still reached by
    // every thread of the one block that keeps running
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0){
        return;
    }

    // one partial sum of squares per thread of the block
    __shared__ double partial_sums[1024];

    const int threads_in_block = blockDim.x * blockDim.y * blockDim.z;
    const int thread_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

    // 64-bit element count so that width * height cannot overflow an int
    const long long number_of_elements =
        (long long)mean_shift_vector.width * (long long)mean_shift_vector.height;

    // each thread accumulates the squares of the elements it owns, striding
    // over the flat array by the number of threads in the block
    double cell_value = 0;
    for (long long element_index = thread_index; element_index < number_of_elements;
            element_index += threads_in_block){
        const double element = mean_shift_vector.elements[element_index];
        cell_value += element * element;
    }
    partial_sums[thread_index] = cell_value;
    __syncthreads();

    // pairwise reduction that also works when the thread count is not a power
    // of two: in every step the upper part [half, active) is folded onto the
    // lower part, so written and read slots never overlap within a step
    for (int active = threads_in_block; active > 1; ){
        const int half = (active + 1) / 2;
        if (thread_index + half < active){
            partial_sums[thread_index] += partial_sums[thread_index + half];
        }
        // kept outside the branch so every thread of the block reaches it
        __syncthreads();
        active = half;
    }

    // a single thread publishes the result, so there is exactly one writer
    if (thread_index == 0){
        *current_norm = sqrt(partial_sums[0]);
    }
}