anapt committed 7 years ago · commit 265ffd5ede

mean_shift_cuda_shared_mem/meanshift_kernels.cu · 16 changes

@@ -58,7 +58,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int BLOCK_SIZE = blockDim.y;
     int block_row = blockIdx.x;
     int block_col = blockIdx.y;
     // each thread computes one element of new_shift by accumulating results into cell_value
     double cell_value = 0;
@@ -78,11 +78,18 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
     // respectively
     extern __shared__ double joined_shared_memory[];
     // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
     // for s_sub_original_points
     double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
     double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
+    // cancel execution if less dynamic shared memory was provided at launch than needed;
+    // the PTX special register %dynamic_smem_size holds the number of bytes passed at launch
+    unsigned int allocated_shared_bytes;
+    asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(allocated_shared_bytes));
+    if (allocated_shared_bytes < 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(double)){
+        __threadfence();
+        asm("trap;");
+    }
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
     for (int sub_matrix_index = 0;
@@ -104,10 +111,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
         // synchronizes to make sure the sub-matrices are loaded before starting the computation
         __syncthreads();
+        int curr_col_begin = sub_matrix_index * BLOCK_SIZE;
         // multiplies sub_kernel_matrix and sub_original_points
         for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
-            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
-                    s_sub_original_points[element_index * BLOCK_SIZE + col];
+            if (curr_col_begin + element_index < kernel_matrix.width){
+                cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
+                        s_sub_original_points[element_index * BLOCK_SIZE + col];
+            }
         }
         // synchronizes to make sure that the preceding computation is done before loading two new
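
For context, the size of the dynamically allocated joined_shared_memory[] block that the new guard validates is never declared inside the kernel; the host supplies it as the third launch-configuration argument. The sketch below shows that sizing pattern in isolation. It is a minimal, self-contained example: the kernel name tile_demo_kernel, its body, and the launch geometry are hypothetical stand-ins, not the repository's actual shift_points_kernel launch code.

// Minimal sketch of sizing dynamic shared memory at launch (illustrative names only).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void tile_demo_kernel(){
    int BLOCK_SIZE = blockDim.y;
    // dynamically allocated shared memory, split into two tiles the same way
    // joined_shared_memory[] is split in shift_points_kernel
    extern __shared__ double joined_shared_memory[];
    double *tile_a = &joined_shared_memory[0];
    double *tile_b = &joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE];

    int idx = threadIdx.y * BLOCK_SIZE + threadIdx.x;
    tile_a[idx] = idx;              // touch both tiles so the allocation is exercised
    tile_b[idx] = 2.0 * idx;
}

int main(){
    int BLOCK_SIZE = 16;            // illustrative value
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    // one BLOCK_SIZE x BLOCK_SIZE tile of doubles for each of the two sub-matrices;
    // this is the amount the in-kernel guard compares against
    size_t shared_mem_bytes = 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(double);

    tile_demo_kernel<<<1, block, shared_mem_bytes>>>();
    printf("launch: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
    return 0;
}

If the host passed fewer bytes than 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(double), the guard added in this commit traps instead of letting the kernel read past the end of the shared-memory block.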

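The new curr_col_begin + element_index < kernel_matrix.width check matters when the inner dimension of the multiplication (the width of kernel_matrix) is not a multiple of BLOCK_SIZE: the loop over sub_matrix_index then ends on a partially filled tile, and without the guard the trailing columns of the shared tiles would be folded into cell_value. A short worked example with hypothetical sizes (the real dimensions depend on the data set, not on anything in this commit):

// Hypothetical sizes, for illustration only.
int kernel_matrix_width = 50;      // inner dimension of the multiplication
int BLOCK_SIZE = 16;

// ceiling division: how many BLOCK_SIZE-wide tiles the accumulation loop has to visit
int num_tiles = (kernel_matrix_width + BLOCK_SIZE - 1) / BLOCK_SIZE;    // 4

int last_tile_begin = (num_tiles - 1) * BLOCK_SIZE;             // curr_col_begin in the last iteration = 48
int valid_elements  = kernel_matrix_width - last_tile_begin;    // only 2 of the 16 columns hold real data

So in the last iteration only valid_elements of the BLOCK_SIZE accumulation steps pass the guard; the rest are skipped instead of multiplying leftover shared-memory contents.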