From 440320e1f0d4df43be1bcca805a3d48b0c2b60bd Mon Sep 17 00:00:00 2001 From: Apostolof Date: Sun, 28 Jan 2018 20:11:05 +0200 Subject: [PATCH] Minor kernel fixes --- mean_shift_cuda_shared_mem/meanshift_kernels.cu | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu index f3e82e9..d109d6b 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu @@ -58,7 +58,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix int BLOCK_SIZE = blockDim.y; int block_row = blockIdx.x; int block_col = blockIdx.y; - + // each thread computes one element of new_shift by accumulating results into cell_value double cell_value = 0; @@ -78,11 +78,18 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points // respectively extern __shared__ double joined_shared_memory[]; + // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used // for s_sub_original_points double *s_sub_kernel_matrix = &(joined_shared_memory[0]); double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]); + // cancel execution if allocation failed + if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){ + __threadfence(); + asm("trap;"); + } + // loops over all the sub-matrices of kernel_matrix and original_points that are required to // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results for (int sub_matrix_index = 0; @@ -104,10 +111,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix // synchronizes to make sure the sub-matrices are loaded before starting the computation __syncthreads(); + int curr_col_begin = sub_matrix_index * BLOCK_SIZE; // multiplies sub_kernel_matrix and sub_original_points for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ - cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] * - s_sub_original_points[element_index * BLOCK_SIZE + col]; + if (curr_col_begin + element_index < kernel_matrix.width){ + cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] * + s_sub_original_points[element_index * BLOCK_SIZE + col]; + } } // synchronizes to make sure that the preceding computation is done before loading two new