Minor kernel fixes

8 years ago · 440320e1f0
1 changed files with 13 additions and 3 deletions
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@ -58,7 +58,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    int BLOCK_SIZE = blockDim.y;
    int block_row = blockIdx.x;
    int block_col = blockIdx.y;
-    
+
    // each thread computes one element of new_shift by accumulating results into cell_value
    double cell_value = 0;

@ -78,11 +78,18 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
    // respectively
    extern __shared__ double joined_shared_memory[];
+
    // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
    // for s_sub_original_points
    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);

+    // cancel execution if allocation failed
+    if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){
+        __threadfence();
+        asm("trap;");
+    }
+
    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
    // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
    for (int sub_matrix_index = 0;
@ -104,10 +111,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
        // synchronizes to make sure the sub-matrices are loaded before starting the computation
        __syncthreads();

+        int curr_col_begin = sub_matrix_index * BLOCK_SIZE;
        // multiplies sub_kernel_matrix and sub_original_points
        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
-            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
-                s_sub_original_points[element_index * BLOCK_SIZE + col];
+            if (curr_col_begin + element_index < kernel_matrix.width){
+                cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
+                    s_sub_original_points[element_index * BLOCK_SIZE + col];
+            }
        }

        // synchronizes to make sure that the preceding computation is done before loading two new