From 440320e1f0d4df43be1bcca805a3d48b0c2b60bd Mon Sep 17 00:00:00 2001
From: Apostolof <apotwohd@gmail.com>
Date: Sun, 28 Jan 2018 20:11:05 +0200
Subject: [PATCH] Minor kernel fixes

---
 mean_shift_cuda_shared_mem/meanshift_kernels.cu | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index f3e82e9..d109d6b 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -58,7 +58,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int BLOCK_SIZE = blockDim.y;
     int block_row = blockIdx.x;
     int block_col = blockIdx.y;
-    
+
     // each thread computes one element of new_shift by accumulating results into cell_value
     double cell_value = 0;
 
@@ -78,11 +78,18 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
     // respectively
     extern __shared__ double joined_shared_memory[];
+
     // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
     // for s_sub_original_points
     double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
     double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
 
+    // meant to cancel execution on failed allocation; NOTE(review): sizeof here is pointer size
+    if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){
+        __threadfence();
+        asm("trap;");
+    }
+
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
     for (int sub_matrix_index = 0;
@@ -104,10 +111,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
         // synchronizes to make sure the sub-matrices are loaded before starting the computation
         __syncthreads();
 
+        int curr_col_begin = sub_matrix_index * BLOCK_SIZE;
         // multiplies sub_kernel_matrix and sub_original_points
         for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
-            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
-                s_sub_original_points[element_index * BLOCK_SIZE + col];
+            if (curr_col_begin + element_index < kernel_matrix.width){
+                cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
+                    s_sub_original_points[element_index * BLOCK_SIZE + col];
+            }
         }
 
         // synchronizes to make sure that the preceding computation is done before loading two new