diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu
index a82d1a3..e76c2bb 100644
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@@ -5,10 +5,10 @@
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 
-int DEVIATION = 31000;
-int NUMBER_OF_POINTS = 5000;
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 587;
 int DIMENSIONS = 2;
-char* POINTS_FILENAME = "../data/s4";
+const char *POINTS_FILENAME = "../data/X.bin";
 const char *LABELS_FILENAME = "../data/L.bin";
 
 parameters params;
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
index 2a72557..6566df4 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -40,6 +40,9 @@ void set_GPU(){
     }
     // sets the device
     gpuErrchk( cudaSetDevice(max_device) );
+    // lastly sets shared memory bank size to 8 bytes since data are represented by doubles
+    gpuErrchk( cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) );
+
     if (params.verbose){
         printf("Device chosen is \"%s\"\n"
             "Device has %d multi processors and compute capability %d.%d\n"
@@ -324,8 +327,8 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
 
     // size for kernel's dynamically allocated array
     // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double
-    // the arrays nedded in kernel are two
-    shared_memory_size = dimBlock.x * dimBlock.x * sizeof(double) * 2;
+    // the arrays needed in kernel are two
+    shared_memory_size = (int)(dimBlock.x * dimBlock.x * sizeof(double) * 2);
     shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
         d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
 
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index d109d6b..9c2d948 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -55,7 +55,7 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
 
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
     Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
-    int BLOCK_SIZE = blockDim.y;
+    int BLOCK_SIZE = blockDim.x;
     int block_row = blockIdx.x;
     int block_col = blockIdx.y;
 
@@ -67,8 +67,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int col = threadIdx.y;
 
     // performs calculations only if thread's indexes are within matrix bounds
-    if ((BLOCK_SIZE * block_row + row) >= new_shift.height ||
-        (BLOCK_SIZE * block_col + col) >= new_shift.width){
+    if ((BLOCK_SIZE * block_row + row) > new_shift.height ||
+        (BLOCK_SIZE * block_col + col) > new_shift.width){
         return;
     }
 
@@ -78,17 +78,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
     // respectively
    extern __shared__ double joined_shared_memory[];
+    // makes sure enough memory has been allocated
+    joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0;
 
     // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
     // for s_sub_original_points
-    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
-    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
-
-    // cancel execution if allocation failed
-    if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){
-        __threadfence();
-        asm("trap;");
-    }
+    double *s_sub_kernel_matrix = (double *)joined_shared_memory;
+    double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE];
 
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
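Note (not part of the patch): the shared-memory changes above follow the standard CUDA pattern of a single dynamically sized extern __shared__ buffer, whose byte count is passed as the third launch parameter and which the kernel then partitions into sub-arrays by pointer offset. The standalone sketch below is a minimal illustration of that pattern plus the 8-byte bank configuration used in set_GPU(); names such as split_shared_demo and the toy sizes are hypothetical, not code from this repository.

#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// One dynamically allocated shared buffer split into two halves by pointer
// offset, mirroring how joined_shared_memory is carved into
// s_sub_kernel_matrix and s_sub_original_points in the patch.
__global__ void split_shared_demo(const double *in, double *out, int n){
    extern __shared__ double joined[];
    double *first  = joined;                            // first BLOCK_SIZE*BLOCK_SIZE doubles
    double *second = &joined[BLOCK_SIZE * BLOCK_SIZE];  // second BLOCK_SIZE*BLOCK_SIZE doubles

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;

    first[threadIdx.x]  = in[tid];
    second[threadIdx.x] = 2.0 * in[tid];
    __syncthreads();

    out[tid] = first[threadIdx.x] + second[threadIdx.x];
}

int main(){
    const int n = 256;

    // 8-byte banks suit double-typed shared memory, as in the set_GPU() change
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    double *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(double));
    cudaMalloc(&d_out, n * sizeof(double));

    double h_in[n], h_out[n];
    for (int i = 0; i < n; ++i) h_in[i] = (double)i;
    cudaMemcpy(d_in, h_in, n * sizeof(double), cudaMemcpyHostToDevice);

    // same sizing rule as shared_memory_size: two arrays of BLOCK_SIZE*BLOCK_SIZE doubles
    size_t shared_bytes = BLOCK_SIZE * BLOCK_SIZE * sizeof(double) * 2;
    int threads = BLOCK_SIZE * BLOCK_SIZE;
    int blocks  = (n + threads - 1) / threads;
    split_shared_demo<<<blocks, threads, shared_bytes>>>(d_in, d_out, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
    printf("out[10] = %f (expected %f)\n", h_out[10], 3.0 * h_in[10]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}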