diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
index 8bc80ed..2acbfea 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -122,18 +122,6 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
         &tmp_w_memcpy_time);
     w_memcpy_time += tmp_w_memcpy_time;
 
-    /*for (int row=0; row<2; ++row){
-        for (int col=0; col<2; ++col){
-            printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
-            printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
-            printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
-        }
-    }*/
-
-    /*if(is_first_recursion){
-        exit(0);
-    }*/
-
     // frees previously shifted points, they're now garbage
     free((*shifted_points)[0]);
     gpuErrchk( cudaFree(d_shifted_points.elements) );
@@ -328,13 +316,16 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
     do {
         /*dimBlock.x = requested_block_size;
         dimBlock.y = d_new_shift.width;*/
-        dimBlock.x = 2;
-        dimBlock.y = 2;
+        dimBlock.x = min(d_new_shift.width, d_new_shift.height);
+        dimBlock.y = min(d_new_shift.width, d_new_shift.height);
         dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
-        dimGrid.y = 1;
+        dimGrid.y = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
+
+        int shared_memory_size = dimBlock.x * 2 * sizeof(double);
+        // launch syntax: kernel<<<gridDim, blockDim, dynamicSharedMemBytes>>>(args...);
 
-        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
-            d_new_shift, d_denominator, d_mean_shift_vector);
+        shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
+            d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
         if (cudaGetLastError() != cudaSuccess){
             --requested_block_size;
         } else {
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index 99368b6..09dada6 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -74,11 +74,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     // each thread block computes one sub-matrix sub_new_shift of C
     Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
 
-    // shared memory used to store sub_kernel_matrix and sub_original_points respectively
-    __shared__ double *s_sub_kernel_matrix;
-    s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
-    __shared__ double *s_sub_original_points;
-    s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
+    // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
+    // respectively
+    extern __shared__ double joined_shared_memory[];
+    // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
+    // for s_sub_original_points
+    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
+    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
 
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results