diff --git a/mean_shift_cuda/meanshift_gpu_utils.cu b/mean_shift_cuda/meanshift_gpu_utils.cu
index 555709c..af9454c 100644
--- a/mean_shift_cuda/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda/meanshift_gpu_utils.cu
@@ -361,7 +361,7 @@ void calculate_norm(Matrix d_mean_shift_vector, double *current_norm){
     dim3 dimGrid;
     do {
         dimBlock.x = requested_block_size;
-        dimBlock.y = d_mean_shift_vector.width;
+        dimBlock.y = 1;
         dimGrid.x = (d_mean_shift_vector.height + dimBlock.x - 1) / dimBlock.x;
         dimGrid.y = 1;
 
diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
index f0d1c7b..dfb6496 100644
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -86,13 +86,21 @@ __global__ void norm(Matrix mean_shift_vector, double *current_norm) {
     // by accumulating results into cell_value
     double cell_value = 0;
     int row = blockIdx.x * blockDim.x + threadIdx.x;
-    int col = blockIdx.y * blockDim.y + threadIdx.y;
 
     // performs calculations only if thread's indexes are within matrix bounds
-    if (row * mean_shift_vector.width + col >= mean_shift_vector.width * mean_shift_vector.height){
+    if (row >= denominator.height){
         return;
     }
 
+//    for (int column = 0; column < kernel_matrix.width; ++column){
+//        cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
+//    }
+    denominator.elements[row] = cell_value;
+    // performs calculations only if thread's indexes are within matrix bounds
+//    if (row * mean_shift_vector.width + col >= mean_shift_vector.width * mean_shift_vector.height){
+//        return;
+//    }
+
     for (int element_index = 0; element_index < mean_shift_vector.width; ++element_index){
         cell_value += mean_shift_vector.elements[row * mean_shift_vector.width + element_index]
                       * mean_shift_vector.elements[row * mean_shift_vector.width + element_index];