diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
index 4115289..fdc1867 100644
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -27,7 +27,7 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
     }
 
     int dimensions = shifted_points.width;
-
+    // calculate distance
     double sum = 0, dif;
     for (int i=0; i<dimensions; i++){
         dif = shifted_points.elements[row * dimensions + i] - original_points.elements[col * dimensions + i];
@@ -46,4 +46,29 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
     if (row == col){
         kernel_matrix.elements[row * kernel_matrix.width + col] += 1;
     }
+}
+
+// Row sums: denominator.elements[row] = sum over j < total of kernel_matrix(row, j).
+// One thread per row (the one with global y index 0), so no atomics or zero-fill needed.
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix, int total){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    // out-of-range rows and any surplus threads along y have nothing to do
+    if (row >= total || col != 0)
+        return;
+    double sum = 0;
+    for (int j = 0; j < total; j++)
+        sum += kernel_matrix.elements[row * kernel_matrix.width + j];
+    denominator.elements[row] = sum;
+}
+
+// Serial (host) reference for the denominator: denominator[i] is the sum of row i of
+// kernel_matrix. NOTE(review): as written this loop sits at file scope, outside any
+// function -- presumably a scratch copy kept for porting; confirm and remove or wrap it.
+for (int i=0; i<NUMBER_OF_POINTS; i++){
+    double sum = 0;
+    for (int j=0; j<NUMBER_OF_POINTS; j++){
+        sum = sum + kernel_matrix[i][j];
+    }
+    denominator[i] = sum;
 }
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_kernels.h b/mean_shift_cuda/meanshift_kernels.h
index 7377805..05186fb 100644
--- a/mean_shift_cuda/meanshift_kernels.h
+++ b/mean_shift_cuda/meanshift_kernels.h
@@ -13,4 +13,6 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
 __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
     , double deviation, Matrix kernel_matrix);
 
+// Accumulates each row of kernel_matrix into denominator; total bounds the row/col indices.
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix, int total);
 #endif //SERIAL_KERNELS_H
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu
index fd55372..a6690b7 100644
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@@ -183,18 +183,21 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
         kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
         denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
     }
+    // TODO move arrays to device and create global kernel for the iteration
 
     // finds pairwise distance matrix (inside radius)
     // [I, D] = rangesearch(x,y,h);
     calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
-    // calculate denominator
-    for (int i=0; i<NUMBER_OF_POINTS; i++){
-        double sum = 0;
-        for (int j=0; j<NUMBER_OF_POINTS; j++){
-            sum = sum + kernel_matrix[i][j];
-        }
-        denominator[i] = sum;
-    }
+
+//    // calculate denominator
+//    for (int i=0; i<NUMBER_OF_POINTS; i++){
+//        double sum = 0;
+//        for (int j=0; j<NUMBER_OF_POINTS; j++){
+//            sum = sum + kernel_matrix[i][j];
+//        }
+//        denominator[i] = sum;
+//    }
+    calculate_denominator(kernel_matrix);
 
     // creates new y vector
     double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
@@ -244,7 +247,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
     return iteration;
 }
 
-// TODO check why there's is a difference in the norm calculate in matlab
+
 double norm(double **matrix, int rows, int cols){
     double sum=0, temp_mul=0;
     for (int i=0; i<rows; i++) {
@@ -432,3 +435,49 @@ void save_matrix(double **matrix, int iteration){
         fprintf(file, "\n");
     }
 }
+
+// Computes the row sums of the host kernel_matrix on the GPU (denominator_kernel) and
+// copies them into `denominator`. NOTE(review): `denominator` is not a parameter; it is
+// assumed to be a double array of NUMBER_OF_POINTS entries visible here -- confirm.
+void calculate_denominator(double **kernel_matrix){
+    static bool first_iter = true;
+
+    // allocates memory for the denominator vector (1 x NUMBER_OF_POINTS) in GPU
+    Matrix d_denominator_matrix;
+    d_denominator_matrix.width = NUMBER_OF_POINTS;
+    d_denominator_matrix.height = 1;
+    size_t denominator_size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, denominator_size) );
+
+    // allocates memory for kernel_matrix in GPU and copies the array
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    size_t kernel_size = (size_t)NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_size) );
+    // assumes the rows of kernel_matrix are contiguous in host memory -- TODO confirm
+    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
+            , kernel_size, cudaMemcpyHostToDevice) );
+
+    // one thread per row of kernel_matrix; block size capped by the device limit
+    int max_block_size = device_properties.maxThreadsPerBlock;
+    dim3 dimBlock((NUMBER_OF_POINTS < max_block_size) ? NUMBER_OF_POINTS : max_block_size);
+    dim3 dimGrid((NUMBER_OF_POINTS + dimBlock.x - 1) / dimBlock.x);
+
+    if (first_iter && params.verbose){
+        printf("calculate_denominator called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix, NUMBER_OF_POINTS);
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    gpuErrchk( cudaMemcpy(denominator, d_denominator_matrix.elements
+            , denominator_size, cudaMemcpyDeviceToHost) );
+
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_denominator_matrix.elements) );
+}
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h
index 9d80c14..c1f3e5e 100644
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@@ -71,4 +71,8 @@ void print_matrix(double **array, int rows, int cols);
 void save_matrix(double **matrix
     , int iteration);
 
+//Function calculate_denominator copies kernel_matrix to the GPU, launches the denominator
+//kernel and copies the result back to the host.
+void calculate_denominator(double **kernel_matrix);
+
 #endif //SERIAL_UTILS_H
\ No newline at end of file