diff --git a/mean_shift_cuda/Makefile b/mean_shift_cuda/Makefile
index f25d776..a931e40 100644
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@@ -3,11 +3,11 @@ SHELL := /bin/bash
 # ============================================
 # COMMANDS
 
-CC = /usr/local/cuda/bin/nvcc -ccbin gcc
+CC = /usr/local/cuda/bin/nvcc
 RM = rm -f
-CFLAGS=-lm -O3 -I.
-OBJ=meanshift.o meanshift_declarations.o
-DEPS=meanshift_declarations.h
+CFLAGS= -arch=sm_21 -lm -O0 -I.
+OBJ=meanshift.o meanshift_utils.o meanshift_kernels.o
+DEPS=meanshift_utils.h meanshift_kernels.h
 
 # ==========================================
 # TARGETS
@@ -21,8 +21,8 @@ all: $(EXECUTABLES)
 # ==========================================
 # DEPENDENCIES (HEADERS)
 
-%.o: %.c $(DEPS)
-	$(CC) -c -o $@ $< $(CFLAGS)
+%.o: %.cu $(DEPS)
+	$(CC) -x cu $(CFLAGS) -dc $< -o $@
 
 .PRECIOUS: $(EXECUTABLES) $(OBJ)
 
@@ -30,7 +30,7 @@ all: $(EXECUTABLES)
 # EXECUTABLE (MAIN)
 
 $(EXECUTABLES): $(OBJ)
-	$(CC) -o $@ $^ $(CFLAGS)
+	$(CC) $(CFLAGS) $(OBJ) -o $@
 
 clean:
-	$(RM) *.o *~ $(EXECUTABLES)
+	$(RM) *.o *~ $(EXECUTABLES)
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift.c b/mean_shift_cuda/meanshift.c
deleted file mode 100644
index 8f4a645..0000000
--- a/mean_shift_cuda/meanshift.c
+++ /dev/null
@@ -1,270 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <math.h>
-#include <float.h>
-
-#include <cuda_runtime.h>
-
-#include "meanshift_declarations.h"
-#define N       512
-
-int NUMBER_OF_POINTS = 600;
-int DIMENSIONS = 2;
-char* POINTS_FILENAME = "../data/X.bin";
-char* LABELS_FILENAME = "../data/L.bin";
-
-struct timeval startwtime, endwtime;
-double seq_time;
-
-__device__ double norm(double **matrix, int rows, int cols){
-
-    double sum=0, temp_mul=0;
-    for (int i=0; i<rows; i++) {
-        for (int j=0; j<cols; j++) {
-            temp_mul = matrix[i][j] * matrix[i][j];
-            sum = sum + temp_mul;
-        }
-    }
-    double norm = sqrt(sum);
-    return norm;
-}
-
-int main(int argc, char **argv){
-    int h = 1;
-
-    //get_args(argc, argv, &h); commented out while in development
-
-    FILE *f;
-//    f = fopen(X, "rb");
-//    fseek(f, 0L, SEEK_END);
-//    long int pos = ftell(f);
-//    fclose(f);
-//    int elements = pos / sizeof(double);  // number of total elements (points*dimension)
-//    int points = elements/DIMENSIONS;
-//    //printf("points : %d \n", points);
-    f = fopen(POINTS_FILENAME, "rb");
-    double **vectors;
-    vectors = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-    for (int i=0; i<NUMBER_OF_POINTS; i++){
-        int out = fread(vectors[i], sizeof(double), DIMENSIONS, f);
-    }
-
-    save_matrix(vectors, 0);
-
-    // initializing file that will contain the labels (train)
-    f = fopen(LABELS_FILENAME, "rb");
-    // NOTE : Labels were classified as <class 'numpy.uint8'>
-    // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
-    fseek(f, 0L, SEEK_END);
-    long int pos = ftell(f);
-    rewind(f);
-    //printf("position : %ld \n", pos);
-    int label_elements = pos/ sizeof(char);
-    char *labels = (char*)malloc(label_elements* sizeof(char));
-    fseek(f, 0L, SEEK_SET);
-    int out = fread(labels, sizeof(char), label_elements, f);
-    fclose(f);
-
-    // MEAN SHIFT OPTIONS
-    parameters params;
-    params.epsilon = 0.0001;
-    params.verbose = false;
-    params.display = false;
-    parameters *opt;
-    opt = &params;
-
-    double **shifted_points;
-    // tic
-    gettimeofday (&startwtime, NULL);
-
-    int iterations = meanshift(vectors, &shifted_points, h, opt, 1);
-
-    // toc
-    gettimeofday (&endwtime, NULL);
-    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
-    printf("%s wall clock time = %f\n","Mean Shift", seq_time);
-
-    //TODO write output points to file -> plot later
-    //save_matrix(shifted_points, iterations);
-}
-
-int meanshift(double **original_points, double ***shifted_points, int h
-        , parameters *opt, int iteration){
-
-    // allocates space and copies original points on first iteration
-    if (iteration == 1){
-        (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
-    }
-
-    // mean shift vector
-    double **mean_shift_vector;
-    mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-    // initialize elements of mean_shift_vector to inf
-    for (int i=0;i<NUMBER_OF_POINTS;i++){
-        for (int j=0;j<DIMENSIONS;j++){
-            mean_shift_vector[i][j] = DBL_MAX;
-        }
-    }
-
-
-    /** allocate memory **/
-    double **kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
-    double *denominator = malloc(NUMBER_OF_POINTS * sizeof(double));
-    // create new y vector
-    double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-
-
-    double * d_kernel_matrix;
-    size_t pitch_kernel_matrix;
-    cudaMallocPitch(&d_kernel_matrix, &pitch_kernel_matrix,
-                    NUMBER_OF_POINTS * sizeof(double), NUMBER_OF_POINTS);
-
-    double * d_denominator;
-    cudaMalloc(&d_denominator, NUMBER_OF_POINTS * sizeof(double));
-
-    double * d_new_shift;
-    size_t pitch_new_shift;
-    cudaMallocPitch(&d_new_shift, &pitch_new_shift,
-                    NUMBER_OF_POINTS * sizeof(double), DIMENSIONS);
-
-    double * d_shifted_points;
-    size_t pitch_shifted_points;
-    cudaMallocPitch(&d_shifted_points, &pitch_shifted_points,
-                    NUMBER_OF_POINTS * sizeof(double), DIMENSIONS);
-
-    double * d_mean_shift_vector;
-    size_t pitch_mean_shift_vector;
-    cudaMallocPitch(&d_mean_shift_vector, &pitch_mean_shift_vector,
-                    NUMBER_OF_POINTS * sizeof(double), DIMENSIONS);
-
-
-    cudaMemcpy2D(d_shifted_points, NUMBER_OF_POINTS * sizeof(double), *shifted_points,
-                 pitch_shifted_points, NUMBER_OF_POINTS * sizeof(double),
-                 DIMENSIONS, cudaMemcpyHostToDevice);
-
-    cudaMemcpy2D(d_mean_shift_vector, NUMBER_OF_POINTS * sizeof(double), *mean_shift_vector,
-                 pitch_mean_shift_vector, NUMBER_OF_POINTS * sizeof(double),
-                 DIMENSIONS, cudaMemcpyHostToDevice);
-
-
-    // TODO REFACTOR AS A KERNEL
-
-    for (int i=0; i<NUMBER_OF_POINTS; i++){
-        double sum = 0;
-        for (int j=0; j<NUMBER_OF_POINTS; j++){
-            double dist_sum = 0;
-            for (int p=0; p<DIMENSIONS; p++){
-                double dif = ((*shifted_points)[i])[p]-(original_points[j])[p];
-                dist_sum += dif * dif;
-            }
-            double dist = sqrt(dist_sum);
-
-            if (dist < h*h){
-                kernel_matrix[i][j] = dist * dist;
-                // compute kernel matrix
-                double pow = ((-1)*(kernel_matrix[i][j]))/(2*(h*h));
-                kernel_matrix[i][j] = exp(pow);
-            } else {
-                kernel_matrix[i][j] = 0;
-            }
-            if (i==j){
-                kernel_matrix[i][j] += 1;
-            }
-            sum = sum + kernel_matrix[i][j];
-        }
-        denominator[i] = sum;
-
-        // build nominator
-        for (int j=0; j<DIMENSIONS; j++){
-            new_shift[i][j] = 0;
-            for (int k=0; k<NUMBER_OF_POINTS; k++){
-                new_shift[i][j] += kernel_matrix[i][k] * original_points[k][j];
-            }
-            // divide element-wise
-            new_shift[i][j] = new_shift[i][j] / denominator[i];
-            // calculate mean-shift vector at the same time
-            mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
-        }
-    }
-
-
-    // frees previously shifted points, they're now garbage
-    free((*shifted_points)[0]);
-    // updates shifted points pointer to the new array address
-    shifted_points = &new_shift;
-
-    save_matrix((*shifted_points), iteration);
-
-    double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
-    printf("Iteration n. %d, error %f \n", iteration, current_norm);
-
-    // clean up this iteration's allocates
-    free(mean_shift_vector[0]);
-    free(mean_shift_vector);
-    free(kernel_matrix[0]);
-    free(kernel_matrix);
-    free(denominator);
-
-    /** iterate until convergence **/
-    if (current_norm > opt->epsilon) {
-        return meanshift(original_points, shifted_points, h, opt, ++iteration);
-    }
-
-    return iteration;
-}
-
-/**
-
-__global__ int iteration(double * kernel_matrix, double * denominator,
-            double * new_shift, double *shifted_points, double mean_shift_vector,
-            int NUMBER_OF_POINTS, int DIMENSIONS, int h){
-
-    int i = threadIdx.x + blockIdx.x * blockDim.x;
-    for (i = 0; i < NUMBER_OF_POINTS; i++) {
-        double sum = 0;
-        for (int j = 0; j < NUMBER_OF_POINTS; j++) {
-            double dist_sum = 0;
-            for (int p = 0; p < DIMENSIONS; p++) {
-                double dif = ((*shifted_points)[i])[p] - (original_points[j])[p];
-                dist_sum += dif * dif;
-            }
-            double dist = sqrt(dist_sum);
-
-            if (dist < h * h) {
-                kernel_matrix[i][j] = dist * dist;
-                // compute kernel matrix
-                double pow = ((-1) * (kernel_matrix[i][j])) / (2 * (h * h));
-                kernel_matrix[i][j] = exp(pow);
-            } else {
-                kernel_matrix[i][j] = 0;
-            }
-            if (i == j) {
-                kernel_matrix[i][j] += 1;
-            }
-            sum = sum + kernel_matrix[i][j];
-        }
-        denominator[i] = sum;
-
-        // build nominator
-        for (int j = 0; j < DIMENSIONS; j++) {
-            new_shift[i][j] = 0;
-            for (int k = 0; k < NUMBER_OF_POINTS; k++) {
-                new_shift[i][j] += kernel_matrix[i][k] * original_points[k][j];
-            }
-            // divide element-wise
-            new_shift[i][j] = new_shift[i][j] / denominator[i];
-            // calculate mean-shift vector at the same time
-            mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
-        }
-    }
-
-    // frees previously shifted points, they're now garbage
-    free((*shifted_points)[0]);
-    // updates shifted points pointer to the new array address
-    shifted_points = &new_shift;
-
-}
-
-*/
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu
new file mode 100644
index 0000000..f312238
--- /dev/null
+++ b/mean_shift_cuda/meanshift.cu
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include "meanshift_utils.h"
+
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 600;
+int DIMENSIONS = 2;
+char* POINTS_FILENAME = "../data/X.bin";
+char* LABELS_FILENAME = "../data/L.bin";
+
+struct timeval startwtime, endwtime;
+double seq_time;
+
+int main(int argc, char **argv){
+    double **vectors, **shifted_points;
+    char *labels;
+    parameters params;
+
+    //get_args(argc, argv); commented out while in development
+    init(&vectors, &labels, &params);
+
+    //save_matrix(vectors, 0);
+
+    // tic
+    gettimeofday (&startwtime, NULL);
+
+    int iterations = meanshift(vectors, &shifted_points, DEVIATION, &params);
+    printf("Total iterations = %d\n", iterations);
+
+    // toc
+    gettimeofday (&endwtime, NULL);
+    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
+    printf("%s wall clock time = %f\n","Mean Shift", seq_time);
+
+    //TODO write output points to file -> plot later
+    //save_matrix(shifted_points, iterations);
+}
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_declarations.c b/mean_shift_cuda/meanshift_declarations.c
deleted file mode 100644
index b3a5100..0000000
--- a/mean_shift_cuda/meanshift_declarations.c
+++ /dev/null
@@ -1,66 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <float.h>
-#include <string.h>
-
-#include "meanshift_declarations.h"
-
-void get_args(int argc, char **argv, int *h){
-    if (argc != 6) {
-        printf("Usage: %s h N D Pd Pl\nwhere:\n", argv[0]);
-        printf("\th is the variance\n");
-        printf("\tN is the the number of points\n");
-        printf("\tD is the number of dimensions of each point\n");
-        printf("\tPd is the path of the dataset file\n");
-        printf("\tPl is the path of the labels file\n");
-        exit(1);
-    }
-
-    *h = atoi(argv[1]);
-    NUMBER_OF_POINTS = atoi(argv[2]);
-    DIMENSIONS = atoi(argv[3]);
-    POINTS_FILENAME = argv[4];
-    LABELS_FILENAME = argv[5];
-}
-
-double **alloc_2d_double(int rows, int cols) {
-    double *data = (double *) malloc(rows*cols*sizeof(double));
-    double **array = (double **) malloc(rows*sizeof(double*));
-    for (int i=0; i<rows; i++)
-        array[i] = &(data[cols*i]);
-    return array;
-}
-
-void duplicate(double **source, int rows, int cols, double ***dest){
-    for (int i=0; i<rows; i++){
-        for (int j=0; j<cols; j++){
-            (*dest)[i][j] = source[i][j];
-        }
-    }
-}
-
-void print_matrix(double **array, int rows, int cols){
-    for (int i=0; i<cols; i++){
-        for (int j=0; j<rows; j++){
-            printf("%f ", array[j][i]);
-        }
-        printf("\n");
-    }
-}
-
-void save_matrix(double **matrix, int iteration){
-    char filename[50];
-    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);
-    FILE *file;
-    file = fopen(filename, "w");
-    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
-        for (int cols=0; cols<DIMENSIONS; ++cols){
-            fprintf(file, "%f", matrix[rows][cols]);
-            if (cols != DIMENSIONS - 1){
-                fprintf(file, ",");
-            }
-        }
-        fprintf(file, "\n");
-    }
-}
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
new file mode 100644
index 0000000..79fb49a
--- /dev/null
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -0,0 +1,19 @@
+#include "meanshift_kernels.h"
+#include <stdio.h>
+
+__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
+    // Each thread computes one element of output
+    // by accumulating results into cell_value
+    double cell_value = 0;
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (row < output.height && col < output.width){
+        for (int element_index = 0; element_index < matrix1.width; ++element_index){
+            cell_value += matrix1.elements[row * matrix1.width + element_index]
+                * matrix2.elements[element_index * matrix2.width + col];
+        }
+        printf("%f\n", cell_value);
+        output.elements[row * output.width + col] = cell_value;
+    }
+}
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_kernels.h b/mean_shift_cuda/meanshift_kernels.h
new file mode 100644
index 0000000..b9fc56c
--- /dev/null
+++ b/mean_shift_cuda/meanshift_kernels.h
@@ -0,0 +1,13 @@
+#ifndef SERIAL_KERNELS_H    /*    Include guard    */
+#define SERIAL_KERNELS_H
+
+typedef struct{
+    int width;
+    int height;
+    double *elements;
+} Matrix;
+
+//Function multiply_kernel calculates the product of matrices 1 and 2 into output.
+__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
+
+#endif //SERIAL_KERNELS_H
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu
new file mode 100644
index 0000000..e791067
--- /dev/null
+++ b/mean_shift_cuda/meanshift_utils.cu
@@ -0,0 +1,282 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_kernels.h"
+
+#define OUTPUT_PREFIX "../output/output_"
+#define BLOCK_SIZE 16
+
+void get_args(int argc, char **argv){
+    if (argc != 6) {
+        printf("Usage: %s h N D Pd Pl\nwhere:\n", argv[0]);
+        printf("\th is the variance\n");
+        printf("\tN is the the number of points\n");
+        printf("\tD is the number of dimensions of each point\n");
+        printf("\tPd is the path of the dataset file\n");
+        printf("\tPl is the path of the labels file\n");
+        exit(1);
+    }
+
+    DEVIATION = atoi(argv[1]);
+    NUMBER_OF_POINTS = atoi(argv[2]);
+    DIMENSIONS = atoi(argv[3]);
+    POINTS_FILENAME = argv[4];
+    LABELS_FILENAME = argv[5];
+}
+
+void init(double ***vectors, char **labels, parameters *params){
+    int bytes_read = 0;
+    // initializes vectors
+    FILE *points_file;
+    points_file = fopen(POINTS_FILENAME, "rb");
+    if (points_file != NULL){
+        // allocates memory for the array
+        (*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // reads vectors dataset from file
+        for (int i=0; i<NUMBER_OF_POINTS; i++){
+            bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
+            if ( bytes_read != DIMENSIONS ){
+                if(feof(points_file)){
+                    printf("Premature end of file reached.\n");
+                } else{
+                    printf("Error reading points file.");
+                }
+                fclose(points_file);
+                exit(EXIT_FAILURE);
+            }
+        }
+    } else {
+        printf("Error reading dataset file.\n");
+        exit(EXIT_FAILURE);
+    }
+    fclose(points_file);
+
+    // initializes file that will contain the labels (train)
+    FILE *labels_file;
+    labels_file = fopen(LABELS_FILENAME, "rb");
+    if (labels_file != NULL){
+        // NOTE : Labels were classified as <class 'numpy.uint8'>
+        // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
+        // gets number of labels
+        fseek(labels_file, 0L, SEEK_END);
+        long int pos = ftell(labels_file);
+        rewind(labels_file);
+        int label_elements = pos/ sizeof(char);
+
+        // allocates memory for the array
+        *labels = (char*)malloc(label_elements* sizeof(char));
+        fseek(labels_file, 0L, SEEK_SET);
+        bytes_read = fread((*labels), sizeof(char), label_elements, labels_file);
+        if ( bytes_read != label_elements ){
+            if(feof(points_file)){
+                printf("Premature end of file reached.\n");
+            } else{
+                printf("Error reading points file.");
+            }
+            fclose(labels_file);
+            exit(EXIT_FAILURE);
+        }
+    }
+    fclose(labels_file);
+
+    // MEAN SHIFT OPTIONS
+    params->epsilon = 0.0001;
+    params->verbose = false;
+    params->display = false;
+}
+
+int meanshift(double **original_points, double ***shifted_points, int deviation
+    , parameters *opt){
+    static int iteration = 0;
+    static double **mean_shift_vector, **kernel_matrix, *denominator;
+
+    // allocates memory and copies original points on first iteration
+    if (iteration == 0 || (*shifted_points) == NULL){
+        (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
+
+        // allocates memory for mean shift vector
+        mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // initializes elements of mean_shift_vector to inf
+        for (int i=0;i<NUMBER_OF_POINTS;i++){
+            for (int j=0;j<DIMENSIONS;j++){
+                mean_shift_vector[i][j] = DBL_MAX;
+            }
+        }
+
+        // allocates memory for other arrays needed
+        kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
+        denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
+    }
+
+    // finds pairwise distance matrix (inside radius)
+    // [I, D] = rangesearch(x,y,h);
+    for (int i=0; i<NUMBER_OF_POINTS; i++){
+        double sum = 0;
+        for (int j=0; j<NUMBER_OF_POINTS; j++){
+            double distance = calculateDistance((*shifted_points)[i]
+                , original_points[j]);
+
+            double deviation_square = deviation*deviation;
+            if (distance < deviation_square){
+                // computes kernel matrix
+                double pow = ((-1)*(distance * distance))/(2*(deviation_square));
+                kernel_matrix[i][j] = exp(pow);
+            } else {
+                kernel_matrix[i][j] = 0;
+            }
+            if (i == j){
+                kernel_matrix[i][j] += 1;
+            }
+            sum = sum + kernel_matrix[i][j];
+        }
+        denominator[i] = sum;
+    }
+
+    // creates new y vector
+    double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+//==============================================================================
+
+    // builds nominator
+    /*multiply(kernel_matrix, original_points, new_shift);*/
+
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    cudaMalloc(&d_kernel_matrix.elements, size);
+    cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]), size, cudaMemcpyHostToDevice);
+
+    Matrix d_original_points;
+    d_original_points.width = DIMENSIONS;
+    d_original_points.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    cudaMalloc(&d_original_points.elements, size);
+    cudaMemcpy(d_original_points.elements, &(original_points[0][0]), size, cudaMemcpyHostToDevice);
+
+    Matrix d_new_shift;
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    cudaMalloc(&d_new_shift.elements, size);
+
+    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
+    dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
+
+    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points
+        , d_new_shift);
+
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    cudaMemcpy(&(new_shift[0][0]), d_new_shift.elements, size, cudaMemcpyDeviceToHost);
+
+    cudaFree(d_kernel_matrix.elements);
+    cudaFree(d_original_points.elements);
+    cudaFree(d_new_shift.elements);
+
+//==============================================================================
+
+    // divides element-wise
+    for (int i=0; i<NUMBER_OF_POINTS; i++){
+        for (int j=0; j<DIMENSIONS; j++){
+            new_shift[i][j] = new_shift[i][j] / denominator[i];
+            // calculates mean-shift vector at the same time
+            mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
+        }
+    }
+
+    // frees previously shifted points, they're now garbage
+    free((*shifted_points)[0]);
+    // updates shifted points pointer to the new array address
+    shifted_points = &new_shift;
+
+    save_matrix((*shifted_points), iteration);
+
+    // calculates norm of the new mean shift vector
+    double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
+    printf("Iteration n. %d, error %f \n", iteration, current_norm);
+
+    /** iterates until convergence **/
+    if (current_norm > opt->epsilon) {
+        ++iteration;
+        meanshift(original_points, shifted_points, deviation, opt);
+    }
+
+    if (iteration == 0){
+        // cleans up this iteration's allocations
+        free(mean_shift_vector[0]);
+        free(mean_shift_vector);
+        free(kernel_matrix[0]);
+        free(kernel_matrix);
+        free(denominator);
+    }
+
+    return iteration;
+}
+
+// TODO check why there's is a difference in the norm calculate in matlab
+double norm(double **matrix, int rows, int cols){
+    double sum=0, temp_mul=0;
+    for (int i=0; i<rows; i++) {
+        for (int j=0; j<cols; j++) {
+            temp_mul = matrix[i][j] * matrix[i][j];
+            sum = sum + temp_mul;
+        }
+    }
+    double norm = sqrt(sum);
+    return norm;
+}
+
+double calculateDistance(double *y, double *x){
+    double sum = 0, dif;
+    for (int i=0; i<DIMENSIONS; i++){
+        dif = y[i]-x[i];
+        sum += dif * dif;
+    }
+    double distance = sqrt(sum);
+    return distance;
+}
+
+double **alloc_2d_double(int rows, int cols) {
+    double *data = (double *) malloc(rows*cols*sizeof(double));
+    double **array = (double **) malloc(rows*sizeof(double*));
+    for (int i=0; i<rows; i++)
+        array[i] = &(data[cols*i]);
+    return array;
+}
+
+void duplicate(double **source, int rows, int cols, double ***dest){
+    for (int i=0; i<rows; i++){
+        for (int j=0; j<cols; j++){
+            (*dest)[i][j] = source[i][j];
+        }
+    }
+}
+
+void print_matrix(double **array, int rows, int cols){
+    for (int i=0; i<cols; i++){
+        for (int j=0; j<rows; j++){
+            printf("%f ", array[j][i]);
+        }
+        printf("\n");
+    }
+}
+
+void save_matrix(double **matrix, int iteration){
+    char filename[50];
+    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);
+    FILE *file;
+    file = fopen(filename, "w");
+    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
+        for (int cols=0; cols<DIMENSIONS; ++cols){
+            fprintf(file, "%f", matrix[rows][cols]);
+            if (cols != DIMENSIONS - 1){
+                fprintf(file, ",");
+            }
+        }
+        fprintf(file, "\n");
+    }
+}
diff --git a/mean_shift_cuda/meanshift_declarations.h b/mean_shift_cuda/meanshift_utils.h
similarity index 68%
rename from mean_shift_cuda/meanshift_declarations.h
rename to mean_shift_cuda/meanshift_utils.h
index 07aa3e2..9fa97dc 100644
--- a/mean_shift_cuda/meanshift_declarations.h
+++ b/mean_shift_cuda/meanshift_utils.h
@@ -1,8 +1,9 @@
-#ifndef SERIAL_DECLARATIONS_H    /*    Include guard    */
-#define SERIAL_DECLARATIONS_H
+#ifndef SERIAL_UTILS_H    /*    Include guard    */
+#define SERIAL_UTILS_H
 
 #include <stdbool.h>
 
+extern int DEVIATION;
 extern int NUMBER_OF_POINTS;
 extern int DIMENSIONS;
 extern char* POINTS_FILENAME;
@@ -15,17 +16,22 @@ typedef struct parameters {
 } parameters;
 
 //Function get_args parses command line arguments.
-void get_args(int argc, char **argv, int *h);
+void get_args(int argc, char **argv);
+
+//Function init reads the dataset and label arrays from the corresponding files.
+void init(double ***vectors, char **labels, parameters *params);
 
 //Function meanshift recursively shifts original points according to th
 //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
-//options, h is the desirable deviation, iteration is this call's iteration
-//number.
+//options, h is the desirable deviation.
 int meanshift(double **original_points, double ***shifted_points, int h
-        , parameters *opt, int iteration);
+	, parameters *opt);
 
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
-//double norm(double **matrix, int rows, int cols);
+double norm(double **matrix, int rows, int cols);
+
+//Function calculateDistance returns the distance between x and y vectors.
+double calculateDistance(double *y, double *x);
 
 //Function alloc_2d_double allocates rows*cols bytes of continuous memory.
 double **alloc_2d_double(int rows, int cols);
@@ -41,4 +47,4 @@ void print_matrix(double **array, int rows, int cols);
 void save_matrix(double **matrix
 	, int iteration);
 
-#endif //SERIAL_DECLARATIONS_H
\ No newline at end of file
+#endif //SERIAL_UTILS_H
\ No newline at end of file