diff --git a/mean_shift_cuda_shared_mem/meanshift.cu b/mean_shift_cuda_shared_mem/meanshift.cu
index 171c7b3..3ab88f7 100644
--- a/mean_shift_cuda_shared_mem/meanshift.cu
+++ b/mean_shift_cuda_shared_mem/meanshift.cu
@@ -5,10 +5,10 @@
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 
-int DEVIATION = 20;
-int NUMBER_OF_POINTS = 1024;
-int DIMENSIONS = 32;
-const char *POINTS_FILENAME = "../data/32";
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 600;
+int DIMENSIONS = 2;
+const char *POINTS_FILENAME = "../data/X.bin";
 const char *LABELS_FILENAME = "../data/L.bin";
 parameters params;
 
@@ -33,7 +33,6 @@ int main(int argc, char **argv){
     // toc
     gettimeofday (&endwtime, NULL);
     seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
-    
 
     printf("\nTotal number of recursions = %d\n", recursions);
     printf("%s wall clock time = %f\n","Mean Shift", seq_time);
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
index 6e6437d..6cd2a85 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -55,24 +55,26 @@ void set_GPU(){
 
 int meanshift(double **original_points, double ***shifted_points, int deviation){
     // host variables
-    int size = 0;
     static int recursion = 0;
-    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
-    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
+    static double **kernel_matrix,  **new_shift, **mean_shift_vector, w_memcpy_time;
+    double current_norm = 0, tmp_w_memcpy_time;
     bool is_first_recursion = false;
 
     // device variables
     static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
-        d_mean_shift_vector;
-    Matrix d_new_shift;
+        d_new_shift, d_mean_shift_vector;
 
     // allocates memory and copies original points on first recursion
     if (recursion == 0 || (*shifted_points) == NULL){
         is_first_recursion = true;
+
         // allocates memory for shifted points array and copies original points into it
         (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
         duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
 
+        // allocates memory for new shift array
+        new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+
         // allocates memory for mean shift vector
         mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
         // initializes elements of mean_shift_vector to inf
@@ -90,7 +92,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
 
         // allocates corresponding memory in device
         init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
-            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
+            &d_kernel_matrix, &d_denominator, &d_new_shift, &d_mean_shift_vector);
         // toc
         gettimeofday (&end_w_time, NULL);
         seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
@@ -110,27 +112,19 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
     // calculates denominator
     calculate_denominator(d_kernel_matrix, d_denominator);
 
-    // creates new y vector
-    // allocates memory in every recursion
-    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
-    // allocates corresponding memory in device
-    d_new_shift.width = DIMENSIONS;
-    d_new_shift.height = NUMBER_OF_POINTS;
-    d_new_shift.stride = d_new_shift.width;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
-
     shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
         d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
         &tmp_w_memcpy_time);
     w_memcpy_time += tmp_w_memcpy_time;
 
-    // frees previously shifted points, they're now garbage
-    free((*shifted_points)[0]);
-    gpuErrchk( cudaFree(d_shifted_points.elements) );
     // updates shifted points pointer to the new array address
+    double **temp = *shifted_points;    // host: previous iterate becomes the scratch buffer
+    *shifted_points = new_shift;        // host: caller's pointer now refers to the fresh points
+    new_shift = temp;                   // host: old array is reused as next output buffer
+
+    double *d_temp = d_shifted_points.elements;          // device: same exchange of the two buffers
+    d_shifted_points.elements = d_new_shift.elements;
+    d_new_shift.elements = d_temp;
 
     if (params.display){
         save_matrix((*shifted_points), recursion);
@@ -172,8 +166,11 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
         free(mean_shift_vector);
         free(kernel_matrix[0]);
         free(kernel_matrix);
+        //free(new_shift[0]);
+        //free(new_shift);
 
-        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
+        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points,
+            d_new_shift);
     }
 
     return recursion;
@@ -181,7 +178,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
 
 void init_device_memory(double **original_points, double **shifted_points,
     Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
-    Matrix *d_denominator, Matrix *d_mean_shift_vector){
+    Matrix *d_denominator, Matrix *d_new_shift, Matrix *d_mean_shift_vector){
     int size;
 
     // allocates memory for original_points in GPU and copies the array
@@ -216,6 +213,13 @@ void init_device_memory(double **original_points, double **shifted_points,
     size = NUMBER_OF_POINTS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
 
+    // allocates memory for new_shift in GPU
+    d_new_shift->width = DIMENSIONS;
+    d_new_shift->height = NUMBER_OF_POINTS;
+    d_new_shift->stride = d_new_shift->width;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_new_shift->elements), size) );
+
     // allocates memory for mean_shift_vector in GPU
     d_mean_shift_vector->width = DIMENSIONS;
     d_mean_shift_vector->height = NUMBER_OF_POINTS;
@@ -367,10 +371,11 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
 }
 
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
-    Matrix d_shifted_points){
+    Matrix d_shifted_points, Matrix d_new_shift){
     // frees all memory previously allocated in device
     gpuErrchk( cudaFree(d_original_points.elements) );
     gpuErrchk( cudaFree(d_kernel_matrix.elements) );
     gpuErrchk( cudaFree(d_denominator.elements) );
     gpuErrchk( cudaFree(d_shifted_points.elements) );
+    gpuErrchk( cudaFree(d_new_shift.elements) );
 }
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
index 83f9784..5669ae6 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
@@ -1,5 +1,5 @@
-#ifndef SERIAL_GPU_UTILS_H    /*    Include guard    */
-#define SERIAL_GPU_UTILS_H
+#ifndef MEANSHIFT_GPU_UTILS_H    /*    Include guard    */
+#define MEANSHIFT_GPU_UTILS_H
 
 #include "meanshift_kernels.h"
 
@@ -33,7 +33,7 @@ int meanshift(double **original_points, double ***shifted_points, int h);
 //Function init_device_memory allocates memory for necessary arrays in the device
 void init_device_memory(double **original_points, double **shifted_points,
     Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
-    Matrix *d_denominator, Matrix *d_new_shift);
+    Matrix *d_denominator, Matrix *d_new_shift, Matrix *d_mean_shift_vector);
 
 //Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel
 //"calculate_kernel_matrix_kernel" that calculates the kernel matrix
@@ -53,6 +53,6 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
 
 //Function free_device_memory frees device's previously allocated memory
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
-    Matrix d_shifted_points);
+    Matrix d_shifted_points, Matrix d_new_shift);
 
-#endif //SERIAL_GPU_UTILS_H
\ No newline at end of file
+#endif //MEANSHIFT_GPU_UTILS_H
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index 2181327..3b5cda5 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -68,8 +68,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int col = threadIdx.y;
 
     // performs calculations only if thread's indexes are within matrix bounds
-    if ((ROW_BLOCK_SIZE * block_row + row) > new_shift.height ||
-        (COLUMN_BLOCK_SIZE * block_col + col) > new_shift.width){
+    if ((ROW_BLOCK_SIZE * block_row + row) >= new_shift.height ||
+        (COLUMN_BLOCK_SIZE * block_col + col) >= new_shift.width){
         return;
     }
 
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.h b/mean_shift_cuda_shared_mem/meanshift_kernels.h
index db93809..65998d0 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.h
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h
@@ -1,5 +1,5 @@
-#ifndef SERIAL_KERNELS_H    /*    Include guard    */
-#define SERIAL_KERNELS_H
+#ifndef MEANSHIFT_KERNELS_H    /*    Include guard    */
+#define MEANSHIFT_KERNELS_H
 
 /*      Structures     */
 
@@ -27,4 +27,4 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
 __device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE,
     int COLUMN_BLOCK_SIZE);
 
-#endif //SERIAL_KERNELS_H
\ No newline at end of file
+#endif //MEANSHIFT_KERNELS_H
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_utils.h b/mean_shift_cuda_shared_mem/meanshift_utils.h
index 3be505a..d751695 100644
--- a/mean_shift_cuda_shared_mem/meanshift_utils.h
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.h
@@ -1,5 +1,5 @@
-#ifndef SERIAL_UTILS_H    /*    Include guard    */
-#define SERIAL_UTILS_H
+#ifndef MEANSHIFT_UTILS_H    /*    Include guard    */
+#define MEANSHIFT_UTILS_H
 
 #include <stdbool.h>
 
@@ -32,4 +32,4 @@ void print_matrix(double **array, int rows, int cols);
 //If a file already exists new lines are concatenated
 void save_matrix(double **matrix, int iteration);
 
-#endif //SERIAL_UTILS_H
\ No newline at end of file
+#endif //MEANSHIFT_UTILS_H
\ No newline at end of file