diff --git a/mean_shift_cuda_shared_mem/Makefile b/mean_shift_cuda_shared_mem/Makefile
new file mode 100644
index 0000000..f7ff628
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/Makefile
@@ -0,0 +1,43 @@
+SHELL := /bin/bash
+
+# ============================================
+# COMMANDS
+
+CC = nvcc
+HOST_COMPILER = -ccbin gcc
+CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets -lcublas
+C_FLAGS = -lm -O3 -I.
+
+COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
+LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
+
+OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
+DEPS = meanshift_utils.h meanshift_kernels.h
+
+RM = rm -f
+
+# ==========================================
+# TARGETS
+
+EXECUTABLES = meanshift
+
+.PHONY: all clean
+
+all: $(EXECUTABLES)
+
+# ==========================================
+# DEPENDENCIES (HEADERS)
+
+%.o: %.cu $(DEPS)
+	$(CC) $(COMPILE_FLAGS) $< -o $@
+
+.PRECIOUS: $(EXECUTABLES) $(OBJ)
+
+# ==========================================
+# EXECUTABLE (MAIN)
+
+$(EXECUTABLES): $(OBJ)
+	$(CC) $(LINK_FLAGS) $(OBJ) -o $@
+
+clean:
+	$(RM) *.o *~ $(EXECUTABLES)
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift.cu b/mean_shift_cuda_shared_mem/meanshift.cu
new file mode 100644
index 0000000..37b971a
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift.cu
@@ -0,0 +1,45 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 600;
+int DIMENSIONS = 2;
+const char *POINTS_FILENAME = "../data/X.bin";
+const char *LABELS_FILENAME = "../data/L.bin";
+parameters params;
+
+struct timeval startwtime, endwtime;
+double seq_time;
+
+int main(int argc, char **argv){
+    int recursions = 0;
+    double **vectors, **shifted_points;
+    char *labels;
+
+    params.epsilon = 0.0001;
+    params.verbose = false;
+    params.display = true;
+    //get_args(argc, argv, &params); //commented out while in development
+    init(&vectors, &labels);
+
+    // tic
+    gettimeofday (&startwtime, NULL);
+    recursions = meanshift(vectors, &shifted_points, DEVIATION);
+
+    // toc
+    gettimeofday (&endwtime, NULL);
+    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
+    
+
+    printf("\nTotal number of recursions = %d\n", recursions);
+    printf("%s wall clock time = %f\n","Mean Shift", seq_time);
+
+    free(vectors[0]);
+    free(vectors);
+    free(shifted_points[0]);
+    free(shifted_points);
+}
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
new file mode 100644
index 0000000..7f18758
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -0,0 +1,370 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <cublas_v2.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+cudaDeviceProp device_properties;
+
+struct timeval start_w_time, end_w_time;
+double seq;
+
+//Based on:
+//          https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
+void set_GPU(){
+    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
+
+    // gets devices count checking for errors like no devices or no drivers to check for
+    // devices available
+    gpuErrchk( cudaGetDeviceCount(&devices_count) );
+    for(int device_index = 0; device_index < devices_count; ++device_index){
+        // gets current index device's properties
+        cudaDeviceProp this_device_properties;
+        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
+
+        // stores best available device's index
+        // only devices with compute capability >= 2.0 are able to run the code
+        if (max_multiprocessors < this_device_properties.multiProcessorCount
+            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
+            // stores devices properties for later use
+            device_properties = this_device_properties;
+            max_multiprocessors = this_device_properties.multiProcessorCount;
+            max_device = device_index;
+        }
+    }
+    // sets the device
+    gpuErrchk( cudaSetDevice(max_device) );
+    if (params.verbose){
+        printf("Device chosen is \"%s\"\n"
+            "Device has %d multi processors and compute capability %d.%d\n"
+            "Max threads per block supported are %d\n\n"
+            , device_properties.name
+            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
+            , device_properties.maxThreadsPerBlock);
+    }
+}
+
+int meanshift(double **original_points, double ***shifted_points, int deviation){
+    // host variables
+    int size = 0;
+    static int recursion = 0;
+    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
+    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
+    bool is_first_recursion = false;
+
+    // device variables
+    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
+        d_mean_shift_vector;
+    Matrix d_new_shift;
+
+    // allocates memory and copies original points on first recursion
+    if (recursion == 0 || (*shifted_points) == NULL){
+        is_first_recursion = true;
+        // allocates memory for shifted points array and copies original points into it
+        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
+
+        // allocates memory for mean shift vector
+        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // initializes elements of mean_shift_vector to inf
+        for (int i=0;i<NUMBER_OF_POINTS;i++){
+            for (int j=0;j<DIMENSIONS;j++){
+                mean_shift_vector[i][j] = DBL_MAX;
+            }
+        }
+
+        // allocates memory for kernel_matrix
+        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
+
+        // tic
+        gettimeofday (&start_w_time, NULL);
+
+        // allocates corresponding memory in device
+        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
+            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
+        // toc
+        gettimeofday (&end_w_time, NULL);
+        seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
+            / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
+
+        if (params.verbose){
+            printf("Device memory allocation wall clock time = %f\n\n", seq);
+        }
+    }
+
+    // finds pairwise distance matrix (inside radius)
+    // [I, D] = rangesearch(x,y,h);
+    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
+        &kernel_matrix, &tmp_w_memcpy_time);
+    w_memcpy_time += tmp_w_memcpy_time;
+
+    // calculates denominator
+    calculate_denominator(d_kernel_matrix, d_denominator);
+
+    // creates new y vector
+    // allocates memory in every recursion
+    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+    // allocates corresponding memory in device
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
+
+    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
+        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
+        &tmp_w_memcpy_time);
+    w_memcpy_time += tmp_w_memcpy_time;
+
+    for (int row=0; row<2; ++row){
+        for (int col=0; col<2; ++col){
+            printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
+            printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
+            printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
+        }
+    }
+
+    if(is_first_recursion){
+        exit(0);
+    }
+
+    // frees previously shifted points, they're now garbage
+    free((*shifted_points)[0]);
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
+    // updates shifted points pointer to the new array address
+    shifted_points = &new_shift;
+    d_shifted_points.elements = d_new_shift.elements;
+
+    if (params.display){
+        save_matrix((*shifted_points), recursion);
+    }
+
+    // calculates norm of the new mean shift vector in GPU using "cuBlas" library function
+    cublasHandle_t handle;
+    cublasStatus_t cublas_status = cublasCreate(&handle);
+    if (cublas_status != CUBLAS_STATUS_SUCCESS){
+        exit(cublas_status);
+    }
+    cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
+        1, &current_norm);
+    if (cublas_status != CUBLAS_STATUS_SUCCESS){
+        exit(cublas_status);
+    }
+    cublas_status = cublasDestroy(handle);
+    if (cublas_status != CUBLAS_STATUS_SUCCESS){
+        exit(cublas_status);
+    }
+
+    if (params.verbose){
+        printf("Recursion n. %d, error\t%f \n", recursion, current_norm);
+    }
+
+    // recurses until convergence
+    if (current_norm > params.epsilon) {
+        ++recursion;
+        meanshift(original_points, shifted_points, deviation);
+    }
+
+    if (is_first_recursion){
+        if (params.verbose){
+            printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
+        }
+
+        // cleans up allocations
+        free(mean_shift_vector[0]);
+        free(mean_shift_vector);
+        free(kernel_matrix[0]);
+        free(kernel_matrix);
+
+        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
+    }
+
+    return recursion;
+}
+
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    Matrix *d_denominator, Matrix *d_mean_shift_vector){
+    int size;
+
+    // allocates memory for original_points in GPU and copies the array
+    d_original_points->width = DIMENSIONS;
+    d_original_points->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for shifted_points in GPU and copies the array
+    d_shifted_points->width = DIMENSIONS;
+    d_shifted_points->height = NUMBER_OF_POINTS;
+    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for kernel_matrix in GPU
+    d_kernel_matrix->width = NUMBER_OF_POINTS;
+    d_kernel_matrix->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
+
+    // allocates memory for denominator in GPU
+    d_denominator->width = 1;
+    d_denominator->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
+
+    // allocates memory for mean_shift_vector in GPU
+    d_mean_shift_vector->width = DIMENSIONS;
+    d_mean_shift_vector->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
+}
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)sqrt(max_block_size);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = requested_block_size;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
+
+        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
+            , deviation, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+    
+    if (first_iter && params.verbose){
+        printf("calculate_kernel_matrix_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+
+    // tic
+    gettimeofday (&start_w_time, NULL);
+
+    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
+        , size, cudaMemcpyDeviceToHost) );
+
+    // toc
+    gettimeofday (&end_w_time, NULL);
+    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
+        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
+}
+
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int requested_block_size = device_properties.maxThreadsPerBlock;
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = 1;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+    
+    if (first_iter && params.verbose){
+        printf("calculate_denominator called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+}
+
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector,
+    double *w_memcpy_time){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)(max_block_size / d_new_shift.width);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        /*dimBlock.x = requested_block_size;
+        dimBlock.y = d_new_shift.width;*/
+        dimBlock.x = 2;
+        dimBlock.y = 2;
+        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+            d_new_shift, d_denominator, d_mean_shift_vector);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("shift_points_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+
+    // tic
+    gettimeofday (&start_w_time, NULL);
+
+    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+        , size, cudaMemcpyDeviceToHost) );
+    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
+        , size, cudaMemcpyDeviceToHost) );
+
+    // toc
+    gettimeofday (&end_w_time, NULL);
+    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
+        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
+}
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_shifted_points){
+    // frees all memory previously allocated in device
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_denominator.elements) );
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
+}
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
new file mode 100644
index 0000000..83f9784
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
@@ -0,0 +1,58 @@
+#ifndef SERIAL_GPU_UTILS_H    /*    Include guard    */
+#define SERIAL_GPU_UTILS_H
+
+#include "meanshift_kernels.h"
+
+//GPU error check snippet taken from:
+//              https://stackoverflow.com/a/14038590
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+   if (code != cudaSuccess){
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+/*        Global variables        */
+extern int DEVIATION;
+extern int NUMBER_OF_POINTS;
+extern int DIMENSIONS;
+extern const char* POINTS_FILENAME;
+extern const char* LABELS_FILENAME;
+extern Parameters params;
+extern cudaDeviceProp device_properties;
+
+//Function set_GPU parses available GPU devices, selects the one with the most multi-processors for
+//usage and stores its properties in global struct device_properties
+void set_GPU();
+
+//Function meanshift recursively shifts original points according to the mean-shift algorithm saving
+//the result to shiftedPoints, h is the desirable deviation
+int meanshift(double **original_points, double ***shifted_points, int h);
+
+//Function init_device_memory allocates memory for necessary arrays in the device
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    Matrix *d_denominator, Matrix *d_new_shift);
+
+//Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel
+//"calculate_kernel_matrix_kernel" that calculates the kernel matrix
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time);
+
+//Function calculate_denominator is a wrapper for the kernel call of the corresponding kernel
+//"calculate_denominator_kernel" that calculates the denominator of shifted points fraction
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator);
+
+//Function shift_points is a wrapper for the kernel call of the corresponding kernel
+//"shift_points_kernel" that shifts the positions of all points
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector,
+    double *w_memcpy_time);
+
+//Function free_device_memory frees device's previously allocated memory
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_shifted_points);
+
+#endif //SERIAL_GPU_UTILS_H
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
new file mode 100644
index 0000000..f8078fb
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -0,0 +1,144 @@
+#include "meanshift_kernels.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix){
+    // each thread calculates one element of kernel_matrix
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){
+        return;
+    }
+
+    int dimensions = shifted_points.width;
+    // calculate distance
+    double sum = 0, dif;
+    for (int i=0; i<dimensions; i++){
+        dif = shifted_points.elements[row * dimensions + i]
+            - original_points.elements[col * dimensions + i];
+        sum += dif * dif;
+    }
+    double distance = sqrt(sum);
+
+    double deviation_square = deviation*deviation;
+    if (distance < deviation_square){
+        // computes kernel matrix
+        double pow = ((-1)*(distance * distance))/(2*(deviation_square));
+        kernel_matrix.elements[row * kernel_matrix.width + col] = exp(pow);
+    } else {
+        kernel_matrix.elements[row * kernel_matrix.width + col] = 0;
+    }
+    if (row == col){
+        kernel_matrix.elements[row * kernel_matrix.width + col] += 1;
+    }
+}
+
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
+    // each thread computes one element of denominator_kernel
+    // by accumulating results into cell_value
+    double cell_value = 0;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row >= denominator.height){
+        return;
+    }
+
+    for (int column = 0; column < kernel_matrix.width; ++column){
+         cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
+    }
+    denominator.elements[row] = cell_value;
+}
+
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
+    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
+    int BLOCK_SIZE = blockDim.y;
+    int block_row = blockIdx.x;
+    int block_col = blockIdx.y;
+    
+    // each thread computes one element of new_shift by accumulating results into cell_value
+    double cell_value = 0;
+
+    // Thread row and column within sub_new_shift
+    int row = threadIdx.x;
+    int col = threadIdx.y;
+
+        // performs calculations only if thread's indexes are within matrix bounds
+    //if (row * new_shift.width + col >= new_shift.width * new_shift.height){
+    /*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >=
+        new_shift.width * new_shift.height){*/
+    if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
+        return;
+    }
+
+    // Each thread block computes one sub-matrix sub_new_shift of C
+    Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
+
+    // shared memory used to store sub_kernel_matrix and sub_original_points respectively
+    __shared__ double *s_sub_kernel_matrix;
+    s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
+    __shared__ double *s_sub_original_points;
+    s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
+
+    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
+    //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
+    for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) {
+
+        // gets sub-matrix sub_kernel_matrix of kernel_matrix
+        Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE);
+        // gets sub-matrix sub_original_points of original_points
+        Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE);
+
+        // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared
+        //memory, each thread loads one element of each sub-matrix
+        s_sub_kernel_matrix[row * BLOCK_SIZE + col] =
+            sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col];
+        s_sub_original_points[row * BLOCK_SIZE + col] =
+            sub_original_points.elements[row * sub_original_points.stride + col];
+
+        // synchronizes to make sure the sub-matrices are loaded before starting the computation
+        __syncthreads();
+
+        // multiplies sub_kernel_matrix and sub_original_points
+        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
+            cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] *
+                s_sub_original_points[element_index * sub_original_points.stride + col];
+        }
+
+        // synchronizes to make sure that the preceding computation is done before loading two new
+        // sub-matrices of kernel_matrix and original_points in the next iteration
+        __syncthreads();
+    }
+
+    // new_shift elements are calculated by dividing with the denominator
+    int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride;
+    int cell_col = block_col * BLOCK_SIZE + col;
+    //sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row];
+    sub_new_shift.elements[row * sub_new_shift.stride + col] =
+        cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
+
+    // calculates mean-shift vector
+    /*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride
+        + (block_col * BLOCK_SIZE + col)] =
+        sub_new_shift.elements[row * sub_new_shift.stride + col] -
+        shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride
+        + (block_col * BLOCK_SIZE + col)];*/
+
+    /*free(s_sub_kernel_matrix);
+    free(s_sub_original_points);*/
+}
+
+// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
+// located col sub-matrices to the right and row sub-matrices down
+// from the upper-left corner of A
+__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
+    Matrix Asub;
+    Asub.width = BLOCK_SIZE;
+    Asub.height = BLOCK_SIZE;
+    Asub.stride = BLOCK_SIZE;
+    Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
+    return Asub;
+}
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.h b/mean_shift_cuda_shared_mem/meanshift_kernels.h
new file mode 100644
index 0000000..58836e6
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h
@@ -0,0 +1,29 @@
+#ifndef SERIAL_KERNELS_H    /*    Include guard    */
+#define SERIAL_KERNELS_H
+
+/*      Structures     */
+
+//Matrix is used to describe matrices
+typedef struct {
+    int width;
+    int height;
+    int stride;
+    double *elements;
+} Matrix;
+
+//Kernel calculate_kernel_matrix_kernel calculates the current kernel matrix
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix);
+
+//Kernel denominator_kernel calculates the sum in the denominator of the fraction used to find new
+//(shifted) positions of the points
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
+
+//Kernel shift_points_kernel shifts the positions of all points and calculates the new mean shift
+//vector according to the new point array
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
+    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
+
+__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE);
+
+#endif //SERIAL_KERNELS_H
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_utils.cu b/mean_shift_cuda_shared_mem/meanshift_utils.cu
new file mode 100644
index 0000000..6cef7dd
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.cu
@@ -0,0 +1,165 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+#define OUTPUT_PREFIX "../output/output_"
+
+void get_args(int argc, char **argv, parameters *params){
+    if (argc < 7) {
+        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
+        "\th is the variance\n"
+        "\te is the min distance, between two points, that is taken into account in computations\n"
+        "\tN is the the number of points\n"
+        "\tD is the number of dimensions of each point\n"
+        "\tPd is the path of the dataset file\n"
+        "\tPl is the path of the labels file\n"
+        "\n\t--verbose | -v is an optional flag to enable execution information output"
+        "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]);
+        exit(1);
+    }
+
+    DEVIATION = atoi(argv[1]);
+    params->epsilon = atof(argv[2]);
+    NUMBER_OF_POINTS = atoi(argv[3]);
+    DIMENSIONS = atoi(argv[4]);
+    POINTS_FILENAME = argv[5];
+    LABELS_FILENAME = argv[6];
+    params->verbose = false;
+    params->display = false;
+
+    if (argc > 7){
+        for (int index=7; index<argc; ++index){
+            if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
+                params->verbose = true;
+            } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
+                params->display = true;
+            } else {
+                printf("Couldn't parse argument %d: %s\n", index, argv[index]);
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+
+    /*printf("DEVIATION = %d\n"
+        "epsilon = %f\n"
+        "NUMBER_OF_POINTS = %d\n"
+        "DIMENSIONS = %d\n"
+        "POINTS_FILENAME = %s\n"
+        "LABELS_FILENAME = %s\n"
+        "verbose = %d\n"
+        "display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME
+            , LABELS_FILENAME, params->verbose, params->display);*/
+}
+
+void init(double ***vectors, char **labels){
+    int bytes_read = 0;
+
+    set_GPU();
+
+    if (params.verbose){
+        printf("Reading dataset and labels...\n");
+    }
+
+    // initializes vectors
+    FILE *points_file;
+    points_file = fopen(POINTS_FILENAME, "rb");
+    if (points_file != NULL){
+        // allocates memory for the array
+        (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // reads vectors dataset from file
+        for (int i=0; i<NUMBER_OF_POINTS; i++){
+            bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
+            if ( bytes_read != DIMENSIONS ){
+                if(feof(points_file)){
+                    printf("Premature end of file reached.\n");
+                } else{
+                    printf("Error reading points file.");
+                }
+                fclose(points_file);
+                exit(EXIT_FAILURE);
+            }
+        }
+    } else {
+        printf("Error reading dataset file.\n");
+        exit(EXIT_FAILURE);
+    }
+    fclose(points_file);
+
+    // initializes file that will contain the labels (train)
+    FILE *labels_file;
+    labels_file = fopen(LABELS_FILENAME, "rb");
+    if (labels_file != NULL){
+        // NOTE : Labels were classified as <class 'numpy.uint8'>
+        // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
+        // gets number of labels
+        fseek(labels_file, 0L, SEEK_END);
+        long int pos = ftell(labels_file);
+        rewind(labels_file);
+        int label_elements = pos/ sizeof(char);
+
+        // allocates memory for the array
+        *labels = (char*)malloc(label_elements* sizeof(char));
+        fseek(labels_file, 0L, SEEK_SET);
+        bytes_read = fread((*labels), sizeof(char), label_elements, labels_file);
+        if ( bytes_read != label_elements ){
+            if(feof(points_file)){
+                printf("Premature end of file reached.\n");
+            } else{
+                printf("Error reading points file.");
+            }
+            fclose(labels_file);
+            exit(EXIT_FAILURE);
+        }
+    }
+    fclose(labels_file);
+
+    if (params.verbose){
+        printf("Done.\n\n");
+    }
+}
+
+double **alloc_double(int rows, int cols) {
+    double *data = (double *) malloc(rows*cols*sizeof(double));
+    double **array = (double **) malloc(rows*sizeof(double*));
+    for (int i=0; i<rows; i++)
+        array[i] = &(data[cols*i]);
+    return array;
+}
+
+void duplicate(double **source, int rows, int cols, double ***dest){
+    for (int i=0; i<rows; i++){
+        for (int j=0; j<cols; j++){
+            (*dest)[i][j] = source[i][j];
+        }
+    }
+}
+
+void print_matrix(double **array, int rows, int cols){
+    for (int i=0; i<cols; i++){
+        for (int j=0; j<rows; j++){
+            printf("%f ", array[j][i]);
+        }
+        printf("\n");
+    }
+}
+
+void save_matrix(double **matrix, int iteration){
+    char filename[50];
+    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);
+    FILE *file;
+    file = fopen(filename, "w");
+    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
+        for (int cols=0; cols<DIMENSIONS; ++cols){
+            fprintf(file, "%f", matrix[rows][cols]);
+            if (cols != DIMENSIONS - 1){
+                fprintf(file, ",");
+            }
+        }
+        fprintf(file, "\n");
+    }
+}
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_utils.h b/mean_shift_cuda_shared_mem/meanshift_utils.h
new file mode 100644
index 0000000..3be505a
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.h
@@ -0,0 +1,35 @@
+#ifndef SERIAL_UTILS_H    /*    Include guard    */
+#define SERIAL_UTILS_H
+
+#include <stdbool.h>
+
+/*      Structures     */
+
+//Parameters is used to store session specific variables in an orderly way
+typedef struct parameters {
+    double epsilon;
+    bool verbose;
+    bool display;
+} Parameters;
+
+//Function get_args parses command line arguments
+void get_args(int argc, char **argv, Parameters *params);
+
+//Function init sets up the GPU for later use, gets its properties and reads the dataset and label
+//arrays from the corresponding files
+void init(double ***vectors, char **labels);
+
+//Function alloc_double allocates rows*cols bytes of continuous memory
+double **alloc_double(int rows, int cols);
+
+//Function duplicate copies the values of source array to dest array
+void duplicate(double **source, int rows, int cols, double ***dest);
+
+//Function print_matrix prints array of dimensions <rows X cols> to the console
+void print_matrix(double **array, int rows, int cols);
+
+//Function save_matrix stores matrix in a csv file with path/filename "../output/output_iteration".
+//If a file already exists new lines are concatenated
+void save_matrix(double **matrix, int iteration);
+
+#endif //SERIAL_UTILS_H
\ No newline at end of file