Shared memory implementation init

7 years ago · 649e1d7850
8 changed files with 889 additions and 0 deletions
--- a/mean_shift_cuda_shared_mem/Makefile
+++ b/mean_shift_cuda_shared_mem/Makefile
@ -0,0 +1,43 @@
 SHELL := /bin/bash
 # ============================================
 # COMMANDS
 # nvcc compiles and links; gcc is passed to it as the host compiler
 CC = nvcc
 HOST_COMPILER = -ccbin gcc
 CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets -lcublas
 C_FLAGS = -lm -O3 -I.
 # -x cu treats every input as CUDA source, -dc produces relocatable device code per object
 COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
 LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
 OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
 # every header included by the sources is listed so that editing any of them rebuilds the objects
 DEPS = meanshift_utils.h meanshift_kernels.h meanshift_gpu_utils.h
 RM = rm -f
 # ==========================================
 # TARGETS
 EXECUTABLES = meanshift
 .PHONY: all clean
 all: $(EXECUTABLES)
 # ==========================================
 # DEPENDENCIES (HEADERS)
 %.o: %.cu $(DEPS)
 	$(CC) $(COMPILE_FLAGS) $< -o $@
 .PRECIOUS: $(EXECUTABLES) $(OBJ)
 # ==========================================
 # EXECUTABLE (MAIN)
 $(EXECUTABLES): $(OBJ)
 	$(CC) $(LINK_FLAGS) $(OBJ) -o $@
 clean:
 	$(RM) *.o *~ $(EXECUTABLES)
--- a/mean_shift_cuda_shared_mem/meanshift.cu
+++ b/mean_shift_cuda_shared_mem/meanshift.cu
@ -0,0 +1,45 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 // Default run configuration; get_args() overwrites these values from the command line when it is
 // called (its call in main is currently commented out)
 int DEVIATION = 1;
 int NUMBER_OF_POINTS = 600;
 int DIMENSIONS = 2;
 const char *POINTS_FILENAME = "../data/X.bin";
 const char *LABELS_FILENAME = "../data/L.bin";
 // session options (epsilon, verbose, display), filled in by main
 parameters params;
 // timestamps and elapsed seconds used by main to time the meanshift() call
 struct timeval startwtime, endwtime;
 double seq_time;
 // Entry point: sets the session options, loads the dataset through init(), runs meanshift(),
 // prints the number of recursions and the elapsed wall clock time and releases the host buffers.
 // Returns 0 on success; the helpers terminate the process themselves on failure.
 int main(int argc, char **argv){
    int recursions = 0;
    // pointers start as NULL so that the clean up below is well defined even for buffers that a
    // helper did not allocate
    double **vectors = NULL, **shifted_points = NULL;
    char *labels = NULL;
    params.epsilon = 0.0001;
    params.verbose = false;
    params.display = true;
    //get_args(argc, argv, &params); //commented out while in development
    init(&vectors, &labels);
    // tic
    gettimeofday (&startwtime, NULL);
    recursions = meanshift(vectors, &shifted_points, DEVIATION);
    // toc
    gettimeofday (&endwtime, NULL);
    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
    printf("\nTotal number of recursions = %d\n", recursions);
    printf("%s wall clock time = %f\n","Mean Shift", seq_time);
    // each matrix is one data block (row 0) plus one array of row pointers
    if (vectors != NULL){
        free(vectors[0]);
        free(vectors);
    }
    if (shifted_points != NULL){
        free(shifted_points[0]);
        free(shifted_points);
    }
    // labels are read by init() and were previously never released
    free(labels);
    return 0;
 }
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@ -0,0 +1,370 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <float.h>
 #include <string.h>
 #include <sys/time.h>
 #include <cublas_v2.h>
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 // properties of the device chosen by set_GPU()
 cudaDeviceProp device_properties;
 // scratch timestamps and elapsed seconds used for the wall clock measurements in this file
 struct timeval start_w_time, end_w_time;
 double seq;
 //Based on:
 //          https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
 // Selects the device with the most multiprocessors among those with compute capability >= 2.0,
 // makes it the current device and stores its properties in the global device_properties.
 // Terminates the process when the device query fails or when no device qualifies.
 void set_GPU(){
    // max_device stays negative until a suitable device has been seen
    int devices_count = 0, max_multiprocessors = 0, max_device = -1;
    // gets devices count checking for errors like no devices or no drivers to check for
    // devices available
    gpuErrchk( cudaGetDeviceCount(&devices_count) );
    for(int device_index = 0; device_index < devices_count; ++device_index){
        // gets current index device's properties
        cudaDeviceProp this_device_properties;
        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
        // stores best available device's index
        // only devices with compute capability >= 2.0 are able to run the code
        if (this_device_properties.major >= 2
            && max_multiprocessors < this_device_properties.multiProcessorCount){
            // stores devices properties for later use
            device_properties = this_device_properties;
            max_multiprocessors = this_device_properties.multiProcessorCount;
            max_device = device_index;
        }
    }
    // without this check device 0 would be selected with zeroed properties, which the kernel
    // wrappers later use to size their blocks
    if (max_device < 0){
        fprintf(stderr, "No CUDA device with compute capability >= 2.0 was found\n");
        exit(EXIT_FAILURE);
    }
    // sets the device
    gpuErrchk( cudaSetDevice(max_device) );
    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
            "Max threads per block supported are %d\n\n"
            , device_properties.name
            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
            , device_properties.maxThreadsPerBlock);
    }
 }
 // Runs one mean shift step on the device and recurses until the norm of the mean shift vector
 // drops to params.epsilon or below.
 //   original_points  host matrix of the input points (NUMBER_OF_POINTS x DIMENSIONS)
 //   shifted_points   out: address of the caller's matrix pointer; allocated on the first call
 //                    and replaced by the newest shifted points after every step
 //   deviation        deviation forwarded to the kernel matrix computation
 // Returns the value of the static recursion counter.
 // The function keeps its working buffers in static variables, so it is not reentrant.
 int meanshift(double **original_points, double ***shifted_points, int deviation){
    // host variables
    size_t size = 0;
    static int recursion = 0;
    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
    bool is_first_recursion = false;
    // device variables
    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
        d_mean_shift_vector;
    Matrix d_new_shift;
    // allocates memory and copies original points on first recursion
    if (recursion == 0 || (*shifted_points) == NULL){
        is_first_recursion = true;
        // allocates memory for shifted points array and copies original points into it
        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
        // allocates memory for mean shift vector
        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        // initializes elements of mean_shift_vector to the largest finite double
        for (int i=0;i<NUMBER_OF_POINTS;i++){
            for (int j=0;j<DIMENSIONS;j++){
                mean_shift_vector[i][j] = DBL_MAX;
            }
        }
        // allocates memory for kernel_matrix
        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
        // tic
        gettimeofday (&start_w_time, NULL);
        // allocates corresponding memory in device
        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
        // toc
        gettimeofday (&end_w_time, NULL);
        seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
            / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
        if (params.verbose){
            printf("Device memory allocation wall clock time = %f\n\n", seq);
        }
    }
    // finds pairwise distance matrix (inside radius)
    // [I, D] = rangesearch(x,y,h);
    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
        &kernel_matrix, &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;
    // calculates denominator
    calculate_denominator(d_kernel_matrix, d_denominator);
    // creates new y vector
    // allocates memory in every recursion
    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    // allocates corresponding memory in device
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    // the kernels read stride to locate tiles, it must not be left uninitialized
    d_new_shift.stride = DIMENSIONS;
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
        &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;
    // NOTE(review): temporary debugging output. It samples two rows at the start, around the
    // middle and near the end of new_shift; indexes are clamped to the matrix so that no element
    // outside of the allocation is read
    const int probe_rows[] = {0, 300, 562};
    for (int probe=0; probe<3; ++probe){
        for (int row=probe_rows[probe]; row<probe_rows[probe]+2 && row<NUMBER_OF_POINTS; ++row){
            for (int col=0; col<2 && col<DIMENSIONS; ++col){
                printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
            }
        }
    }
    // NOTE(review): development stop, the process ends here after the first step. Everything
    // below only runs once this block is removed
    if(is_first_recursion){
        exit(0);
    }
    // frees previously shifted points (data block and row pointers), they're now garbage
    free((*shifted_points)[0]);
    free(*shifted_points);
    gpuErrchk( cudaFree(d_shifted_points.elements) );
    // publishes the new array through the caller's pointer; rebinding the local parameter would
    // hide the result from the caller and leave it holding freed memory
    *shifted_points = new_shift;
    d_shifted_points.elements = d_new_shift.elements;
    if (params.display){
        save_matrix((*shifted_points), recursion);
    }
    // calculates norm of the new mean shift vector in GPU using "cuBlas" library function
    cublasHandle_t handle;
    cublasStatus_t cublas_status = cublasCreate(&handle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
        1, &current_norm);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    cublas_status = cublasDestroy(handle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    if (params.verbose){
        printf("Recursion n. %d, error\t%f \n", recursion, current_norm);
    }
    // recurses until convergence
    if (current_norm > params.epsilon) {
        ++recursion;
        meanshift(original_points, shifted_points, deviation);
    }
    if (is_first_recursion){
        if (params.verbose){
            printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
        }
        // cleans up allocations
        free(mean_shift_vector[0]);
        free(mean_shift_vector);
        free(kernel_matrix[0]);
        free(kernel_matrix);
        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
        // free_device_memory does not receive the mean shift vector, it is released here
        gpuErrchk( cudaFree(d_mean_shift_vector.elements) );
    }
    return recursion;
 }
 // Allocates the device buffers used by the algorithm, fills in the dimensions of every Matrix
 // descriptor and uploads the original and the shifted points.
 //   original_points, shifted_points  host matrices whose data starts at [0][0] and is contiguous
 //   d_*                              out: device descriptors (width, height, stride, elements)
 // Terminates the process through gpuErrchk when an allocation or a copy fails.
 void init_device_memory(double **original_points, double **shifted_points,
    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
    Matrix *d_denominator, Matrix *d_mean_shift_vector){
    // byte counts are kept in size_t, NUMBER_OF_POINTS^2 * sizeof(double) can exceed an int
    size_t size;
    // allocates memory for original_points in GPU and copies the array
    d_original_points->width = DIMENSIONS;
    d_original_points->height = NUMBER_OF_POINTS;
    // rows are stored back to back, so the stride read by the kernels equals the width
    d_original_points->stride = d_original_points->width;
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
        , size, cudaMemcpyHostToDevice) );
    // allocates memory for shifted_points in GPU and copies the array
    d_shifted_points->width = DIMENSIONS;
    d_shifted_points->height = NUMBER_OF_POINTS;
    d_shifted_points->stride = d_shifted_points->width;
    size = (size_t)DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
        , size, cudaMemcpyHostToDevice) );
    // allocates memory for kernel_matrix in GPU
    d_kernel_matrix->width = NUMBER_OF_POINTS;
    d_kernel_matrix->height = NUMBER_OF_POINTS;
    d_kernel_matrix->stride = d_kernel_matrix->width;
    size = (size_t)NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
    // allocates memory for denominator in GPU
    d_denominator->width = 1;
    d_denominator->height = NUMBER_OF_POINTS;
    d_denominator->stride = d_denominator->width;
    size = (size_t)NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
    // allocates memory for mean_shift_vector in GPU
    d_mean_shift_vector->width = DIMENSIONS;
    d_mean_shift_vector->height = NUMBER_OF_POINTS;
    d_mean_shift_vector->stride = d_mean_shift_vector->width;
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
 }
 // Launches calculate_kernel_matrix_kernel over a 2D grid of square blocks and copies the
 // resulting kernel matrix back to the host.
 //   d_shifted_points, d_original_points  device matrices of the current and the original points
 //   d_kernel_matrix                      device matrix that receives the result
 //   deviation                            deviation forwarded to the kernel
 //   kernel_matrix                        out: host matrix filled with the device result
 //   w_memcpy_time                        out: seconds spent in the device to host copy
 // The block edge starts at sqrt(maxThreadsPerBlock) and shrinks by one after every rejected
 // launch; the process is terminated when no edge >= 1 is accepted.
 void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){
    size_t size;
    static bool first_iter = true;
    // gets max block size supported from the device
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)sqrt((double)max_block_size);
    bool block_size_too_big = true;
    cudaError_t launch_status = cudaSuccess;
    dim3 dimBlock;
    dim3 dimGrid;
    do {
        // an edge of 0 would divide by zero below and could never launch, so the search stops
        if (requested_block_size < 1){
            fprintf(stderr, "calculate_kernel_matrix_kernel: no usable block size found\n");
            gpuErrchk( launch_status );
            exit(EXIT_FAILURE);
        }
        dimBlock.x = requested_block_size;
        dimBlock.y = requested_block_size;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
            , deviation, d_kernel_matrix);
        launch_status = cudaGetLastError();
        if (launch_status != cudaSuccess){
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);
    if (first_iter && params.verbose){
        printf("calculate_kernel_matrix_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
    size = (size_t)NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    // tic
    gettimeofday (&start_w_time, NULL);
    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
        , size, cudaMemcpyDeviceToHost) );
    // toc
    gettimeofday (&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
 }
 // Launches denominator_kernel with one thread per row of the kernel matrix.
 //   d_kernel_matrix  device kernel matrix that is summed row by row
 //   d_denominator    device vector that receives one sum per row
 // The block size starts at maxThreadsPerBlock and shrinks by one after every rejected launch;
 // the process is terminated when no block size >= 1 is accepted.
 void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
    static bool first_iter = true;
    // gets max block size supported from the device
    static int requested_block_size = device_properties.maxThreadsPerBlock;
    bool block_size_too_big = true;
    cudaError_t launch_status = cudaSuccess;
    dim3 dimBlock;
    dim3 dimGrid;
    do {
        // a block size of 0 would divide by zero below and could never launch
        if (requested_block_size < 1){
            fprintf(stderr, "denominator_kernel: no usable block size found\n");
            gpuErrchk( launch_status );
            exit(EXIT_FAILURE);
        }
        dimBlock.x = requested_block_size;
        dimBlock.y = 1;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;
        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
        launch_status = cudaGetLastError();
        if (launch_status != cudaSuccess){
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);
    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
 }
 // Launches shift_points_kernel and copies the new points and the mean shift vector to the host.
 //   d_kernel_matrix, d_original_points, d_shifted_points, d_denominator  device inputs
 //   d_new_shift, d_mean_shift_vector   device outputs
 //   kernel_matrix, original_points     unused here, kept for interface compatibility
 //   new_shift, mean_shift_vector       out: host matrices filled from the device buffers
 //   w_memcpy_time                      out: seconds spent in the two device to host copies
 // NOTE(review): the launch uses fixed 2x2 blocks while the kernel is in development. Since the
 // configuration never changes, a rejected launch is reported as an error instead of retried.
 void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
    double **original_points, double ***new_shift, double ***mean_shift_vector,
    double *w_memcpy_time){
    size_t size;
    static bool first_iter = true;
    dim3 dimBlock;
    dim3 dimGrid;
    dimBlock.x = 2;
    dimBlock.y = 2;
    // one block per dimBlock.x rows of the result
    dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
    dimGrid.y = 1;
    shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
        d_new_shift, d_denominator, d_mean_shift_vector);
    // launch configuration errors surface here, execution errors at the synchronization
    gpuErrchk( cudaGetLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    if (first_iter && params.verbose){
        printf("shift_points_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    // tic
    gettimeofday (&start_w_time, NULL);
    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
        , size, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
        , size, cudaMemcpyDeviceToHost) );
    // toc
    gettimeofday (&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
 }
 // Releases the device buffers behind the four given descriptors, in parameter order.
 // Terminates the process through gpuErrchk when a buffer cannot be freed.
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
    Matrix d_shifted_points){
    double *device_buffers[] = {
        d_original_points.elements,
        d_kernel_matrix.elements,
        d_denominator.elements,
        d_shifted_points.elements
    };
    const int buffer_count = (int)(sizeof(device_buffers) / sizeof(device_buffers[0]));
    for (int index = 0; index < buffer_count; ++index){
        gpuErrchk( cudaFree(device_buffers[index]) );
    }
 }
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
@ -0,0 +1,58 @@
 #ifndef SERIAL_GPU_UTILS_H    /*    Include guard    */
 #define SERIAL_GPU_UTILS_H
 #include "meanshift_kernels.h"
 //GPU error check snippet taken from:
 //              https://stackoverflow.com/a/14038590
 //Macro gpuErrchk wraps a CUDA runtime call and reports its status together with the call site.
 //It expands to a single statement (do/while(0)) so that it can be used safely inside if/else
 //branches without braces
 #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
 //Function gpuAssert prints the description of code to stderr, followed by file and line, when
 //code is not cudaSuccess; the process then exits with code as status unless abort is false
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
   if (code != cudaSuccess){
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
 }
 /*        Global variables        */
 // run configuration and session options, defined in meanshift.cu
 extern int DEVIATION;
 extern int NUMBER_OF_POINTS;
 extern int DIMENSIONS;
 extern const char* POINTS_FILENAME;
 extern const char* LABELS_FILENAME;
 extern Parameters params;
 // properties of the device selected by set_GPU, defined in meanshift_gpu_utils.cu
 extern cudaDeviceProp device_properties;
 //Function set_GPU parses available GPU devices, selects the one with the most multi-processors for
 //usage and stores its properties in global struct device_properties
 void set_GPU();
 //Function meanshift recursively shifts original_points according to the mean-shift algorithm,
 //saving the result through shifted_points; h is the desired deviation. Returns the value of its
 //recursion counter
 int meanshift(double **original_points, double ***shifted_points, int h);
 //Function init_device_memory allocates memory for the necessary arrays in the device and copies
 //original_points and shifted_points into it. The definition names the last parameter
 //d_mean_shift_vector
 void init_device_memory(double **original_points, double **shifted_points,
    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
    Matrix *d_denominator, Matrix *d_new_shift);
 //Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel
 //"calculate_kernel_matrix_kernel" that calculates the kernel matrix. The result is also copied to
 //the host matrix kernel_matrix and the duration of that copy is stored in w_memcpy_time
 void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time);
 //Function calculate_denominator is a wrapper for the kernel call of the corresponding kernel
 //"denominator_kernel" that calculates the denominator of shifted points fraction
 void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator);
 //Function shift_points is a wrapper for the kernel call of the corresponding kernel
 //"shift_points_kernel" that shifts the positions of all points. The new points and the mean shift
 //vector are copied to the host matrices new_shift and mean_shift_vector and the duration of
 //those copies is stored in w_memcpy_time
 void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
    double **original_points, double ***new_shift, double ***mean_shift_vector,
    double *w_memcpy_time);
 //Function free_device_memory frees the device memory behind the four given matrices
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
    Matrix d_shifted_points);
 #endif //SERIAL_GPU_UTILS_H
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@ -0,0 +1,144 @@
 #include "meanshift_kernels.h"
 #include <stdio.h>
 #include <stdlib.h>
 // Computes one element of kernel_matrix per thread from the distance between a shifted point
 // (row) and an original point (col).
 //   grid/block layout: 2D, x indexes the rows and y the columns of kernel_matrix; threads that
 //   fall outside of the matrix return without touching memory
 //   shifted_points, original_points  matrices with one point per row, shifted_points.width
 //                                    coordinates each
 //   deviation                        deviation used for the cut-off and for the exponent
 //   kernel_matrix                    output, indexed as row * width + col
 __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
    double deviation, Matrix kernel_matrix){
    // each thread calculates one element of kernel_matrix
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    // both indexes are checked separately: a flattened check lets threads with col >= width
    // through, which then read past the points and overwrite elements of the following row
    if (row >= kernel_matrix.height || col >= kernel_matrix.width){
        return;
    }
    int dimensions = shifted_points.width;
    // calculate distance
    double sum = 0, dif;
    for (int i=0; i<dimensions; i++){
        dif = shifted_points.elements[row * dimensions + i]
            - original_points.elements[col * dimensions + i];
        sum += dif * dif;
    }
    double distance = sqrt(sum);
    double deviation_square = deviation*deviation;
    double cell_value = 0;
    // NOTE(review): the distance is compared against the squared deviation; confirm whether the
    // intended radius is deviation or deviation^2
    if (distance < deviation_square){
        // computes kernel matrix
        double exponent = ((-1)*(distance * distance))/(2*(deviation_square));
        cell_value = exp(exponent);
    }
    // elements on the diagonal get an additional 1
    if (row == col){
        cell_value += 1;
    }
    kernel_matrix.elements[row * kernel_matrix.width + col] = cell_value;
 }
 // Sums every row of kernel_matrix into the matching element of denominator.
 //   grid/block layout: 1D along x, one thread per row; threads whose row is not below
 //   denominator.height do nothing
 __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < denominator.height){
        // first element of the row handled by this thread
        const double *row_start = kernel_matrix.elements + row * kernel_matrix.width;
        double row_sum = 0;
        for (int column = 0; column < kernel_matrix.width; ++column){
            row_sum += row_start[column];
        }
        denominator.elements[row] = row_sum;
    }
 }
 // Tiled product of kernel_matrix and original_points, divided row-wise by denominator and
 // written to new_shift.
 //   grid/block layout: blockIdx.x selects the tile row and blockIdx.y the tile column of
 //   new_shift; BLOCK_SIZE is blockDim.y and threadIdx.x/threadIdx.y are the row/column inside
 //   the tile, so the launch is expected to use square blocks
 //   scratch memory: two tile buffers per block taken from the device heap with malloc; only the
 //   pointers to them live in shared memory
 // NOTE(review): shifted_points and mean_shift_vector are not used yet, the mean shift vector
 // computation at the end is still commented out.
 // NOTE(review): the tiles returned by GetSubMatrix carry BLOCK_SIZE as stride, which is also used
 // below to read them from global memory; confirm against the row pitch of the full matrices.
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
    int BLOCK_SIZE = blockDim.y;
    int block_row = blockIdx.x;
    int block_col = blockIdx.y;
    // each thread computes one element of new_shift by accumulating results into cell_value
    double cell_value = 0;
    // Thread row and column within sub_new_shift
    int row = threadIdx.x;
    int col = threadIdx.y;
    // performs calculations only if the block's tile is within matrix bounds; the condition only
    // depends on block indexes, so either all or none of the threads of a block return here
    if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
        return;
    }
    // Each thread block computes one sub-matrix sub_new_shift of new_shift
    Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
    // block-wide pointers to the buffers that hold sub_kernel_matrix and sub_original_points
    __shared__ double *s_sub_kernel_matrix;
    __shared__ double *s_sub_original_points;
    // a single thread allocates both buffers; letting every thread allocate raced on the shared
    // pointers and leaked all allocations but the last one
    if (row == 0 && col == 0){
        size_t tile_bytes = (size_t)blockDim.x * blockDim.y * sizeof(double);
        s_sub_kernel_matrix = (double*)malloc(tile_bytes);
        s_sub_original_points = (double*)malloc(tile_bytes);
    }
    // makes the pointers visible to the whole block before anybody uses them
    __syncthreads();
    if (s_sub_kernel_matrix == NULL || s_sub_original_points == NULL){
        // device heap exhausted: every thread sees the same pointers and leaves together
        if (row == 0 && col == 0){
            free(s_sub_kernel_matrix);
            free(s_sub_original_points);
        }
        return;
    }
    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
    //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
    for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) {
        // gets sub-matrix sub_kernel_matrix of kernel_matrix
        Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE);
        // gets sub-matrix sub_original_points of original_points
        Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE);
        // loads both tiles from device global memory into the block's buffers, each thread loads
        // one element of each sub-matrix
        s_sub_kernel_matrix[row * BLOCK_SIZE + col] =
            sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col];
        s_sub_original_points[row * BLOCK_SIZE + col] =
            sub_original_points.elements[row * sub_original_points.stride + col];
        // synchronizes to make sure the sub-matrices are loaded before starting the computation
        __syncthreads();
        // multiplies sub_kernel_matrix and sub_original_points; the buffers are indexed with the
        // same BLOCK_SIZE pitch they were filled with
        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
                s_sub_original_points[element_index * BLOCK_SIZE + col];
        }
        // synchronizes to make sure that the preceding computation is done before loading two new
        // sub-matrices of kernel_matrix and original_points in the next iteration
        __syncthreads();
    }
    // new_shift elements are calculated by dividing with the denominator
    sub_new_shift.elements[row * sub_new_shift.stride + col] =
        cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
    // calculates mean-shift vector
    /*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride
        + (block_col * BLOCK_SIZE + col)] =
        sub_new_shift.elements[row * sub_new_shift.stride + col] -
        shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride
        + (block_col * BLOCK_SIZE + col)];*/
    // the last barrier of the loop guarantees that no thread still reads the buffers, so the
    // allocating thread can release them
    if (row == 0 && col == 0){
        free(s_sub_kernel_matrix);
        free(s_sub_original_points);
    }
 }
 // Returns a descriptor for the BLOCK_SIZE x BLOCK_SIZE tile of A found row tiles down and col
 // tiles to the right of A's upper-left corner. No data is copied: the tile's elements pointer
 // refers into A, and its width, height and stride are all set to BLOCK_SIZE
 __device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
    // offset of the tile's first element from the start of A
    int tile_offset = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;
    Matrix tile;
    tile.elements = A.elements + tile_offset;
    tile.stride = BLOCK_SIZE;
    tile.height = BLOCK_SIZE;
    tile.width = BLOCK_SIZE;
    return tile;
 }
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.h
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h
@ -0,0 +1,29 @@
 #ifndef SERIAL_KERNELS_H    /*    Include guard    */
 #define SERIAL_KERNELS_H
 /*      Structures     */
 //Matrix is used to describe matrices whose elements are stored row after row in a flat array
 typedef struct {
    // number of columns
    int width;
    // number of rows
    int height;
    // read by GetSubMatrix as the distance, in elements, between the starts of two rows
    int stride;
    // first element of the matrix
    double *elements;
 } Matrix;
 //Kernel calculate_kernel_matrix_kernel calculates the current kernel matrix, one element per
 //thread, from the distance between a shifted point and an original point
 __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
    double deviation, Matrix kernel_matrix);
 //Kernel denominator_kernel calculates the sum in the denominator of the fraction used to find new
 //(shifted) positions of the points; each thread sums one row of kernel_matrix
 __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
 //Kernel shift_points_kernel shifts the positions of all points, writing them to new_shift.
 //NOTE(review): the calculation of the new mean shift vector is commented out in the definition
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
 //Function GetSubMatrix returns the BLOCK_SIZE x BLOCK_SIZE sub-matrix of A located row
 //sub-matrices down and col sub-matrices to the right of the upper-left corner of A
 __device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE);
 #endif //SERIAL_KERNELS_H
--- a/mean_shift_cuda_shared_mem/meanshift_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.cu
@ -0,0 +1,165 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <float.h>
 #include <string.h>
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 // path prefix of the per-iteration output files
 // NOTE(review): save_matrix spells out the same literal instead of using this macro
 #define OUTPUT_PREFIX "../output/output_"
 // Parses the command line into the global run configuration and into params.
 //   argv[1..6]  h, e, N, D, dataset path and labels path, all mandatory
 //   argv[7..]   optional flags --verbose | -v and --output | -o
 // Prints the usage text and exits with status 1 when mandatory arguments are missing; exits
 // with EXIT_FAILURE when N or D is not positive or when a flag is not recognised.
 void get_args(int argc, char **argv, parameters *params){
    if (argc < 7) {
        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
        "\th is the variance\n"
        "\te is the min distance, between two points, that is taken into account in computations\n"
        "\tN is the number of points\n"
        "\tD is the number of dimensions of each point\n"
        "\tPd is the path of the dataset file\n"
        "\tPl is the path of the labels file\n"
        "\n\t--verbose | -v is an optional flag to enable execution information output"
        "\n\t--output | -o is an optional flag to enable points output in each iteration\n", argv[0]);
        exit(1);
    }
    DEVIATION = atoi(argv[1]);
    params->epsilon = atof(argv[2]);
    NUMBER_OF_POINTS = atoi(argv[3]);
    DIMENSIONS = atoi(argv[4]);
    POINTS_FILENAME = argv[5];
    LABELS_FILENAME = argv[6];
    // both values size every allocation that follows; atoi yields 0 for non numeric text
    if (NUMBER_OF_POINTS <= 0 || DIMENSIONS <= 0){
        printf("N and D must be positive integers, got N = %s and D = %s\n", argv[3], argv[4]);
        exit(EXIT_FAILURE);
    }
    params->verbose = false;
    params->display = false;
    // optional flags; the loop body does not run when there are none
    for (int index=7; index<argc; ++index){
        if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
            params->verbose = true;
        } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
            params->display = true;
        } else {
            printf("Couldn't parse argument %d: %s\n", index, argv[index]);
            exit(EXIT_FAILURE);
        }
    }
 }
 // Selects the GPU and loads the dataset and the labels from POINTS_FILENAME and LABELS_FILENAME.
 //   vectors  out: newly allocated NUMBER_OF_POINTS x DIMENSIONS matrix read from the points file
 //   labels   out: newly allocated buffer holding the whole labels file, or NULL when that file
 //            cannot be opened (a warning is printed and execution continues)
 // Exits with EXIT_FAILURE when the points file cannot be opened or when either file cannot be
 // read completely.
 void init(double ***vectors, char **labels){
    size_t items_read = 0;
    set_GPU();
    if (params.verbose){
        printf("Reading dataset and labels...\n");
    }
    // initializes vectors
    FILE *points_file;
    points_file = fopen(POINTS_FILENAME, "rb");
    if (points_file == NULL){
        printf("Error reading dataset file.\n");
        exit(EXIT_FAILURE);
    }
    // allocates memory for the array
    (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    // reads vectors dataset from file, one point per iteration
    for (int i=0; i<NUMBER_OF_POINTS; i++){
        items_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
        if ( items_read != (size_t)DIMENSIONS ){
            if(feof(points_file)){
                printf("Premature end of file reached.\n");
            } else{
                printf("Error reading points file.");
            }
            fclose(points_file);
            exit(EXIT_FAILURE);
        }
    }
    fclose(points_file);
    // initializes file that will contain the labels (train)
    *labels = NULL;
    FILE *labels_file;
    labels_file = fopen(LABELS_FILENAME, "rb");
    if (labels_file == NULL){
        // there is no stream to read from or to close in this case
        fprintf(stderr, "Warning: labels file could not be opened, continuing without labels.\n");
    } else {
        // NOTE : Labels were classified as <class 'numpy.uint8'>
        // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
        // gets number of labels from the size of the file
        fseek(labels_file, 0L, SEEK_END);
        long int pos = ftell(labels_file);
        rewind(labels_file);
        if (pos < 0){
            printf("Error reading labels file.\n");
            fclose(labels_file);
            exit(EXIT_FAILURE);
        }
        size_t label_elements = (size_t)pos / sizeof(char);
        // allocates memory for the array
        *labels = (char*)malloc(label_elements * sizeof(char));
        if (*labels == NULL && label_elements > 0){
            printf("Error allocating memory for the labels.\n");
            fclose(labels_file);
            exit(EXIT_FAILURE);
        }
        items_read = fread((*labels), sizeof(char), label_elements, labels_file);
        if ( items_read != label_elements ){
            // the labels stream is the one inspected here, the points stream is already closed
            if(feof(labels_file)){
                printf("Premature end of file reached.\n");
            } else{
                printf("Error reading labels file.\n");
            }
            fclose(labels_file);
            exit(EXIT_FAILURE);
        }
        fclose(labels_file);
    }
    if (params.verbose){
        printf("Done.\n\n");
    }
 }
 // Allocates a rows x cols matrix of doubles as one contiguous data block plus an array of row
 // pointers into it, and returns the row pointer array. The elements are not initialized.
 // The caller releases the matrix with free(array[0]) followed by free(array).
 // Prints an error and exits with EXIT_FAILURE when a non-empty allocation fails.
 double **alloc_double(int rows, int cols) {
    // sizes are computed in size_t so that rows*cols cannot overflow an int
    size_t data_bytes = (size_t)rows * (size_t)cols * sizeof(double);
    size_t index_bytes = (size_t)rows * sizeof(double*);
    double *data = (double *) malloc(data_bytes);
    double **array = (double **) malloc(index_bytes);
    // malloc may legitimately return NULL for a size of 0, only real failures are fatal
    if ((data == NULL && data_bytes > 0) || (array == NULL && index_bytes > 0)){
        fprintf(stderr, "Error allocating a %d x %d matrix.\n", rows, cols);
        free(data);
        free(array);
        exit(EXIT_FAILURE);
    }
    for (int i=0; i<rows; i++)
        array[i] = data + (size_t)cols * i;
    return array;
 }
 // Copies the rows x cols elements of source into the already allocated matrix that dest
 // points to; the row pointers of the destination are left unchanged
 void duplicate(double **source, int rows, int cols, double ***dest){
    double **target_rows = *dest;
    int row = 0;
    while (row < rows){
        const double *from = source[row];
        double *to = target_rows[row];
        for (int col = 0; col < cols; ++col){
            to[col] = from[col];
        }
        ++row;
    }
 }
 // Prints array to the console transposed: one output line per column, holding the values of
 // that column for every row, each followed by a space
 void print_matrix(double **array, int rows, int cols){
    int column = 0;
    while (column < cols){
        for (int row = 0; row < rows; ++row){
            printf("%f ", array[row][column]);
        }
        printf("\n");
        ++column;
    }
 }
 // Writes matrix (NUMBER_OF_POINTS x DIMENSIONS) as comma separated values, one point per line,
 // to the file "../output/output_<iteration>"; an existing file is overwritten.
 // Prints an error and exits with EXIT_FAILURE when the file cannot be opened.
 void save_matrix(double **matrix, int iteration){
    char filename[50];
    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);
    FILE *file;
    file = fopen(filename, "w");
    // fails for example when the output directory does not exist
    if (file == NULL){
        printf("Error opening output file %s.\n", filename);
        exit(EXIT_FAILURE);
    }
    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
        for (int cols=0; cols<DIMENSIONS; ++cols){
            fprintf(file, "%f", matrix[rows][cols]);
            if (cols != DIMENSIONS - 1){
                fprintf(file, ",");
            }
        }
        fprintf(file, "\n");
    }
    // one file is written per iteration, so every stream has to be closed again
    fclose(file);
 }
--- a/mean_shift_cuda_shared_mem/meanshift_utils.h
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.h
@ -0,0 +1,35 @@
 #ifndef SERIAL_UTILS_H    /*    Include guard    */
 #define SERIAL_UTILS_H
 #include <stdbool.h>
 /*      Structures     */
 //Parameters is used to store session specific variables in an orderly way
 typedef struct parameters {
    // the recursion in meanshift continues while the norm of the mean shift vector exceeds it
    double epsilon;
    // enables the informational output about devices, launches and timings
    bool verbose;
    // enables saving the shifted points of every iteration through save_matrix
    bool display;
 } Parameters;
 //Function get_args parses command line arguments into the global configuration and params
 void get_args(int argc, char **argv, Parameters *params);
 //Function init sets up the GPU for later use, gets its properties and reads the dataset and label
 //arrays from the corresponding files
 void init(double ***vectors, char **labels);
 //Function alloc_double allocates a rows x cols matrix of doubles in one contiguous block of
 //memory and returns an array of pointers to its rows
 double **alloc_double(int rows, int cols);
 //Function duplicate copies the values of source array to dest array
 void duplicate(double **source, int rows, int cols, double ***dest);
 //Function print_matrix prints array of dimensions <rows X cols> to the console, one line per
 //column
 void print_matrix(double **array, int rows, int cols);
 //Function save_matrix stores matrix in a csv file with path/filename "../output/output_iteration".
 //The file is opened for writing, so the contents of an already existing file are replaced
 void save_matrix(double **matrix, int iteration);
 #endif //SERIAL_UTILS_H