Add meanshift_gpu_utils header and implementation, minor fixes

8 years ago · 11c170c185
9 changed files with 419 additions and 390 deletions
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@ -11,7 +11,7 @@ C_FLAGS = -lm -O3 -I.
 COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
 LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)

-OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
+OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
 DEPS = meanshift_utils.h meanshift_kernels.h

 RM = rm -f
--- a/mean_shift_cuda/meanshift
+++ b/mean_shift_cuda/meanshift
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@ -3,6 +3,7 @@
 #include <sys/time.h>

 #include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"

 int DEVIATION = 1;
 int NUMBER_OF_POINTS = 600;
--- a/mean_shift_cuda/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda/meanshift_gpu_utils.cu
@ -0,0 +1,308 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+cudaDeviceProp device_properties;
+
+//Based on https://stackoverflow.com/a/28113186
+//Pio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
+void set_GPU(){
+    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
+
+    // gets devices count checking for errors like no devices or no drivers to check for
+    // devices available
+    gpuErrchk( cudaGetDeviceCount(&devices_count) );
+    for(int device_index = 0; device_index < devices_count; ++device_index){
+        // gets current index device's properties
+        cudaDeviceProp this_device_properties;
+        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
+
+        // stores best available device's index
+        // only devices with compute capability >= 2.0 are able to run the code
+        if (max_multiprocessors < this_device_properties.multiProcessorCount
+            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
+            // stores devices properties for later use
+            device_properties = this_device_properties;
+            max_multiprocessors = this_device_properties.multiProcessorCount;
+            max_device = device_index;
+        }
+    }
+    // sets the device
+    gpuErrchk( cudaSetDevice(max_device) );
+    if (params.verbose){
+        printf("Device chosen is \"%s\"\n"
+            "Device has %d multi processors and compute capability %d.%d\n"
+            "Max threads per block supported are %d\n\n"
+            , device_properties.name
+            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
+            , device_properties.maxThreadsPerBlock);
+    }
+}
+
+int meanshift(double **original_points, double ***shifted_points, int deviation
+    , parameters *opt){
+    // host variables
+    int size = 0;
+    static int iteration = 0;
+    static double **kernel_matrix, *denominator, **mean_shift_vector;
+    double **new_shift;
+
+    // device variables
+    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
+        d_mean_shift_vector;
+    Matrix d_new_shift;
+
+    // allocates memory and copies original points on first iteration
+    if (iteration == 0 || (*shifted_points) == NULL){
+        // allocates memory for shifted points array and copies original points into it
+        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
+
+        // allocates memory for mean shift vector
+        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // initializes elements of mean_shift_vector to inf
+        for (int i=0;i<NUMBER_OF_POINTS;i++){
+            for (int j=0;j<DIMENSIONS;j++){
+                mean_shift_vector[i][j] = DBL_MAX;
+            }
+        }
+
+        // allocates memory for other arrays needed
+        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
+        denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
+
+        // allocates corresponding memory in device
+        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
+            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
+    }
+
+    // finds pairwise distance matrix (inside radius)
+    // [I, D] = rangesearch(x,y,h);
+    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
+        &kernel_matrix);
+
+    // calculates denominator
+    calculate_denominator(d_kernel_matrix, d_denominator, &denominator);
+
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // creates new y vector
+    // allocates memory in every recursion
+    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+    // allocates corresponding memory in device
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
+
+    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
+        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector);
+
+    // frees previously shifted points, they're now garbage
+    free((*shifted_points)[0]);
+    // updates shifted points pointer to the new array address
+    shifted_points = &new_shift;
+    d_shifted_points.elements = d_new_shift.elements;
+
+    if (params.display){
+        save_matrix((*shifted_points), iteration);
+    }
+
+    // calculates norm of the new mean shift vector
+    double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
+    if (params.verbose){
+        printf("Iteration n. %d, error\t%f \n", iteration, current_norm);
+    }
+
+    /** iterates until convergence **/
+    if (current_norm > opt->epsilon) {
+        ++iteration;
+        meanshift(original_points, shifted_points, deviation, opt);
+    }
+
+    if (iteration == 0){
+        // cleans up allocations
+        free(mean_shift_vector[0]);
+        free(mean_shift_vector);
+        free(kernel_matrix[0]);
+        free(kernel_matrix);
+        free(denominator);
+
+        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_new_shift);
+    }
+
+    return iteration;
+}
+
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    Matrix *d_denominator, Matrix *d_mean_shift_vector){
+    int size;
+
+    // allocates memory for original_points in GPU and copies the array
+    d_original_points->width = DIMENSIONS;
+    d_original_points->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for shifted_points in GPU and copies the array
+    d_shifted_points->width = DIMENSIONS;
+    d_shifted_points->height = NUMBER_OF_POINTS;
+    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for kernel_matrix in GPU
+    d_kernel_matrix->width = NUMBER_OF_POINTS;
+    d_kernel_matrix->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
+
+    // allocates memory for denominator in GPU
+    d_denominator->width = 1;
+    d_denominator->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
+
+    // allocates memory for mean_shift_vector in GPU
+    d_mean_shift_vector->width = DIMENSIONS;
+    d_mean_shift_vector->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
+}
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)sqrt(max_block_size);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = requested_block_size;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
+
+        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
+            , deviation, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+    
+    if (first_iter && params.verbose){
+        printf("calculate_kernel_matrix_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
+        , size, cudaMemcpyDeviceToHost) );
+}
+
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int requested_block_size = device_properties.maxThreadsPerBlock;
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = 1;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+    
+    if (first_iter && params.verbose){
+        printf("calculate_denominator called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements
+    	, size, cudaMemcpyDeviceToHost) );
+}
+
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)(max_block_size / d_new_shift.width);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = d_new_shift.width;
+        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+            d_new_shift, d_denominator, d_mean_shift_vector);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("shift_points_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+        , size, cudaMemcpyDeviceToHost) );
+    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
+        , size, cudaMemcpyDeviceToHost) );
+}
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_new_shift){
+    // frees all memory previously allocated in device
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_denominator.elements) );
+    gpuErrchk( cudaFree(d_new_shift.elements) );
+}
--- a/mean_shift_cuda/meanshift_gpu_utils.h
+++ b/mean_shift_cuda/meanshift_gpu_utils.h
@ -0,0 +1,53 @@
+#ifndef SERIAL_GPU_UTILS_H    /*    Include guard    */
+#define SERIAL_GPU_UTILS_H
+
+#include "meanshift_kernels.h"
+
+//GPU error check snippet taken from:
+//https://stackoverflow.com/a/14038590
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+   if (code != cudaSuccess){
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+/*        Global variables        */
+extern int DEVIATION;
+extern int NUMBER_OF_POINTS;
+extern int DIMENSIONS;
+extern char* POINTS_FILENAME;
+extern char* LABELS_FILENAME;
+extern parameters params;
+extern cudaDeviceProp device_properties;
+
+void set_GPU();
+
+//Function meanshift recursively shifts original points according to th
+//mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
+//options, h is the desirable deviation.
+int meanshift(double **original_points, double ***shifted_points, int h
+    , parameters *opt);
+
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points,
+    Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift);
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix);
+
+//Function multiply allocates memory in GPU, sends the data and calls the 
+//multiply kernel function.
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector);
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_new_shift);
+
+//Function calculate_denominator allocates memory in GPU, sends the data and calls the
+//denominator kernel function.
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator);
+
+#endif //SERIAL_GPU_UTILS_H
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@ -1,28 +1,14 @@
 #include "meanshift_kernels.h"
 #include <stdio.h>

-__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
-    // Each thread computes one element of output
-    // by accumulating results into cell_value
-    double cell_value = 0;
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix){
+    // each thread calculates one element of kernel_matrix
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;

-    if (row + col < output.height * output.width){
-        for (int element_index = 0; element_index < matrix1.width; ++element_index){
-            cell_value += matrix1.elements[row * matrix1.width + element_index]
-                * matrix2.elements[element_index * matrix2.width + col];
-        }
-        output.elements[row * output.width + col] = cell_value;
-    }
-}
-
-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
-    , double deviation, Matrix kernel_matrix){
-    // Each thread calculates one element of kernel_matrix
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    int col = blockIdx.y * blockDim.y + threadIdx.y;
-    if (row * kernel_matrix.width + col > kernel_matrix.width * kernel_matrix.height){
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){
        return;
    }

@ -48,17 +34,48 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
    }
 }

-__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
-
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
+    Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
+    // each thread computes one element of new_shift
+    // by accumulating results into cell_value
+    double cell_value = 0;
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;

-
-    if (row * denominator.width + col > denominator.width * denominator.height){
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row * new_shift.width + col >= new_shift.width * new_shift.height){
        return;
    }

-    denominator.elements[col]=0;
-    denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col];
+    // calculates new_shift
+    // builds nominator by multiplying kernel_matrix and original_points
+    for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){
+        cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index]
+            * original_points.elements[element_index * original_points.width + col];
+    }
+    // new_shift elements are calculated by dividing with the denominator
+    new_shift.elements[row * new_shift.width + col] =
+        cell_value / denominator.elements[row];
+
+    // calculates mean-shift vector
+    mean_shift_vector.elements[row * new_shift.width + col] =
+        new_shift.elements[row * new_shift.width + col] -
+        shifted_points.elements[row * new_shift.width + col];
+}
+
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
+    // each thread computes one element of denominator_kernel
+    // by accumulating results into cell_value
+    double cell_value = 0;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row >= denominator.height){
+        return;
+    }

+    for (int column = 0; column < kernel_matrix.width; ++column){
+         cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
+    }
+    denominator.elements[row] = cell_value;
 }
--- a/mean_shift_cuda/meanshift_kernels.h
+++ b/mean_shift_cuda/meanshift_kernels.h
@ -1,17 +1,18 @@
 #ifndef SERIAL_KERNELS_H    /*    Include guard    */
 #define SERIAL_KERNELS_H

-typedef struct{
+typedef struct {
    int width;
    int height;
    double *elements;
 } Matrix;

-//Function multiply_kernel calculates the product of matrices 1 and 2 into output.
-__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix);

-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
-    , double deviation, Matrix kernel_matrix);
+//Function multiply_kernel calculates the product of matrices 1 and 2 into output.
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
+    Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);

 __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);

--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@ -5,12 +5,10 @@
 #include <string.h>

 #include "meanshift_utils.h"
-#include "meanshift_kernels.h"
+#include "meanshift_gpu_utils.h"

 #define OUTPUT_PREFIX "../output/output_"

-cudaDeviceProp device_properties;
-
 void get_args(int argc, char **argv, parameters *params){
    if (argc < 7) {
        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
@ -61,7 +59,7 @@ void get_args(int argc, char **argv, parameters *params){
 void init(double ***vectors, char **labels){
    int bytes_read = 0;

-    set_Gpu();
+    set_GPU();

    if (params.verbose){
        printf("Reading dataset and labels...\n");
@ -72,7 +70,7 @@ void init(double ***vectors, char **labels){
    points_file = fopen(POINTS_FILENAME, "rb");
    if (points_file != NULL){
        // allocates memory for the array
-        (*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+        (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        // reads vectors dataset from file
        for (int i=0; i<NUMBER_OF_POINTS; i++){
            bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
@ -125,129 +123,7 @@ void init(double ***vectors, char **labels){
    }
 }

-//Based on https://stackoverflow.com/a/28113186
-//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
-void set_Gpu(){
-    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
-
-    // gets devices count checking for errors like no devices or no drivers to check for
-    // devices available
-    gpuErrchk( cudaGetDeviceCount(&devices_count) );
-    for(int device_index = 0; device_index < devices_count; ++device_index){
-        // gets current index device's properties
-        cudaDeviceProp this_device_properties;
-        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
-
-        // stores best available device's index
-        // only devices with compute capability >= 2.0 are able to run the code
-        if (max_multiprocessors < this_device_properties.multiProcessorCount
-            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
-            // stores devices properties for later use
-            device_properties = this_device_properties;
-            max_multiprocessors = this_device_properties.multiProcessorCount;
-            max_device = device_index;
-        }
-    }
-    // sets the device
-    gpuErrchk( cudaSetDevice(max_device) );
-    if (params.verbose){
-        printf("Device chosen is \"%s\"\n"
-            "Device has %d multi processors and compute capability %d.%d\n"
-            "Max threads per block supported are %d\n\n"
-            , device_properties.name
-            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
-            , device_properties.maxThreadsPerBlock);
-    }
-}
-
-int meanshift(double **original_points, double ***shifted_points, int deviation
-    , parameters *opt){
-    static int iteration = 0;
-    static double **mean_shift_vector, **kernel_matrix, *denominator;
-
-    // allocates memory and copies original points on first iteration
-    if (iteration == 0 || (*shifted_points) == NULL){
-        (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
-
-        // allocates memory for mean shift vector
-        mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-        // initializes elements of mean_shift_vector to inf
-        for (int i=0;i<NUMBER_OF_POINTS;i++){
-            for (int j=0;j<DIMENSIONS;j++){
-                mean_shift_vector[i][j] = DBL_MAX;
-            }
-        }
-
-        // allocates memory for other arrays needed
-        kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
-        denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
-    }
-    // TODO move arrays to device and create global kernel for the iteration
-
-    // finds pairwise distance matrix (inside radius)
-    // [I, D] = rangesearch(x,y,h);
-    calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
-
-//    // calculate denominator
-//    for (int i=0; i<NUMBER_OF_POINTS; i++){
-//        double sum = 0;
-//        for (int j=0; j<NUMBER_OF_POINTS; j++){
-//            sum = sum + kernel_matrix[i][j];
-//        }
-//        denominator[i] = sum;
-//    }
-    denominator = calculate_denominator(kernel_matrix);
-
-    // creates new y vector
-    double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-
-    // builds nominator
-    multiply(kernel_matrix, original_points, &new_shift);
-
-    // divides element-wise
-    for (int i=0; i<NUMBER_OF_POINTS; i++){
-        for (int j=0; j<DIMENSIONS; j++){
-            new_shift[i][j] = new_shift[i][j] / denominator[i];
-            // calculates mean-shift vector at the same time
-            mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
-        }
-    }
-
-    // frees previously shifted points, they're now garbage
-    free((*shifted_points)[0]);
-    // updates shifted points pointer to the new array address
-    shifted_points = &new_shift;
-
-    if (params.display){
-        save_matrix((*shifted_points), iteration);
-    }
-
-    // calculates norm of the new mean shift vector
-    double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
-    if (params.verbose){
-        printf("Iteration n. %d, error %f \n", iteration, current_norm);
-    }
-
-    /** iterates until convergence **/
-    if (current_norm > opt->epsilon) {
-        ++iteration;
-        meanshift(original_points, shifted_points, deviation, opt);
-    }
-
-    if (iteration == 0){
-        // cleans up allocations
-        free(mean_shift_vector[0]);
-        free(mean_shift_vector);
-        free(kernel_matrix[0]);
-        free(kernel_matrix);
-        free(denominator);
-    }
-
-    return iteration;
-}
-
-
+// TODO check why there's is a difference in the norm calculate in matlab
 double norm(double **matrix, int rows, int cols){
    double sum=0, temp_mul=0;
    for (int i=0; i<rows; i++) {
@ -260,142 +136,7 @@ double norm(double **matrix, int rows, int cols){
    return norm;
 }

-void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
-    , double ***kernel_matrix){
-    static bool first_iter = true;
-
-    // allocates memory for shifted_points in GPU and copies the array
-    Matrix d_shifted_points;
-    d_shifted_points.width = DIMENSIONS;
-    d_shifted_points.height = NUMBER_OF_POINTS;
-    int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
-    gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
-        , size, cudaMemcpyHostToDevice) );
-
-    // allocates memory for original_points in GPU and copies the array
-    Matrix d_original_points;
-    d_original_points.width = DIMENSIONS;
-    d_original_points.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
-    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
-        , size, cudaMemcpyHostToDevice) );
-
-    // allocates memory for kernel_matrix in GPU
-    Matrix d_kernel_matrix;
-    d_kernel_matrix.width = NUMBER_OF_POINTS;
-    d_kernel_matrix.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
-
-    // get max sizes supported from the device
-    int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
-    int requested_block_size = max_block_size;
-    bool block_size_too_big = true;
-
-    dim3 dimBlock;
-    dim3 dimGrid;
-
-    do {
-        dimBlock.x = requested_block_size;
-        dimBlock.y = requested_block_size;
-        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
-        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
-
-        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
-            , deviation, d_kernel_matrix);
-        if (cudaGetLastError() != cudaSuccess){
-            --requested_block_size;
-        } else {
-            block_size_too_big = false;
-            gpuErrchk( cudaDeviceSynchronize() );
-        }
-    } while(block_size_too_big);
-    
-    if (first_iter && params.verbose){
-        printf("calculate_kernel_matrix_kernel called with:\n");
-        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
-        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-        first_iter = false;
-    }
-
-    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
-        , size, cudaMemcpyDeviceToHost) );
-
-    gpuErrchk( cudaFree(d_shifted_points.elements) );
-    gpuErrchk( cudaFree(d_original_points.elements) );
-    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-}
-
-
-void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
-    static bool first_iter = true;
-
-    // allocates memory for kernel_matrix in GPU and copies the array
-    Matrix d_kernel_matrix;
-    d_kernel_matrix.width = NUMBER_OF_POINTS;
-    d_kernel_matrix.height = NUMBER_OF_POINTS;
-    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
-    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
-        , size, cudaMemcpyHostToDevice) );
-
-    // allocates memory for original_points in GPU and copies the array
-    Matrix d_original_points;
-    d_original_points.width = DIMENSIONS;
-    d_original_points.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
-    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
-        , size, cudaMemcpyHostToDevice) );
-
-    // allocates memory for new_shift in GPU
-    Matrix d_new_shift;
-    d_new_shift.width = DIMENSIONS;
-    d_new_shift.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
-
-    // get max sizes supported from the device
-    int max_block_size = device_properties.maxThreadsPerBlock;
-    dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
-        , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
-    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
-        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
-
-    if (first_iter && params.verbose){
-        printf("multiply_kernel called with:\n");
-        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
-        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-        first_iter = false;
-    }
-
-    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
-    gpuErrchk( cudaPeekAtLastError() );
-    gpuErrchk( cudaDeviceSynchronize() );
-
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
-        , size, cudaMemcpyDeviceToHost) );
-
-    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-    gpuErrchk( cudaFree(d_original_points.elements) );
-    gpuErrchk( cudaFree(d_new_shift.elements) );
-}
-
-double calculateDistance(double *y, double *x){
-    double sum = 0, dif;
-    for (int i=0; i<DIMENSIONS; i++){
-        dif = y[i]-x[i];
-        sum += dif * dif;
-    }
-    double distance = sqrt(sum);
-    return distance;
-}
-
-double **alloc_2d_double(int rows, int cols) {
+double **alloc_double(int rows, int cols) {
    double *data = (double *) malloc(rows*cols*sizeof(double));
    double **array = (double **) malloc(rows*sizeof(double*));
    for (int i=0; i<rows; i++)
@ -434,53 +175,4 @@ void save_matrix(double **matrix, int iteration){
        }
        fprintf(file, "\n");
    }
-}
-
-double * calculate_denominator(double **kernel_matrix){
-    static bool first_iter = true;
-
-    // allocates memory for denominator_matrix in GPU
-    Matrix d_denominator_matrix;
-    d_denominator_matrix.width = NUMBER_OF_POINTS;
-    d_denominator_matrix.height = 1;
-    int size = NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
-
-    // allocates memory for kernel_matrix in GPU and copies the array
-    Matrix d_kernel_matrix;
-    d_kernel_matrix.width = NUMBER_OF_POINTS;
-    d_kernel_matrix.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
-    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
-            , size, cudaMemcpyHostToDevice) );
-
-    // get max sizes supported from the device
-    int max_block_size = device_properties.maxThreadsPerBlock;
-    dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
-            , (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
-    dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
-            , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
-
-    if (first_iter && params.verbose){
-        printf("calculate_denominator called with:\n");
-        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
-        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-        first_iter = false;
-    }
-
-    denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
-    gpuErrchk( cudaPeekAtLastError() );
-    gpuErrchk( cudaDeviceSynchronize() );
-
-
-    size = NUMBER_OF_POINTS * sizeof(double);
-    double ** denominator = (double**)malloc(size);
-    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
-            ,size, cudaMemcpyDeviceToHost) );
-
-    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-    gpuErrchk( cudaFree(d_denominator_matrix.elements) );
-
-    return (*denominator);
 }
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@ -3,16 +3,6 @@

 #include <stdbool.h>

-//GPU error check snippet taken from:
-//https://stackoverflow.com/a/14038590
-#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
-   if (code != cudaSuccess){
-      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
-      if (abort) exit(code);
-   }
-}
-
 /*        Structs       */
 typedef struct parameters {
    double epsilon;
@ -20,45 +10,17 @@ typedef struct parameters {
    bool display;
 } parameters;

-/*        Global variables        */
-extern int DEVIATION;
-extern int NUMBER_OF_POINTS;
-extern int DIMENSIONS;
-extern char* POINTS_FILENAME;
-extern char* LABELS_FILENAME;
-extern parameters params;
-extern cudaDeviceProp device_properties;
-
 //Function get_args parses command line arguments.
 void get_args(int argc, char **argv, parameters *params);

 //Function init reads the dataset and label arrays from the corresponding files.
 void init(double ***vectors, char **labels);

-void set_Gpu();
-
-//Function meanshift recursively shifts original points according to th
-//mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
-//options, h is the desirable deviation.
-int meanshift(double **original_points, double ***shifted_points, int h
-    , parameters *opt);
-
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);

-void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
-    , double ***kernel_matrix);
-
-//Function multiply allocates memory in GPU, sends the data and calls the 
-//multiply kernel function.
-void multiply(double **kernel_matrix, double **original_points
-    , double ***new_shift);
-
-//Function calculateDistance returns the distance between x and y vectors.
-double calculateDistance(double *y, double *x);
-
-//Function alloc_2d_double allocates rows*cols bytes of continuous memory.
-double **alloc_2d_double(int rows, int cols);
+//Function alloc_double allocates rows*cols bytes of continuous memory.
+double **alloc_double(int rows, int cols);

 //Function duplicate copies the values of source array to dest array.
 void duplicate(double **source, int rows, int cols, double ***dest);
@ -68,11 +30,6 @@ void print_matrix(double **array, int rows, int cols);

 //Function save_matrix prints matrix in a csv file with path/filename
 //"output/output_iteration". If a file already exists new lines are concatenated.
-void save_matrix(double **matrix
-    , int iteration);
-
-//Function calculate_denominator allocates memory in GPU, sends the data and calls the
-//denominator kernel function.
-double * calculate_denominator(double **kernel_matrix);
+void save_matrix(double **matrix, int iteration);

 #endif //SERIAL_UTILS_H