kernel denominator

9 years ago · 6d9d9da3bc
5 changed files with 178 additions and 201 deletions
--- a/mean_shift_cuda/meanshift
+++ b/mean_shift_cuda/meanshift
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@ -1,8 +1,24 @@
 #include "meanshift_kernels.h"
 #include <stdio.h>
-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
-    double deviation, Matrix kernel_matrix){
+    // Each thread computes one element of output
    // by accumulating results into cell_value
    double cell_value = 0;
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row + col < output.height * output.width){
        for (int element_index = 0; element_index < matrix1.width; ++element_index){
            cell_value += matrix1.elements[row * matrix1.width + element_index]
                * matrix2.elements[element_index * matrix2.width + col];
        }
        output.elements[row * output.width + col] = cell_value;
    }
 }
 __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
    , double deviation, Matrix kernel_matrix){
    // Each thread calculates one element of kernel_matrix
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
@ -32,44 +48,17 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
    }
 }
-__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
-    Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
+
    // Each thread computes one element of new_shift
    // by accumulating results into cell_value
    double cell_value = 0;
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    // performs calculations only if indexes are within matrix bounds
    //if (row + col < new_shift.height * new_shift.width){
    if (row < new_shift.height){
        // calculates new_shift
        // builds nominator by multiplying kernel_matrix and original_points
        for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){
            cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index]
                * original_points.elements[element_index * original_points.width + col];
        }
        // new_shift elements are calculated by dividing with the denominator
        new_shift.elements[row * new_shift.width + col] =
            cell_value / denominator.elements[row];
-        // calculates mean-shift vector
+    if (row * denominator.width + col > denominator.width * denominator.height){
-        mean_shift_vector.elements[row * new_shift.width + col] =
+        return;
            new_shift.elements[row * new_shift.width + col] -
            shifted_points.elements[row * new_shift.width + col];
    }
 }
-__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
+    denominator.elements[col]=0;
-    // Each thread computes one element of denominator_kernel
+    denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col];
    // by accumulating results into cell_value
    double cell_value = 0;
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < denominator.height){
        for (int column = 0; column < kernel_matrix.width; ++column){
             cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
        }
        denominator.elements[row] = cell_value;
    }
 }
--- a/mean_shift_cuda/meanshift_kernels.h
+++ b/mean_shift_cuda/meanshift_kernels.h
@ -1,18 +1,17 @@
 #ifndef SERIAL_KERNELS_H    /*    Include guard    */
 #define SERIAL_KERNELS_H
-typedef struct {
+typedef struct{
    int width;
    int height;
    double *elements;
 } Matrix;
 __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
    double deviation, Matrix kernel_matrix);
 //Function multiply_kernel calculates the product of matrices 1 and 2 into output.
-__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
+__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
-    Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
+
 __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
    , double deviation, Matrix kernel_matrix);
 __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@ -5,6 +5,7 @@
 #include <string.h>
 #include "meanshift_utils.h"
 #include "meanshift_kernels.h"
 #define OUTPUT_PREFIX "../output/output_"
@ -60,7 +61,7 @@ void get_args(int argc, char **argv, parameters *params){
 void init(double ***vectors, char **labels){
    int bytes_read = 0;
-    set_GPU();
+    set_Gpu();
    if (params.verbose){
        printf("Reading dataset and labels...\n");
@ -126,7 +127,7 @@ void init(double ***vectors, char **labels){
 //Based on https://stackoverflow.com/a/28113186
 //Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
-void set_GPU(){
+void set_Gpu(){
    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
    // gets devices count checking for errors like no devices or no drivers to check for
@ -161,20 +162,11 @@ void set_GPU(){
 int meanshift(double **original_points, double ***shifted_points, int deviation
    , parameters *opt){
    // host variables
    int size = 0;
    static int iteration = 0;
-    static double **kernel_matrix, *denominator, **mean_shift_vector;
+    static double **mean_shift_vector, **kernel_matrix, *denominator;
    double **new_shift;
    // device variables
    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
        d_mean_shift_vector;
    Matrix d_new_shift;
    // allocates memory and copies original points on first iteration
    if (iteration == 0 || (*shifted_points) == NULL){
        // allocates memory for shifted points array and copies original points into it
        (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
@ -190,41 +182,42 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
        // allocates memory for other arrays needed
        kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
        denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
        // allocates corresponding memory in device
        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
    }
    // TODO move arrays to device and create global kernel for the iteration
    // finds pairwise distance matrix (inside radius)
    // [I, D] = rangesearch(x,y,h);
-    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
+    calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
-        &kernel_matrix);
+
-
+//    // calculate denominator
-    // calculates denominator
+//    for (int i=0; i<NUMBER_OF_POINTS; i++){
-    calculate_denominator(d_kernel_matrix, d_denominator, &denominator);
+//        double sum = 0;
-
+//        for (int j=0; j<NUMBER_OF_POINTS; j++){
-    size = NUMBER_OF_POINTS * sizeof(double);
+//            sum = sum + kernel_matrix[i][j];
-    gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0])
+//        }
-        , size, cudaMemcpyHostToDevice) );
+//        denominator[i] = sum;
 //    }
    denominator = calculate_denominator(kernel_matrix);
    // creates new y vector
-    // allocates memory in every recursion
+    double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
    new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
    // allocates corresponding memory in device
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
-    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
+    // builds nominator
-        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector);
+    multiply(kernel_matrix, original_points, &new_shift);
    // divides element-wise
    for (int i=0; i<NUMBER_OF_POINTS; i++){
        for (int j=0; j<DIMENSIONS; j++){
            new_shift[i][j] = new_shift[i][j] / denominator[i];
            // calculates mean-shift vector at the same time
            mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
        }
    }
    // frees previously shifted points, they're now garbage
    free((*shifted_points)[0]);
    // updates shifted points pointer to the new array address
    shifted_points = &new_shift;
    d_shifted_points.elements = d_new_shift.elements;
    if (params.display){
        save_matrix((*shifted_points), iteration);
@ -249,14 +242,12 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
        free(kernel_matrix[0]);
        free(kernel_matrix);
        free(denominator);
        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_new_shift);
    }
    return iteration;
 }
-// TODO check why there's is a difference in the norm calculate in matlab
+
 double norm(double **matrix, int rows, int cols){
    double sum=0, temp_mul=0;
    for (int i=0; i<rows; i++) {
@ -269,57 +260,43 @@ double norm(double **matrix, int rows, int cols){
    return norm;
 }
-void init_device_memory(double **original_points, double **shifted_points,
+void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
-    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    , double ***kernel_matrix){
-    Matrix *d_denominator, Matrix *d_mean_shift_vector){
+    static bool first_iter = true;
    int size;
-    // allocates memory for original_points in GPU and copies the array
+    // allocates memory for shifted_points in GPU and copies the array
-    d_original_points->width = DIMENSIONS;
+    Matrix d_shifted_points;
-    d_original_points->height = NUMBER_OF_POINTS;
+    d_shifted_points.width = DIMENSIONS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    d_shifted_points.height = NUMBER_OF_POINTS;
-    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
+    int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+    gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
    gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
        , size, cudaMemcpyHostToDevice) );
-    // allocates memory for shifted_points in GPU and copies the array
+    // allocates memory for original_points in GPU and copies the array
-    d_shifted_points->width = DIMENSIONS;
+    Matrix d_original_points;
-    d_shifted_points->height = NUMBER_OF_POINTS;
+    d_original_points.width = DIMENSIONS;
-    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+    d_original_points.height = NUMBER_OF_POINTS;
-    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
        , size, cudaMemcpyHostToDevice) );
    // allocates memory for kernel_matrix in GPU
-    d_kernel_matrix->width = NUMBER_OF_POINTS;
+    Matrix d_kernel_matrix;
-    d_kernel_matrix->height = NUMBER_OF_POINTS;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
    // allocates memory for denominator in GPU
    d_denominator->width = 1;
    d_denominator->height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
    // allocates memory for mean_shift_vector in GPU
    d_mean_shift_vector->width = DIMENSIONS;
    d_mean_shift_vector->height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
 }
-void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    // get max sizes supported from the device
-    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){
+    int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
-    int size;
+    int requested_block_size = max_block_size;
    static bool first_iter = true;
    // gets max block size supported from the device
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)sqrt(max_block_size);
    bool block_size_too_big = true;
    dim3 dimBlock;
    dim3 dimGrid;
    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = requested_block_size;
@ -346,93 +323,65 @@ void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
        , size, cudaMemcpyDeviceToHost) );
 }
-void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
-    int size;
+    gpuErrchk( cudaFree(d_original_points.elements) );
-    static bool first_iter = true;
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-    // gets max block size supported from the device
+}
    static int requested_block_size = device_properties.maxThreadsPerBlock;
    bool block_size_too_big = true;
    dim3 dimBlock;
    dim3 dimGrid;
    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = 1;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;
-        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
+void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
-        if (cudaGetLastError() != cudaSuccess){
+    static bool first_iter = true;
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);
    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
-    size = NUMBER_OF_POINTS * sizeof(double);
+    // allocates memory for kernel_matrix in GPU and copies the array
-    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements
+    Matrix d_kernel_matrix;
-    	, size, cudaMemcpyDeviceToHost) );
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
-}
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
        , size, cudaMemcpyHostToDevice) );
-void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    // allocates memory for original_points in GPU and copies the array
-    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    Matrix d_original_points;
-    double **original_points, double ***new_shift, double ***mean_shift_vector){
+    d_original_points.width = DIMENSIONS;
-    int size;
+    d_original_points.height = NUMBER_OF_POINTS;
-    static bool first_iter = true;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    // gets max block size supported from the device
+    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
-    static int max_block_size = device_properties.maxThreadsPerBlock;
+    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
-    static int requested_block_size = (int)(max_block_size / 2);
+        , size, cudaMemcpyHostToDevice) );
    bool block_size_too_big = true;
-    dim3 dimBlock;
+    // allocates memory for new_shift in GPU
-    dim3 dimGrid;
+    Matrix d_new_shift;
-    do {
+    d_new_shift.width = DIMENSIONS;
-        dimBlock.x = requested_block_size;
+    d_new_shift.height = NUMBER_OF_POINTS;
-        dimBlock.y = 2;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+    gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
        dimGrid.y = 1;
-        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+    // get max sizes supported from the device
-            d_new_shift, d_denominator, d_mean_shift_vector);
+    int max_block_size = device_properties.maxThreadsPerBlock;
-        if (cudaGetLastError() != cudaSuccess){
+    dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
-            --requested_block_size;
+        , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
-        } else {
+    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
-            block_size_too_big = false;
+        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);
    if (first_iter && params.verbose){
-        printf("shift_points_kernel called with:\n");
+        printf("multiply_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
        , size, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
        , size, cudaMemcpyDeviceToHost) );
 }
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
    Matrix d_new_shift){
    // frees all memory previously allocated in device
    gpuErrchk( cudaFree(d_original_points.elements) );
    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-    //gpuErrchk( cudaFree(d_shifted_points.elements) );
+    gpuErrchk( cudaFree(d_original_points.elements) );
    gpuErrchk( cudaFree(d_denominator.elements) );
    gpuErrchk( cudaFree(d_new_shift.elements) );
 }
@ -485,4 +434,53 @@ void save_matrix(double **matrix, int iteration){
        }
        fprintf(file, "\n");
    }
 }
 double * calculate_denominator(double **kernel_matrix){
    static bool first_iter = true;
    // allocates memory for denominator_matrix in GPU
    Matrix d_denominator_matrix;
    d_denominator_matrix.width = NUMBER_OF_POINTS;
    d_denominator_matrix.height = 1;
    int size = NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
    // allocates memory for kernel_matrix in GPU and copies the array
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
            , size, cudaMemcpyHostToDevice) );
    // get max sizes supported from the device
    int max_block_size = device_properties.maxThreadsPerBlock;
    dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
            , (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
    dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
            , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
    denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    size = NUMBER_OF_POINTS * sizeof(double);
    double ** denominator = (double**)malloc(size);
    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
            ,size, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_denominator_matrix.elements) );
    return (*denominator);
 }
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@ -2,7 +2,6 @@
 #define SERIAL_UTILS_H
 #include <stdbool.h>
 #include "meanshift_kernels.h"
 //GPU error check snippet taken from:
 //https://stackoverflow.com/a/14038590
@ -36,7 +35,7 @@ void get_args(int argc, char **argv, parameters *params);
 //Function init reads the dataset and label arrays from the corresponding files.
 void init(double ***vectors, char **labels);
-void set_GPU();
+void set_Gpu();
 //Function meanshift recursively shifts original points according to th
 //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
@ -44,24 +43,16 @@ void set_GPU();
 int meanshift(double **original_points, double ***shifted_points, int h
    , parameters *opt);
 void init_device_memory(double **original_points, double **shifted_points,
    Matrix *d_original_points, Matrix *d_shifted_points,
    Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift);
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);
-void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
-    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix);
+    , double ***kernel_matrix);
 //Function multiply allocates memory in GPU, sends the data and calls the 
 //multiply kernel function.
-void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+void multiply(double **kernel_matrix, double **original_points
-    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    , double ***new_shift);
    double **original_points, double ***new_shift, double ***mean_shift_vector);
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
    Matrix d_new_shift);
 //Function calculateDistance returns the distance between x and y vectors.
 double calculateDistance(double *y, double *x);
@ -82,6 +73,6 @@ void save_matrix(double **matrix
 //Function calculate_denominator allocates memory in GPU, sends the data and calls the
 //denominator kernel function.
-void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator);
+double * calculate_denominator(double **kernel_matrix);
 #endif //SERIAL_UTILS_H