// authParallelAndDistributedS.../mean_shift_cuda/meanshift_utils.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>

#include "meanshift_utils.h"
#include "meanshift_kernels.h"

#define OUTPUT_PREFIX "../output/output_"

cudaDeviceProp device_properties;

// Parses the command line into the run configuration.
//
// The six positional arguments (h e N D Pd Pl) are stored, in that order, in
// DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME and
// LABELS_FILENAME. Every argument after them must be one of the optional flags
// --verbose/-v or --output/-o, which set params->verbose and params->display.
//
// Prints a message and terminates the process when fewer than six positional
// arguments are given, when N or D is not a positive integer, or when an
// optional argument is not recognised.
void get_args(int argc, char **argv, parameters *params){
    if (argc < 7) {
        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
        "\th is the variance\n"
        "\te is the min distance, between two points, that is taken into account in computations\n"
        "\tN is the the number of points\n"
        "\tD is the number of dimensions of each point\n"
        "\tPd is the path of the dataset file\n"
        "\tPl is the path of the labels file\n"
        "\n\t--verbose | -v is an optional flag to enable execution information output"
        "\n\t--output | -o is an optional flag to enable points output in each iteration\n", argv[0]);
        exit(1);
    }

    DEVIATION = atoi(argv[1]);
    params->epsilon = atof(argv[2]);
    NUMBER_OF_POINTS = atoi(argv[3]);
    DIMENSIONS = atoi(argv[4]);
    POINTS_FILENAME = argv[5];
    LABELS_FILENAME = argv[6];
    params->verbose = false;
    params->display = false;

    // atoi yields 0 for non-numeric text; a zero or negative size would later
    // produce empty allocations and invalid kernel launch configurations
    if (NUMBER_OF_POINTS <= 0 || DIMENSIONS <= 0){
        printf("N and D must be positive integers (got N = %s, D = %s)\n", argv[3], argv[4]);
        exit(EXIT_FAILURE);
    }

    // everything after the positional arguments has to be a known flag
    for (int index=7; index<argc; ++index){
        if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
            params->verbose = true;
        } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
            params->display = true;
        } else {
            printf("Couldn't parse argument %d: %s\n", index, argv[index]);
            exit(EXIT_FAILURE);
        }
    }
}

// Selects the GPU and loads the dataset and its labels from disk.
//
// vectors: receives a newly allocated NUMBER_OF_POINTS x DIMENSIONS array
//          (see alloc_2d_double) filled with the raw doubles read from
//          POINTS_FILENAME.
// labels:  receives a newly allocated buffer holding the whole content of
//          LABELS_FILENAME, one byte per label.
//
// Any I/O or allocation failure is reported and terminates the process.
void init(double ***vectors, char **labels){
    set_Gpu();

    if (params.verbose){
        printf("Reading dataset and labels...\n");
    }

    // initializes vectors
    FILE *points_file = fopen(POINTS_FILENAME, "rb");
    if (points_file == NULL){
        printf("Error reading dataset file.\n");
        exit(EXIT_FAILURE);
    }
    // allocates memory for the array
    (*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
    // reads vectors dataset from file, one point (row) at a time
    for (int i=0; i<NUMBER_OF_POINTS; i++){
        size_t items_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
        if ( items_read != (size_t)DIMENSIONS ){
            if(feof(points_file)){
                printf("Premature end of file reached.\n");
            } else{
                printf("Error reading points file.\n");
            }
            fclose(points_file);
            exit(EXIT_FAILURE);
        }
    }
    fclose(points_file);

    // initializes file that will contain the labels (train)
    FILE *labels_file = fopen(LABELS_FILENAME, "rb");
    if (labels_file == NULL){
        printf("Error reading labels file.\n");
        exit(EXIT_FAILURE);
    }

    // NOTE : Labels were classified as <class 'numpy.uint8'>
    // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
    // gets number of labels from the file size
    long int pos = -1L;
    if (fseek(labels_file, 0L, SEEK_END) == 0){
        pos = ftell(labels_file);
    }
    if (pos < 0L){
        printf("Error reading labels file.\n");
        fclose(labels_file);
        exit(EXIT_FAILURE);
    }
    rewind(labels_file);
    size_t label_elements = (size_t)pos / sizeof(char);

    // allocates memory for the array; malloc(0) may legitimately return NULL
    *labels = (char*)malloc(label_elements * sizeof(char));
    if (*labels == NULL && label_elements > 0){
        printf("Error allocating memory for the labels.\n");
        fclose(labels_file);
        exit(EXIT_FAILURE);
    }

    size_t labels_read = fread((*labels), sizeof(char), label_elements, labels_file);
    if ( labels_read != label_elements ){
        // queries the labels stream: the points stream is already closed here
        if(feof(labels_file)){
            printf("Premature end of file reached.\n");
        } else{
            printf("Error reading labels file.\n");
        }
        fclose(labels_file);
        exit(EXIT_FAILURE);
    }
    fclose(labels_file);

    if (params.verbose){
        printf("Done.\n\n");
    }
}

// Based on https://stackoverflow.com/a/28113186
// A more in-depth reference: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
//
// Picks the device with the most multiprocessors among those with compute
// capability >= 2.0, makes it the current device and stores its properties in
// the global device_properties. Terminates the process when no such device
// exists.
void set_Gpu(){
    int devices_count = 0, max_multiprocessors = 0, max_device = -1;

    // gets devices count checking for errors like no devices or no drivers to check for
    // devices available
    gpuErrchk( cudaGetDeviceCount(&devices_count) );
    for(int device_index = 0; device_index < devices_count; ++device_index){
        // gets current index device's properties
        cudaDeviceProp this_device_properties;
        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );

        // stores best available device's index
        // only devices with compute capability >= 2.0 are able to run the code
        if (this_device_properties.major >= 2
            && max_multiprocessors < this_device_properties.multiProcessorCount){
            // stores devices properties for later use
            device_properties = this_device_properties;
            max_multiprocessors = this_device_properties.multiProcessorCount;
            max_device = device_index;
        }
    }

    // without this check device 0 would be selected blindly and
    // device_properties.maxThreadsPerBlock would be left at 0
    if (max_device < 0){
        printf("No CUDA device with compute capability >= 2.0 was found.\n");
        exit(EXIT_FAILURE);
    }

    // sets the device
    gpuErrchk( cudaSetDevice(max_device) );
    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
            "Max threads per block supported are %d\n\n"
            , device_properties.name
            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
            , device_properties.maxThreadsPerBlock);
    }
}

// Runs mean shift iterations until the norm of the mean shift vector is no
// longer greater than opt->epsilon.
//
// original_points: NUMBER_OF_POINTS x DIMENSIONS input array; it is not modified.
// shifted_points:  output; *shifted_points is overwritten with a newly allocated
//                  NUMBER_OF_POINTS x DIMENSIONS array holding the points after
//                  the last iteration. The caller owns it and releases it with
//                  free((*shifted_points)[0]) followed by free(*shifted_points).
// deviation:       forwarded to calculate_kernel_matrix.
// opt:             only opt->epsilon is read here; the verbose/display switches
//                  are taken from the global params.
//
// Returns the zero-based index of the last iteration that was executed.
int meanshift(double **original_points, double ***shifted_points, int deviation
    , parameters *opt){
    int iteration = 0;

    // the first iteration starts from a private copy of the original points
    (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
    duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);

    // scratch buffers reused by every iteration; each element of
    // mean_shift_vector is written before the norm is taken
    double **mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
    double **kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);

    // TODO move arrays to device and create global kernel for the iteration
    while (true){
        // finds pairwise distance matrix (inside radius)
        // [I, D] = rangesearch(x,y,h);
        calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);

        // one value per point, in a heap buffer returned by the callee and
        // released below (presumably the row sums of kernel_matrix, as in the
        // CPU loop this call replaced -- confirm against denominator_kernel)
        double *denominator = calculate_denominator(kernel_matrix);

        // creates new y vector
        double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);

        // builds nominator
        multiply(kernel_matrix, original_points, &new_shift);

        // divides element-wise
        for (int i=0; i<NUMBER_OF_POINTS; i++){
            for (int j=0; j<DIMENSIONS; j++){
                new_shift[i][j] = new_shift[i][j] / denominator[i];
                // calculates mean-shift vector at the same time
                mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
            }
        }
        free(denominator);

        // frees previously shifted points (data block and row pointers) and
        // publishes the new array through the caller's pointer
        free((*shifted_points)[0]);
        free(*shifted_points);
        (*shifted_points) = new_shift;

        if (params.display){
            save_matrix((*shifted_points), iteration);
        }

        // calculates norm of the new mean shift vector
        double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
        if (params.verbose){
            printf("Iteration n. %d, error %f \n", iteration, current_norm);
        }

        /** iterates until convergence **/
        // written as a negated '>' so that a NaN norm also stops the loop
        if (!(current_norm > opt->epsilon)){
            break;
        }
        ++iteration;
    }

    // cleans up allocations
    free(mean_shift_vector[0]);
    free(mean_shift_vector);
    free(kernel_matrix[0]);
    free(kernel_matrix);

    return iteration;
}


// Returns the square root of the sum of the squares of all rows x cols
// elements of matrix. Elements are accumulated row by row, left to right.
double norm(double **matrix, int rows, int cols){
    double squares_total = 0;
    for (int r = 0; r < rows; ++r) {
        const double *row = matrix[r];
        for (int c = 0; c < cols; ++c) {
            const double squared = row[c] * row[c];
            squares_total = squares_total + squared;
        }
    }
    return sqrt(squares_total);
}

// Fills *kernel_matrix (NUMBER_OF_POINTS x NUMBER_OF_POINTS, host memory) with
// the output of calculate_kernel_matrix_kernel for the given point sets.
//
// shifted_points / original_points: NUMBER_OF_POINTS x DIMENSIONS host arrays.
//     Each is copied with a single cudaMemcpy starting at [0][0], so the data
//     must be one contiguous block (as produced by alloc_2d_double).
// deviation: passed through to the kernel.
// kernel_matrix: *kernel_matrix must already be allocated as one contiguous
//     block; it is overwritten with the result.
//
// The kernel is launched with square blocks, starting at the largest side that
// fits in maxThreadsPerBlock and shrinking by one whenever the launch is
// rejected. The process is terminated if no block size is accepted.
void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
    , double ***kernel_matrix){
    static bool first_iter = true;

    // byte counts are computed in size_t: N * N * sizeof(double) does not fit
    // in an int for large datasets
    const size_t points_bytes = (size_t)NUMBER_OF_POINTS * (size_t)DIMENSIONS * sizeof(double);
    const size_t kernel_bytes = (size_t)NUMBER_OF_POINTS * (size_t)NUMBER_OF_POINTS * sizeof(double);

    // allocates memory for shifted_points in GPU and copies the array
    Matrix d_shifted_points;
    d_shifted_points.width = DIMENSIONS;
    d_shifted_points.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_shifted_points.elements, points_bytes) );
    gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
        , points_bytes, cudaMemcpyHostToDevice) );

    // allocates memory for original_points in GPU and copies the array
    Matrix d_original_points;
    d_original_points.width = DIMENSIONS;
    d_original_points.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_original_points.elements, points_bytes) );
    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
        , points_bytes, cudaMemcpyHostToDevice) );

    // allocates memory for kernel_matrix in GPU
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );

    // get max sizes supported from the device
    int requested_block_size = (int)sqrt((double)device_properties.maxThreadsPerBlock);

    dim3 dimBlock;
    dim3 dimGrid;

    // a block side of 0 (no usable device properties) is an invalid configuration
    cudaError_t launch_status = cudaErrorInvalidConfiguration;
    while (true){
        // stops instead of dividing by a zero block size below
        if (requested_block_size < 1){
            fprintf(stderr, "calculate_kernel_matrix_kernel could not be launched with any block size: %s\n"
                , cudaGetErrorString(launch_status));
            exit(EXIT_FAILURE);
        }

        dimBlock.x = requested_block_size;
        dimBlock.y = requested_block_size;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;

        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
            , deviation, d_kernel_matrix);
        // cudaGetLastError also clears the error, so the next attempt starts clean
        launch_status = cudaGetLastError();
        if (launch_status == cudaSuccess){
            break;
        }
        --requested_block_size;
    }
    gpuErrchk( cudaDeviceSynchronize() );

    if (first_iter && params.verbose){
        printf("calculate_kernel_matrix_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
        , kernel_bytes, cudaMemcpyDeviceToHost) );

    gpuErrchk( cudaFree(d_shifted_points.elements) );
    gpuErrchk( cudaFree(d_original_points.elements) );
    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
}


// Runs multiply_kernel on kernel_matrix and original_points and copies its
// output into *new_shift.
//
// kernel_matrix:   NUMBER_OF_POINTS x NUMBER_OF_POINTS host array.
// original_points: NUMBER_OF_POINTS x DIMENSIONS host array.
// new_shift:       *new_shift must already be allocated as
//                  NUMBER_OF_POINTS x DIMENSIONS; it is overwritten.
//
// All three arrays are transferred with a single cudaMemcpy starting at
// [0][0], so each must be one contiguous block (as produced by
// alloc_2d_double).
void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
    static bool first_iter = true;

    // byte counts are computed in size_t: N * N * sizeof(double) does not fit
    // in an int for large datasets
    const size_t kernel_bytes = (size_t)NUMBER_OF_POINTS * (size_t)NUMBER_OF_POINTS * sizeof(double);
    const size_t points_bytes = (size_t)NUMBER_OF_POINTS * (size_t)DIMENSIONS * sizeof(double);

    // allocates memory for kernel_matrix in GPU and copies the array
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );
    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
        , kernel_bytes, cudaMemcpyHostToDevice) );

    // allocates memory for original_points in GPU and copies the array
    Matrix d_original_points;
    d_original_points.width = DIMENSIONS;
    d_original_points.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_original_points.elements, points_bytes) );
    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
        , points_bytes, cudaMemcpyHostToDevice) );

    // allocates memory for new_shift in GPU
    Matrix d_new_shift;
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_new_shift.elements, points_bytes) );

    // get max sizes supported from the device: the block is at most
    // max_side x max_side threads and never larger than the output itself
    const int max_side = (int)sqrt((double)device_properties.maxThreadsPerBlock);
    dim3 dimBlock((d_new_shift.height < max_side) ? d_new_shift.height : max_side
        , (d_new_shift.width < max_side) ? d_new_shift.width : max_side);
    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);

    if (first_iter && params.verbose){
        printf("multiply_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
        , points_bytes, cudaMemcpyDeviceToHost) );

    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_original_points.elements) );
    gpuErrchk( cudaFree(d_new_shift.elements) );
}

// Returns the Euclidean distance between y and x, reading the first
// DIMENSIONS components of each.
double calculateDistance(double *y, double *x){
    double squared_total = 0;
    for (int d = 0; d < DIMENSIONS; ++d){
        const double delta = y[d] - x[d];
        squared_total += delta * delta;
    }
    return sqrt(squared_total);
}

// Allocates a rows x cols matrix of doubles as one contiguous data block plus
// an array of row pointers into it, so that array[i][j] addresses the element
// at offset i * cols + j of the block starting at array[0].
//
// The elements are left uninitialised. The matrix is released with
// free(array[0]) followed by free(array).
//
// Terminates the process when a size is negative or an allocation fails.
double **alloc_2d_double(int rows, int cols) {
    if (rows < 0 || cols < 0){
        fprintf(stderr, "alloc_2d_double: invalid size %d x %d\n", rows, cols);
        exit(EXIT_FAILURE);
    }

    // sizes are computed in size_t so rows * cols cannot overflow an int
    const size_t element_count = (size_t)rows * (size_t)cols;
    double *data = (double *) malloc(element_count * sizeof(double));
    double **array = (double **) malloc((size_t)rows * sizeof(double*));

    // malloc(0) may legitimately return NULL, so only non-empty requests count
    // as failures
    if ((data == NULL && element_count > 0) || (array == NULL && rows > 0)){
        fprintf(stderr, "alloc_2d_double: out of memory allocating %d x %d doubles\n", rows, cols);
        free(data);
        free(array);
        exit(EXIT_FAILURE);
    }

    for (int i=0; i<rows; i++)
        array[i] = data + (size_t)cols * (size_t)i;
    return array;
}

// Copies the rows x cols elements of source into the matrix *dest, which must
// already be allocated with at least the same dimensions.
void duplicate(double **source, int rows, int cols, double ***dest){
    double **target = *dest;
    for (int r = 0; r < rows; ++r){
        const double *from = source[r];
        double *to = target[r];
        for (int c = 0; c < cols; ++c){
            to[c] = from[c];
        }
    }
}

// Prints array to stdout transposed: each output line holds one column of the
// rows x cols matrix, i.e. array[0][c], array[1][c], ... separated by spaces.
void print_matrix(double **array, int rows, int cols){
    for (int c = 0; c < cols; ++c){
        for (int r = 0; r < rows; ++r){
            const double value = array[r][c];
            printf("%f ", value);
        }
        printf("\n");
    }
}

// Writes the NUMBER_OF_POINTS x DIMENSIONS matrix to the text file
// "../output/output_<iteration>", one point per line with comma separated
// components.
//
// If the file cannot be opened a warning is printed to stderr and nothing is
// written.
void save_matrix(double **matrix, int iteration){
    char filename[50];
    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);

    FILE *file = fopen(filename, "w");
    if (file == NULL){
        fprintf(stderr, "Couldn't open %s for writing, skipping output of iteration %d\n"
            , filename, iteration);
        return;
    }

    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
        for (int cols=0; cols<DIMENSIONS; ++cols){
            fprintf(file, "%f", matrix[rows][cols]);
            // no separator after the last component of a point
            if (cols != DIMENSIONS - 1){
                fprintf(file, ",");
            }
        }
        fprintf(file, "\n");
    }

    // flushes the data and releases the handle; one file is written per iteration
    fclose(file);
}

// Runs denominator_kernel on kernel_matrix and returns its output.
//
// kernel_matrix: NUMBER_OF_POINTS x NUMBER_OF_POINTS host array. It is copied
//     with a single cudaMemcpy starting at [0][0], so the data must be one
//     contiguous block (as produced by alloc_2d_double).
//
// Returns a newly malloc'ed array of NUMBER_OF_POINTS doubles copied back from
// the device; the caller owns it and must free() it. Terminates the process if
// the host buffer cannot be allocated.
double * calculate_denominator(double **kernel_matrix){
    static bool first_iter = true;

    // byte counts are computed in size_t: N * N * sizeof(double) does not fit
    // in an int for large datasets
    const size_t denominator_bytes = (size_t)NUMBER_OF_POINTS * sizeof(double);
    const size_t kernel_bytes = (size_t)NUMBER_OF_POINTS * (size_t)NUMBER_OF_POINTS * sizeof(double);

    // allocates memory for denominator_matrix in GPU
    Matrix d_denominator_matrix;
    d_denominator_matrix.width = NUMBER_OF_POINTS;
    d_denominator_matrix.height = 1;
    gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, denominator_bytes) );

    // allocates memory for kernel_matrix in GPU and copies the array
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );
    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
            , kernel_bytes, cudaMemcpyHostToDevice) );

    // get max sizes supported from the device: the block is at most
    // max_side x max_side threads and never larger than the output itself
    const int max_side = (int)sqrt((double)device_properties.maxThreadsPerBlock);
    dim3 dimBlock((d_denominator_matrix.height < max_side) ? d_denominator_matrix.height : max_side
            , (d_denominator_matrix.width < max_side) ? d_denominator_matrix.width : max_side);
    dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
            , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);

    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    // host buffer for the result: a flat array of doubles, written directly by
    // the copy below
    double *denominator = (double *)malloc(denominator_bytes);
    if (denominator == NULL){
        fprintf(stderr, "calculate_denominator: out of memory\n");
        exit(EXIT_FAILURE);
    }
    gpuErrchk( cudaMemcpy(denominator, d_denominator_matrix.elements
            , denominator_bytes, cudaMemcpyDeviceToHost) );

    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_denominator_matrix.elements) );

    return denominator;
}
Add kernels header and implementation, fix Makefile 7 years ago			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <math.h>`
			`#include <float.h>`
			`#include <string.h>`

			`#include "meanshift_utils.h"`
			`#include "meanshift_kernels.h"`

			`#define OUTPUT_PREFIX "../output/output_"`
Minor tweaks 7 years ago
			`cudaDeviceProp device_properties;`

			`void get_args(int argc, char *argv, parameters params){`
			`if (argc < 7) {`
			`printf("Usage: %s h e N D Pd Pl\nwhere:\n"`
			`"\th is the variance\n"`
			`"\te is the min distance, between two points, that is taken into account in computations\n"`
			`"\tN is the the number of points\n"`
			`"\tD is the number of dimensions of each point\n"`
			`"\tPd is the path of the dataset file\n"`
			`"\tPl is the path of the labels file\n"`
			`"\n\t--verbose \| -v is an optional flag to enable execution information output"`
			`"\n\t--output \| -o is an optional flag to enable points output in each iteration", argv[0]);`
Add kernels header and implementation, fix Makefile 7 years ago			`exit(1);`
			`}`

			`DEVIATION = atoi(argv[1]);`
Minor tweaks 7 years ago			`params->epsilon = atof(argv[2]);`
			`NUMBER_OF_POINTS = atoi(argv[3]);`
			`DIMENSIONS = atoi(argv[4]);`
			`POINTS_FILENAME = argv[5];`
			`LABELS_FILENAME = argv[6];`
			`params->verbose = false;`
			`params->display = false;`

			`if (argc > 7){`
			`for (int index=7; index<argc; ++index){`
			`if (!strcmp(argv[index], "--verbose") \|\| !strcmp(argv[index], "-v")){`
			`params->verbose = true;`
			`} else if (!strcmp(argv[index], "--output") \|\| !strcmp(argv[index], "-o")){`
			`params->display = true;`
			`} else {`
			`printf("Couldn't parse argument %d: %s\n", index, argv[index]);`
			`exit(EXIT_FAILURE);`
			`}`
			`}`
			`}`

			`/*printf("DEVIATION = %d\n"`
			`"epsilon = %f\n"`
			`"NUMBER_OF_POINTS = %d\n"`
			`"DIMENSIONS = %d\n"`
			`"POINTS_FILENAME = %s\n"`
			`"LABELS_FILENAME = %s\n"`
			`"verbose = %d\n"`
			`"display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME`
			`, LABELS_FILENAME, params->verbose, params->display);*/`
Add kernels header and implementation, fix Makefile 7 years ago			`}`

Minor tweaks 7 years ago			`void init(double *vectors, char labels){`
Add kernels header and implementation, fix Makefile 7 years ago			`int bytes_read = 0;`
Minor tweaks 7 years ago
			`set_Gpu();`

			`if (params.verbose){`
			`printf("Reading dataset and labels...\n");`
			`}`

Add kernels header and implementation, fix Makefile 7 years ago			`// initializes vectors`
			`FILE *points_file;`
			`points_file = fopen(POINTS_FILENAME, "rb");`
			`if (points_file != NULL){`
			`// allocates memory for the array`
			`(*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);`
			`// reads vectors dataset from file`
			`for (int i=0; i<NUMBER_OF_POINTS; i++){`
			`bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);`
			`if ( bytes_read != DIMENSIONS ){`
			`if(feof(points_file)){`
			`printf("Premature end of file reached.\n");`
			`} else{`
			`printf("Error reading points file.");`
			`}`
			`fclose(points_file);`
			`exit(EXIT_FAILURE);`
			`}`
			`}`
			`} else {`
			`printf("Error reading dataset file.\n");`
			`exit(EXIT_FAILURE);`
			`}`
			`fclose(points_file);`

			`// initializes file that will contain the labels (train)`
			`FILE *labels_file;`
			`labels_file = fopen(LABELS_FILENAME, "rb");`
			`if (labels_file != NULL){`
			`// NOTE : Labels were classified as <class 'numpy.uint8'>`
			`// variables of type uint8 are stored as 1-byte (8-bit) unsigned integers`
			`// gets number of labels`
			`fseek(labels_file, 0L, SEEK_END);`
			`long int pos = ftell(labels_file);`
			`rewind(labels_file);`
			`int label_elements = pos/ sizeof(char);`

			`// allocates memory for the array`
			`labels = (char)malloc(label_elements* sizeof(char));`
			`fseek(labels_file, 0L, SEEK_SET);`
			`bytes_read = fread((*labels), sizeof(char), label_elements, labels_file);`
			`if ( bytes_read != label_elements ){`
			`if(feof(points_file)){`
			`printf("Premature end of file reached.\n");`
			`} else{`
			`printf("Error reading points file.");`
			`}`
			`fclose(labels_file);`
			`exit(EXIT_FAILURE);`
			`}`
			`}`
			`fclose(labels_file);`

Minor tweaks 7 years ago			`if (params.verbose){`
			`printf("Done.\n\n");`
			`}`
			`}`

			`//Based on https://stackoverflow.com/a/28113186`
			`//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU`
			`void set_Gpu(){`
			`int devices_count = 0, max_multiprocessors = 0, max_device = 0;`

			`// gets devices count checking for errors like no devices or no drivers to check for`
			`// devices available`
			`gpuErrchk( cudaGetDeviceCount(&devices_count) );`
			`for(int device_index = 0; device_index < devices_count; ++device_index){`
			`// gets current index device's properties`
			`cudaDeviceProp this_device_properties;`
			`gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );`

			`// stores best available device's index`
			`// only devices with compute capability >= 2.0 are able to run the code`
			`if (max_multiprocessors < this_device_properties.multiProcessorCount`
			`&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){`
			`// stores devices properties for later use`
			`device_properties = this_device_properties;`
			`max_multiprocessors = this_device_properties.multiProcessorCount;`
			`max_device = device_index;`
			`}`
			`}`
			`// sets the device`
			`gpuErrchk( cudaSetDevice(max_device) );`
			`if (params.verbose){`
			`printf("Device chosen is \"%s\"\n"`
			`"Device has %d multi processors and compute capability %d.%d\n"`
Add calculate_kernel_matrix_kernel 7 years ago			`"Max threads per block supported are %d\n\n"`
Minor tweaks 7 years ago			`, device_properties.name`
			`, device_properties.multiProcessorCount, device_properties.major, device_properties.minor`
Add calculate_kernel_matrix_kernel 7 years ago			`, device_properties.maxThreadsPerBlock);`
Minor tweaks 7 years ago			`}`
Add kernels header and implementation, fix Makefile 7 years ago			`}`

			`int meanshift(double original_points, double *shifted_points, int deviation`
			`, parameters *opt){`
			`static int iteration = 0;`
			`static double mean_shift_vector, kernel_matrix, *denominator;`

			`// allocates memory and copies original points on first iteration`
			`if (iteration == 0 \|\| (*shifted_points) == NULL){`
			`(*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);`
			`duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);`

			`// allocates memory for mean shift vector`
			`mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);`
			`// initializes elements of mean_shift_vector to inf`
			`for (int i=0;i<NUMBER_OF_POINTS;i++){`
			`for (int j=0;j<DIMENSIONS;j++){`
			`mean_shift_vector[i][j] = DBL_MAX;`
			`}`
			`}`

			`// allocates memory for other arrays needed`
			`kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);`
			`denominator = (double )malloc(NUMBER_OF_POINTS sizeof(double));`
			`}`
kernel calculate denominator 7 years ago			`// TODO move arrays to device and create global kernel for the iteration`
Add kernels header and implementation, fix Makefile 7 years ago
			`// finds pairwise distance matrix (inside radius)`
			`// [I, D] = rangesearch(x,y,h);`
Add calculate_kernel_matrix_kernel 7 years ago			`calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);`
kernel calculate denominator 7 years ago
			`// // calculate denominator`
			`// for (int i=0; i<NUMBER_OF_POINTS; i++){`
			`// double sum = 0;`
			`// for (int j=0; j<NUMBER_OF_POINTS; j++){`
			`// sum = sum + kernel_matrix[i][j];`
			`// }`
			`// denominator[i] = sum;`
			`// }`
compile error fixex 7 years ago			`denominator = calculate_denominator(kernel_matrix);`
Add kernels header and implementation, fix Makefile 7 years ago
			`// creates new y vector`
			`double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);`

			`// builds nominator`
Fix multiply, Add error checking for CUDA calls 7 years ago			`multiply(kernel_matrix, original_points, &new_shift);`
Add kernels header and implementation, fix Makefile 7 years ago
			`// divides element-wise`
			`for (int i=0; i<NUMBER_OF_POINTS; i++){`
			`for (int j=0; j<DIMENSIONS; j++){`
			`new_shift[i][j] = new_shift[i][j] / denominator[i];`
			`// calculates mean-shift vector at the same time`
			`mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];`
			`}`
			`}`

			`// frees previously shifted points, they're now garbage`
			`free((*shifted_points)[0]);`
			`// updates shifted points pointer to the new array address`
			`shifted_points = &new_shift;`

Minor tweaks 7 years ago			`if (params.display){`
			`save_matrix((*shifted_points), iteration);`
			`}`
Add kernels header and implementation, fix Makefile 7 years ago
			`// calculates norm of the new mean shift vector`
			`double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);`
Minor tweaks 7 years ago			`if (params.verbose){`
			`printf("Iteration n. %d, error %f \n", iteration, current_norm);`
			`}`
Add kernels header and implementation, fix Makefile 7 years ago
			`/ iterates until convergence /`
			`if (current_norm > opt->epsilon) {`
			`++iteration;`
			`meanshift(original_points, shifted_points, deviation, opt);`
			`}`

			`if (iteration == 0){`
Minor tweaks 7 years ago			`// cleans up allocations`
Add kernels header and implementation, fix Makefile 7 years ago			`free(mean_shift_vector[0]);`
			`free(mean_shift_vector);`
			`free(kernel_matrix[0]);`
			`free(kernel_matrix);`
			`free(denominator);`
			`}`

			`return iteration;`
			`}`

kernel calculate denominator 7 years ago
Add kernels header and implementation, fix Makefile 7 years ago			`double norm(double **matrix, int rows, int cols){`
			`double sum=0, temp_mul=0;`
			`for (int i=0; i<rows; i++) {`
			`for (int j=0; j<cols; j++) {`
			`temp_mul = matrix[i][j] * matrix[i][j];`
			`sum = sum + temp_mul;`
			`}`
			`}`
			`double norm = sqrt(sum);`
			`return norm;`
			`}`

Add calculate_kernel_matrix_kernel 7 years ago			`void calculate_kernel_matrix(double shifted_points, double original_points, double deviation`
			`, double ***kernel_matrix){`
			`static bool first_iter = true;`

			`// allocates memory for shifted_points in GPU and copies the array`
			`Matrix d_shifted_points;`
			`d_shifted_points.width = DIMENSIONS;`
			`d_shifted_points.height = NUMBER_OF_POINTS;`
			`int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);`
			`gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );`
			`gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])`
			`, size, cudaMemcpyHostToDevice) );`

			`// allocates memory for original_points in GPU and copies the array`
			`Matrix d_original_points;`
			`d_original_points.width = DIMENSIONS;`
			`d_original_points.height = NUMBER_OF_POINTS;`
			`size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);`
			`gpuErrchk( cudaMalloc(&d_original_points.elements, size) );`
			`gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])`
			`, size, cudaMemcpyHostToDevice) );`

			`// allocates memory for kernel_matrix in GPU`
			`Matrix d_kernel_matrix;`
			`d_kernel_matrix.width = NUMBER_OF_POINTS;`
			`d_kernel_matrix.height = NUMBER_OF_POINTS;`
			`size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);`
			`gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );`

			`// get max sizes supported from the device`
			`int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);`
			`int requested_block_size = max_block_size;`
			`bool block_size_too_big = true;`

			`dim3 dimBlock;`
			`dim3 dimGrid;`

			`do {`
			`dimBlock.x = requested_block_size;`
			`dimBlock.y = requested_block_size;`
			`dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;`
			`dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;`

			`calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points`
			`, deviation, d_kernel_matrix);`
			`if (cudaGetLastError() != cudaSuccess){`
			`--requested_block_size;`
			`} else {`
			`block_size_too_big = false;`
			`gpuErrchk( cudaDeviceSynchronize() );`
			`}`
			`} while(block_size_too_big);`

			`if (first_iter && params.verbose){`
			`printf("calculate_kernel_matrix_kernel called with:\n");`
			`printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);`
			`printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);`
			`first_iter = false;`
			`}`

			`size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);`
			`gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements`
			`, size, cudaMemcpyDeviceToHost) );`

			`gpuErrchk( cudaFree(d_shifted_points.elements) );`
			`gpuErrchk( cudaFree(d_original_points.elements) );`
			`gpuErrchk( cudaFree(d_kernel_matrix.elements) );`
			`}`


/**
 * Runs multiply_kernel on the GPU over kernel_matrix and original_points and
 * copies the kernel's output into *new_shift.
 *
 * @param kernel_matrix   host matrix, NUMBER_OF_POINTS x NUMBER_OF_POINTS
 * @param original_points host matrix, NUMBER_OF_POINTS x DIMENSIONS
 * @param new_shift       pointer to an already allocated host matrix,
 *                        NUMBER_OF_POINTS x DIMENSIONS, that receives the result
 *
 * Each host matrix is transferred with a single cudaMemcpy starting at its
 * [0][0] element, so its rows must be stored contiguously (the layout
 * alloc_2d_double produces). All device buffers are allocated and released
 * inside this call.
 */
void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
    static bool first_iter = true;

    // Byte counts are computed in size_t: NUMBER_OF_POINTS^2 * sizeof(double)
    // no longer fits in an int once NUMBER_OF_POINTS reaches 16384.
    const size_t points = (size_t)NUMBER_OF_POINTS;
    const size_t dimensions = (size_t)DIMENSIONS;
    const size_t kernel_bytes = points * points * sizeof(double);
    const size_t points_bytes = points * dimensions * sizeof(double);

    // allocates memory for kernel_matrix in GPU and copies the array
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );
    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
        , kernel_bytes, cudaMemcpyHostToDevice) );

    // allocates memory for original_points in GPU and copies the array
    Matrix d_original_points;
    d_original_points.width = DIMENSIONS;
    d_original_points.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_original_points.elements, points_bytes) );
    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
        , points_bytes, cudaMemcpyHostToDevice) );

    // allocates memory for new_shift in GPU
    Matrix d_new_shift;
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_new_shift.elements, points_bytes) );

    // get max sizes supported from the device; a square block of side
    // floor(sqrt(maxThreadsPerBlock)) never exceeds the per-block thread limit
    int max_block_size = device_properties.maxThreadsPerBlock;
    int block_side = (int)sqrt((double)max_block_size);
    dim3 dimBlock((d_new_shift.height < block_side) ? d_new_shift.height : block_side
        , (d_new_shift.width < block_side) ? d_new_shift.width : block_side);
    // ceil-division so the grid covers the whole output matrix
    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);

    // launch configuration is reported only on the first call
    if (first_iter && params.verbose){
        printf("multiply_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
    gpuErrchk( cudaPeekAtLastError() );     // launch-configuration errors
    gpuErrchk( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    // copies the result back to the host
    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
        , points_bytes, cudaMemcpyDeviceToHost) );

    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_original_points.elements) );
    gpuErrchk( cudaFree(d_new_shift.elements) );
}

/**
 * Returns the Euclidean distance between two points.
 *
 * @param y first point, an array of at least DIMENSIONS coordinates
 * @param x second point, an array of at least DIMENSIONS coordinates
 * @return square root of the sum of squared coordinate differences
 */
double calculateDistance(double *y, double *x){
    double sum = 0, dif;
    for (int i=0; i<DIMENSIONS; i++){
        dif = y[i]-x[i];
        sum += dif * dif;
    }
    return sqrt(sum);
}

			/**
			 * Allocates a rows x cols matrix of doubles whose elements live in one
			 * contiguous buffer, plus an array of row pointers into that buffer.
			 *
			 * Because the storage is contiguous, &array[0][0] addresses all
			 * rows*cols elements and can be handed to a single memcpy/cudaMemcpy.
			 *
			 * @param rows number of rows, must be positive
			 * @param cols number of columns, must be positive
			 * @return the row-pointer array, or NULL when a dimension is not positive,
			 *         the byte count would overflow, or an allocation fails.
			 *         Release with free(array[0]) followed by free(array).
			 */
			double **alloc_2d_double(int rows, int cols) {
			    if (rows <= 0 || cols <= 0){
			        return NULL;
			    }

			    const size_t n_rows = (size_t)rows;
			    const size_t n_cols = (size_t)cols;

			    // refuse sizes whose byte count does not fit in size_t
			    if (n_cols > ((size_t)-1) / sizeof(double) / n_rows){
			        return NULL;
			    }

			    double *data = (double *) malloc(n_rows * n_cols * sizeof(double));
			    if (data == NULL){
			        return NULL;
			    }

			    double **array = (double **) malloc(n_rows * sizeof(double *));
			    if (array == NULL){
			        free(data);     // do not leak the element buffer
			        return NULL;
			    }

			    // row i starts cols elements after row i-1
			    for (size_t i=0; i<n_rows; i++){
			        array[i] = &(data[n_cols*i]);
			    }
			    return array;
			}

			/**
			 * Copies every element of a rows x cols matrix into another matrix.
			 *
			 * @param source matrix to read, at least rows x cols
			 * @param rows   number of rows to copy
			 * @param cols   number of columns to copy
			 * @param dest   pointer to the destination matrix; the matrix it points
			 *               to must already be allocated with at least rows x cols
			 *               elements, nothing is allocated here
			 */
			void duplicate(double **source, int rows, int cols, double ***dest){
			    for (int i=0; i<rows; i++){
			        for (int j=0; j<cols; j++){
			            (*dest)[i][j] = source[i][j];
			        }
			    }
			}

			/**
			 * Prints a rows x cols matrix to stdout, one output line per column:
			 * line c lists array[0][c], array[1][c], ... array[rows-1][c], each
			 * value followed by a space.
			 *
			 * @param array matrix to print, indexed as array[row][column]
			 * @param rows  number of rows in array
			 * @param cols  number of columns in array
			 */
			void print_matrix(double **array, int rows, int cols){
			    int col = 0;
			    while (col < cols){
			        // emit this column's value from every row on a single line
			        int row = 0;
			        while (row < rows){
			            printf("%f ", array[row][col]);
			            ++row;
			        }
			        printf("\n");
			        ++col;
			    }
			}

			/**
			 * Writes a NUMBER_OF_POINTS x DIMENSIONS matrix to the file
			 * "../output/output_<iteration>" as comma-separated values, one row
			 * per line.
			 *
			 * If the file cannot be opened, the error is reported on stderr and the
			 * function returns without writing anything.
			 *
			 * @param matrix    matrix to write, indexed as matrix[point][dimension]
			 * @param iteration number appended to the output file name
			 */
			void save_matrix(double **matrix, int iteration){
			    char filename[50];
			    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);

			    FILE *file = fopen(filename, "w");
			    if (file == NULL){
			        // e.g. the output directory does not exist
			        perror(filename);
			        return;
			    }

			    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
			        for (int cols=0; cols<DIMENSIONS; ++cols){
			            fprintf(file, "%f", matrix[rows][cols]);
			            // separator between values, none after the last one
			            if (cols != DIMENSIONS - 1){
			                fprintf(file, ",");
			            }
			        }
			        fprintf(file, "\n");
			    }

			    // closing flushes buffered data and releases the handle; a failure
			    // here means the data may not have reached the disk
			    if (fclose(file) != 0){
			        perror(filename);
			    }
			}
kernel calculate denominator 7 years ago
/**
 * Runs denominator_kernel on the GPU over kernel_matrix and returns the
 * kernel's NUMBER_OF_POINTS output values in a newly allocated host array.
 *
 * @param kernel_matrix host matrix, NUMBER_OF_POINTS x NUMBER_OF_POINTS; it is
 *                      transferred with a single cudaMemcpy starting at its
 *                      [0][0] element, so its rows must be stored contiguously
 * @return malloc'd array of NUMBER_OF_POINTS doubles; the caller owns it and
 *         must free() it. The process exits if the host allocation fails.
 */
double * calculate_denominator(double **kernel_matrix){
    static bool first_iter = true;

    // Byte counts are computed in size_t: NUMBER_OF_POINTS^2 * sizeof(double)
    // no longer fits in an int once NUMBER_OF_POINTS reaches 16384.
    const size_t points = (size_t)NUMBER_OF_POINTS;
    const size_t denominator_bytes = points * sizeof(double);
    const size_t kernel_bytes = points * points * sizeof(double);

    // allocates memory for denominator_matrix in GPU
    Matrix d_denominator_matrix;
    d_denominator_matrix.width = NUMBER_OF_POINTS;
    d_denominator_matrix.height = 1;
    gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, denominator_bytes) );

    // allocates memory for kernel_matrix in GPU and copies the array
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );
    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
        , kernel_bytes, cudaMemcpyHostToDevice) );

    // get max sizes supported from the device; a square block of side
    // floor(sqrt(maxThreadsPerBlock)) never exceeds the per-block thread limit
    int max_block_size = device_properties.maxThreadsPerBlock;
    int block_side = (int)sqrt((double)max_block_size);
    dim3 dimBlock((d_denominator_matrix.height < block_side) ? d_denominator_matrix.height : block_side
        , (d_denominator_matrix.width < block_side) ? d_denominator_matrix.width : block_side);
    // ceil-division so the grid covers the whole output
    dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
        , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);

    // launch configuration is reported only on the first call
    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
    gpuErrchk( cudaPeekAtLastError() );     // launch-configuration errors
    gpuErrchk( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    // The host buffer is a flat array of doubles that receives the copy
    // directly; it must not be reached through a further pointer read out of
    // the freshly allocated (uninitialised) memory.
    double *denominator = (double *)malloc(denominator_bytes);
    if (denominator == NULL){
        fprintf(stderr, "calculate_denominator: could not allocate %zu bytes\n"
            , denominator_bytes);
        exit(EXIT_FAILURE);
    }
    gpuErrchk( cudaMemcpy(denominator, d_denominator_matrix.elements
        , denominator_bytes, cudaMemcpyDeviceToHost) );

    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_denominator_matrix.elements) );

    return denominator;
}