#include #include #include #include #include #include "meanshift_utils.h" #include "meanshift_kernels.h" #define OUTPUT_PREFIX "../output/output_" cudaDeviceProp device_properties; void get_args(int argc, char **argv, parameters *params){ if (argc < 7) { printf("Usage: %s h e N D Pd Pl\nwhere:\n" "\th is the variance\n" "\te is the min distance, between two points, that is taken into account in computations\n" "\tN is the the number of points\n" "\tD is the number of dimensions of each point\n" "\tPd is the path of the dataset file\n" "\tPl is the path of the labels file\n" "\n\t--verbose | -v is an optional flag to enable execution information output" "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]); exit(1); } DEVIATION = atoi(argv[1]); params->epsilon = atof(argv[2]); NUMBER_OF_POINTS = atoi(argv[3]); DIMENSIONS = atoi(argv[4]); POINTS_FILENAME = argv[5]; LABELS_FILENAME = argv[6]; params->verbose = false; params->display = false; if (argc > 7){ for (int index=7; indexverbose = true; } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){ params->display = true; } else { printf("Couldn't parse argument %d: %s\n", index, argv[index]); exit(EXIT_FAILURE); } } } /*printf("DEVIATION = %d\n" "epsilon = %f\n" "NUMBER_OF_POINTS = %d\n" "DIMENSIONS = %d\n" "POINTS_FILENAME = %s\n" "LABELS_FILENAME = %s\n" "verbose = %d\n" "display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME , LABELS_FILENAME, params->verbose, params->display);*/ } void init(double ***vectors, char **labels){ int bytes_read = 0; set_Gpu(); if (params.verbose){ printf("Reading dataset and labels...\n"); } // initializes vectors FILE *points_file; points_file = fopen(POINTS_FILENAME, "rb"); if (points_file != NULL){ // allocates memory for the array (*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); // reads vectors dataset from file for (int i=0; i // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers // gets number of labels fseek(labels_file, 0L, SEEK_END); long int pos = ftell(labels_file); rewind(labels_file); int label_elements = pos/ sizeof(char); // allocates memory for the array *labels = (char*)malloc(label_elements* sizeof(char)); fseek(labels_file, 0L, SEEK_SET); bytes_read = fread((*labels), sizeof(char), label_elements, labels_file); if ( bytes_read != label_elements ){ if(feof(points_file)){ printf("Premature end of file reached.\n"); } else{ printf("Error reading points file."); } fclose(labels_file); exit(EXIT_FAILURE); } } fclose(labels_file); if (params.verbose){ printf("Done.\n\n"); } } //Based on https://stackoverflow.com/a/28113186 //Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU void set_Gpu(){ int devices_count = 0, max_multiprocessors = 0, max_device = 0; // gets devices count checking for errors like no devices or no drivers to check for // devices available gpuErrchk( cudaGetDeviceCount(&devices_count) ); for(int device_index = 0; device_index < devices_count; ++device_index){ // gets current index device's properties cudaDeviceProp this_device_properties; gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) ); // stores best available device's index // only devices with compute capability >= 2.0 are able to run the code if (max_multiprocessors < this_device_properties.multiProcessorCount && this_device_properties.major >= 2 && this_device_properties.minor >= 0){ // stores devices properties for later use device_properties = this_device_properties; max_multiprocessors = this_device_properties.multiProcessorCount; max_device = device_index; } } // sets the device gpuErrchk( cudaSetDevice(max_device) ); if (params.verbose){ printf("Device chosen is \"%s\"\n" "Device has %d multi processors and compute capability %d.%d\n" "Max threads per block supported are %d\n\n" , device_properties.name , device_properties.multiProcessorCount, device_properties.major, device_properties.minor , device_properties.maxThreadsPerBlock); } } int meanshift(double **original_points, double ***shifted_points, int deviation , parameters *opt){ static int iteration = 0; static double **mean_shift_vector, **kernel_matrix, *denominator; // allocates memory and copies original points on first iteration if (iteration == 0 || (*shifted_points) == NULL){ (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); // allocates memory for mean shift vector mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); // initializes elements of mean_shift_vector to inf for (int i=0;i opt->epsilon) { ++iteration; meanshift(original_points, shifted_points, deviation, opt); } if (iteration == 0){ // cleans up allocations free(mean_shift_vector[0]); free(mean_shift_vector); free(kernel_matrix[0]); free(kernel_matrix); free(denominator); } return iteration; } double norm(double **matrix, int rows, int cols){ double sum=0, temp_mul=0; for (int i=0; i>>(d_shifted_points, d_original_points , deviation, d_kernel_matrix); if (cudaGetLastError() != cudaSuccess){ --requested_block_size; } else { block_size_too_big = false; gpuErrchk( cudaDeviceSynchronize() ); } } while(block_size_too_big); if (first_iter && params.verbose){ printf("calculate_kernel_matrix_kernel called with:\n"); printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); first_iter = false; } size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements , size, cudaMemcpyDeviceToHost) ); gpuErrchk( cudaFree(d_shifted_points.elements) ); gpuErrchk( cudaFree(d_original_points.elements) ); gpuErrchk( cudaFree(d_kernel_matrix.elements) ); } void multiply(double **kernel_matrix, double **original_points, double ***new_shift){ static bool first_iter = true; // allocates memory for kernel_matrix in GPU and copies the array Matrix d_kernel_matrix; d_kernel_matrix.width = NUMBER_OF_POINTS; d_kernel_matrix.height = NUMBER_OF_POINTS; int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) ); gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]) , size, cudaMemcpyHostToDevice) ); // allocates memory for original_points in GPU and copies the array Matrix d_original_points; d_original_points.width = DIMENSIONS; d_original_points.height = NUMBER_OF_POINTS; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&d_original_points.elements, size) ); gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0]) , size, cudaMemcpyHostToDevice) ); // allocates memory for new_shift in GPU Matrix d_new_shift; d_new_shift.width = DIMENSIONS; d_new_shift.height = NUMBER_OF_POINTS; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&d_new_shift.elements, size) ); // get max sizes supported from the device int max_block_size = device_properties.maxThreadsPerBlock; dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size) , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size)); dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y); if (first_iter && params.verbose){ printf("multiply_kernel called with:\n"); printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); first_iter = false; } multiply_kernel<<>>(d_kernel_matrix, d_original_points, d_new_shift); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements , size, cudaMemcpyDeviceToHost) ); gpuErrchk( cudaFree(d_kernel_matrix.elements) ); gpuErrchk( cudaFree(d_original_points.elements) ); gpuErrchk( cudaFree(d_new_shift.elements) ); } double calculateDistance(double *y, double *x){ double sum = 0, dif; for (int i=0; i>>(d_denominator_matrix, d_kernel_matrix, T); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); size = NUMBER_OF_POINTS * sizeof(double); double * denominator = malloc(size); gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements ,size, cudaMemcpyDeviceToHost) ); gpuErrchk( cudaFree(d_kernel_matrix.elements) ); gpuErrchk( cudaFree(d_denominator_matrix.elements) ); return denominator; }