#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <sys/time.h>

#include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"

cudaDeviceProp device_properties;

struct timeval start_w_time, end_w_time;
double seq;

// Based on:
// https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
void set_GPU(){
    int devices_count = 0, max_multiprocessors = 0, max_device = 0;

    // gets the devices count, checking for errors like no devices or no driver available
    gpuErrchk( cudaGetDeviceCount(&devices_count) );
    for (int device_index = 0; device_index < devices_count; ++device_index){
        // gets current index device's properties
        cudaDeviceProp this_device_properties;
        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );

        // stores best available device's index
        // only devices with compute capability >= 2.0 are able to run the code
        if (max_multiprocessors < this_device_properties.multiProcessorCount
                && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
            // stores device's properties for later use
            device_properties = this_device_properties;
            max_multiprocessors = this_device_properties.multiProcessorCount;
            max_device = device_index;
        }
    }

    // sets the device
    gpuErrchk( cudaSetDevice(max_device) );
    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
            "Max threads per block supported are %d\n\n"
            , device_properties.name
            , device_properties.multiProcessorCount
            , device_properties.major, device_properties.minor
            , device_properties.maxThreadsPerBlock);
    }
}

int meanshift(double **original_points, double ***shifted_points, int deviation){
    // host variables
    int size = 0;
    static int recursion = 0;
    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
    bool is_first_recursion = false;

    // device variables
    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
        d_mean_shift_vector;
    Matrix d_new_shift;

    // allocates memory and copies original points on first recursion
    if (recursion == 0 || (*shifted_points) == NULL){
        is_first_recursion = true;

        // allocates memory for shifted points array and copies original points into it
        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);

        // allocates memory for mean shift vector
        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        // initializes elements of mean_shift_vector to inf
        for (int i = 0; i < NUMBER_OF_POINTS; i++){
            for (int j = 0; j < DIMENSIONS; j++){
                mean_shift_vector[i][j] = DBL_MAX;
            }
        }

        // allocates memory for the kernel matrix on the host
        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);

        // allocates the corresponding buffers on the device
        init_device_memory(original_points, *shifted_points, &d_original_points,
            &d_shifted_points, &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
    }

    // Per-iteration step (reconstructed sketch around the helpers defined later
    // in this file): build the kernel matrix and its row sums on the GPU, shift
    // every point, then check the mean shift vector's norm for convergence.

    // calculates the kernel (weight) matrix for the current point positions
    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix,
        deviation, &kernel_matrix, &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;

    // calculates the denominator of the mean-shift estimate (row sums of the kernel matrix)
    calculate_denominator(d_kernel_matrix, d_denominator);

    // allocates memory for the new point positions on host and device
    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );

    // shifts every point towards its local mean and updates the mean shift vector
    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift,
        d_denominator, d_mean_shift_vector, kernel_matrix, original_points,
        &new_shift, &mean_shift_vector, &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;

    // the new positions become the shifted points of the next iteration
    gpuErrchk( cudaMemcpy(d_shifted_points.elements, d_new_shift.elements, size,
        cudaMemcpyDeviceToDevice) );
    gpuErrchk( cudaFree(d_new_shift.elements) );
    free((*shifted_points)[0]);
    free(*shifted_points);
    (*shifted_points) = new_shift;

    // calculates the norm of the mean shift vector to check for convergence
    current_norm = 0;
    for (int i = 0; i < NUMBER_OF_POINTS; i++){
        for (int j = 0; j < DIMENSIONS; j++){
            current_norm += mean_shift_vector[i][j] * mean_shift_vector[i][j];
        }
    }
    current_norm = sqrt(current_norm);
    if (params.verbose){
        printf("Recursion %d, mean shift vector norm %f\n", recursion, current_norm);
    }

    // recurses until the mean shift vector's norm drops below epsilon
    if (current_norm > params.epsilon) {
        ++recursion;
        meanshift(original_points, shifted_points, deviation);
    }

    if (is_first_recursion){
        if (params.verbose){
            printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
        }

        // cleans up allocations
        free(mean_shift_vector[0]);
        free(mean_shift_vector);
        free(kernel_matrix[0]);
        free(kernel_matrix);
        free_device_memory(d_original_points, d_kernel_matrix, d_denominator,
            d_shifted_points);
    }

    return recursion;
}
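// The Matrix type used for every device-side buffer below comes from
// "meanshift_gpu_utils.h". Judging from how its fields are filled in
// init_device_memory(), its assumed layout is a plain row-major descriptor;
// the definition here is only a sketch of that assumption and is compiled out.
#if 0
typedef struct {
    int width;        // number of columns (DIMENSIONS for point arrays)
    int height;       // number of rows (NUMBER_OF_POINTS)
    double *elements; // contiguous row-major storage allocated with cudaMalloc
} Matrix;
#endif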
void init_device_memory(double **original_points, double **shifted_points,
        Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
        Matrix *d_denominator, Matrix *d_mean_shift_vector){
    int size;

    // allocates memory for original_points in GPU and copies the array
    d_original_points->width = DIMENSIONS;
    d_original_points->height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0]),
        size, cudaMemcpyHostToDevice) );

    // allocates memory for shifted_points in GPU and copies the array
    d_shifted_points->width = DIMENSIONS;
    d_shifted_points->height = NUMBER_OF_POINTS;
    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0]),
        size, cudaMemcpyHostToDevice) );

    // allocates memory for kernel_matrix in GPU
    d_kernel_matrix->width = NUMBER_OF_POINTS;
    d_kernel_matrix->height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );

    // allocates memory for denominator in GPU
    d_denominator->width = 1;
    d_denominator->height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );

    // allocates memory for mean_shift_vector in GPU
    d_mean_shift_vector->width = DIMENSIONS;
    d_mean_shift_vector->height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
}

void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
        Matrix d_kernel_matrix, double deviation, double ***kernel_matrix,
        double *w_memcpy_time){
    int size;
    static bool first_iter = true;
    // gets max block size supported by the device
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)sqrt(max_block_size);
    bool block_size_too_big = true;

    dim3 dimBlock;
    dim3 dimGrid;
    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = requested_block_size;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;

        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points,
            d_original_points, deviation, d_kernel_matrix);
        if (cudaGetLastError() != cudaSuccess){
            // launch failed (block too large for this device); retry with a smaller block
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while (block_size_too_big);

    if (first_iter && params.verbose){
        printf("calculate_kernel_matrix_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    // tic
    gettimeofday(&start_w_time, NULL);
    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements,
        size, cudaMemcpyDeviceToHost) );
    // toc
    gettimeofday(&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) / 1.0e6
        + end_w_time.tv_sec - start_w_time.tv_sec);
}

void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
    static bool first_iter = true;
    // gets max block size supported by the device
    static int requested_block_size = device_properties.maxThreadsPerBlock;
    bool block_size_too_big = true;

    dim3 dimBlock;
    dim3 dimGrid;
    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = 1;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;

        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
        if (cudaGetLastError() != cudaSuccess){
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while (block_size_too_big);

    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
}
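// For orientation, the device kernels launched above and shift_points() below
// together implement one mean-shift step. A minimal single-threaded sketch of
// that step is given here, assuming a Gaussian kernel truncated at the
// bandwidth (the actual weighting is whatever calculate_kernel_matrix_kernel
// computes, which is defined elsewhere). It is compiled out and only serves as
// a reference for what the GPU pipeline is expected to produce.
#if 0
static void mean_shift_step_reference(double **original_points, double **shifted_points,
        double **new_shift, double deviation){
    for (int i = 0; i < NUMBER_OF_POINTS; i++){
        double denominator = 0;
        for (int d = 0; d < DIMENSIONS; d++){
            new_shift[i][d] = 0;
        }
        for (int j = 0; j < NUMBER_OF_POINTS; j++){
            // squared distance between shifted point i and original point j
            double distance = 0;
            for (int d = 0; d < DIMENSIONS; d++){
                double diff = shifted_points[i][d] - original_points[j][d];
                distance += diff * diff;
            }
            // Gaussian weight, zero outside the bandwidth (assumption)
            double weight = 0;
            if (distance < deviation * deviation){
                weight = exp(-distance / (2 * deviation * deviation));
            }
            denominator += weight;
            for (int d = 0; d < DIMENSIONS; d++){
                new_shift[i][d] += weight * original_points[j][d];
            }
        }
        // weighted mean of the neighbours becomes the new position of point i
        for (int d = 0; d < DIMENSIONS; d++){
            new_shift[i][d] /= denominator;
        }
    }
}
#endif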
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
        Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector,
        double **kernel_matrix, double **original_points, double ***new_shift,
        double ***mean_shift_vector, double *w_memcpy_time){
    int size;
    static bool first_iter = true;
    // gets max block size supported by the device
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)(max_block_size / d_new_shift.width);
    bool block_size_too_big = true;

    dim3 dimBlock;
    dim3 dimGrid;
    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = d_new_shift.width;
        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;

        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix,
            d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
        if (cudaGetLastError() != cudaSuccess){
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while (block_size_too_big);

    if (first_iter && params.verbose){
        printf("shift_points_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    // tic
    gettimeofday(&start_w_time, NULL);
    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements,
        size, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements,
        size, cudaMemcpyDeviceToHost) );
    // toc
    gettimeofday(&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) / 1.0e6
        + end_w_time.tv_sec - start_w_time.tv_sec);
}

void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix,
        Matrix d_denominator, Matrix d_shifted_points){
    // frees all memory previously allocated in device
    gpuErrchk( cudaFree(d_original_points.elements) );
    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_denominator.elements) );
    gpuErrchk( cudaFree(d_shifted_points.elements) );
}
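// Hypothetical driver showing how this translation unit is meant to be used.
// The real entry point, argument parsing and point loading live elsewhere in
// the project; nothing below is part of this file's API, and the omitted steps
// are marked with comments rather than invented helper names.
#if 0
int main(int argc, char **argv){
    double **original_points;
    double **shifted_points = NULL;
    int deviation = 1;

    // ... parse arguments into params (epsilon, verbose, ...) ...

    set_GPU();  // pick the most capable device with compute capability >= 2.0

    original_points = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    // ... fill original_points from the input data set ...

    int recursions = meanshift(original_points, &shifted_points, deviation);
    printf("Converged after %d iterations\n", recursions);

    // meanshift() leaves the final positions in shifted_points for the caller
    free(shifted_points[0]);
    free(shifted_points);
    free(original_points[0]);
    free(original_points);
    return 0;
}
#endif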