diff --git a/mean_shift_cuda/Makefile b/mean_shift_cuda/Makefile
index b2fb2c8..6a10a8f 100644
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@@ -11,7 +11,7 @@ C_FLAGS = -lm -O3 -I.
 COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
 LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
 
-OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
+OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
 DEPS = meanshift_utils.h meanshift_kernels.h
 
 RM = rm -f
diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift
deleted file mode 100644
index dc21f7a..0000000
Binary files a/mean_shift_cuda/meanshift and /dev/null differ
diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu
index 778d081..dd1b2e5 100644
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@@ -3,6 +3,7 @@
 #include
 
 #include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
 
 int DEVIATION = 1;
 int NUMBER_OF_POINTS = 600;
diff --git a/mean_shift_cuda/meanshift_gpu_utils.cu b/mean_shift_cuda/meanshift_gpu_utils.cu
new file mode 100644
index 0000000..0c5c95e
--- /dev/null
+++ b/mean_shift_cuda/meanshift_gpu_utils.cu
@@ -0,0 +1,308 @@
+#include
+#include
+#include
+#include
+#include
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+cudaDeviceProp device_properties;
+
+//Based on https://stackoverflow.com/a/28113186
+//A more thorough reference: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
+void set_GPU(){
+    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
+
+    // gets devices count checking for errors like no devices or no drivers to check for
+    // devices available
+    gpuErrchk( cudaGetDeviceCount(&devices_count) );
+    for(int device_index = 0; device_index < devices_count; ++device_index){
+        // gets current index device's properties
+        cudaDeviceProp this_device_properties;
+        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
+
+        // stores best available device's index
+        // only devices with compute capability >= 2.0 are able to run the code
+        if (max_multiprocessors < this_device_properties.multiProcessorCount
+            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
+            // stores devices properties for later use
+            device_properties = this_device_properties;
+            max_multiprocessors = this_device_properties.multiProcessorCount;
+            max_device = device_index;
+        }
+    }
+    // sets the device
+    gpuErrchk( cudaSetDevice(max_device) );
+    if (params.verbose){
+        printf("Device chosen is \"%s\"\n"
+            "Device has %d multi processors and compute capability %d.%d\n"
+            "Max threads per block supported are %d\n\n"
+            , device_properties.name
+            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
+            , device_properties.maxThreadsPerBlock);
+    }
+}
+
+int meanshift(double **original_points, double ***shifted_points, int deviation
+    , parameters *opt){
+    // host variables
+    int size = 0;
+    static int iteration = 0;
+    static double **kernel_matrix, *denominator, **mean_shift_vector;
+    double **new_shift;
+
+    // device variables
+    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
+        d_mean_shift_vector;
+    Matrix d_new_shift;
+
+    // allocates memory and copies original points on first iteration
+    if (iteration == 0 || (*shifted_points) == NULL){
+        // allocates memory for shifted points array and copies original points into it
+        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
+
+        // allocates memory for mean shift vector
+        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // initializes elements of mean_shift_vector to inf
+        for (int i=0;i opt->epsilon) {
+        ++iteration;
+        meanshift(original_points, shifted_points, deviation, opt);
+    }
+
+    if (iteration == 0){
+        // cleans up allocations
+        free(mean_shift_vector[0]);
+        free(mean_shift_vector);
+        free(kernel_matrix[0]);
+        free(kernel_matrix);
+        free(denominator);
+
+        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_new_shift);
+    }
+
+    return iteration;
+}
+
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    Matrix *d_denominator, Matrix *d_mean_shift_vector){
+    int size;
+
+    // allocates memory for original_points in GPU and copies the array
+    d_original_points->width = DIMENSIONS;
+    d_original_points->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for shifted_points in GPU and copies the array
+    d_shifted_points->width = DIMENSIONS;
+    d_shifted_points->height = NUMBER_OF_POINTS;
+    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for kernel_matrix in GPU
+    d_kernel_matrix->width = NUMBER_OF_POINTS;
+    d_kernel_matrix->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
+
+    // allocates memory for denominator in GPU
+    d_denominator->width = 1;
+    d_denominator->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
+
+    // allocates memory for mean_shift_vector in GPU
+    d_mean_shift_vector->width = DIMENSIONS;
+    d_mean_shift_vector->height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
+}
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)sqrt(max_block_size);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = requested_block_size;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
+
+        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
+            , deviation, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("calculate_kernel_matrix_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
+        , size, cudaMemcpyDeviceToHost) );
+}
+
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int requested_block_size = device_properties.maxThreadsPerBlock;
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = 1;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("calculate_denominator called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements
+        , size, cudaMemcpyDeviceToHost) );
+}
+
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)(max_block_size / d_new_shift.width);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = d_new_shift.width;
+        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+            d_new_shift, d_denominator, d_mean_shift_vector);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("shift_points_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+        , size, cudaMemcpyDeviceToHost) );
+    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
+        , size, cudaMemcpyDeviceToHost) );
+}
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_new_shift){
+    // frees all memory previously allocated in device
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_denominator.elements) );
+    gpuErrchk( cudaFree(d_new_shift.elements) );
+}
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_gpu_utils.h b/mean_shift_cuda/meanshift_gpu_utils.h
new file mode 100644
index 0000000..b357bba
--- /dev/null
+++ b/mean_shift_cuda/meanshift_gpu_utils.h
@@ -0,0 +1,53 @@
+#ifndef SERIAL_GPU_UTILS_H /* Include guard */
+#define SERIAL_GPU_UTILS_H
+
+#include "meanshift_kernels.h"
+
+//GPU error check snippet taken from:
+//https://stackoverflow.com/a/14038590
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+    if (code != cudaSuccess){
+        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+        if (abort) exit(code);
+    }
+}
+
+/* Global variables */
+extern int DEVIATION;
+extern int NUMBER_OF_POINTS;
+extern int DIMENSIONS;
+extern char* POINTS_FILENAME;
+extern char* LABELS_FILENAME;
+extern parameters params;
+extern cudaDeviceProp device_properties;
+
+void set_GPU();
+
+//Function meanshift recursively shifts original points according to the
+//mean-shift algorithm saving the result to shifted_points. Struct opt has user
+//options, h is the desired deviation.
+int meanshift(double **original_points, double ***shifted_points, int h
+    , parameters *opt);
+
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points,
+    Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift);
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix);
+
+//Function shift_points launches the shift_points kernel to compute the new shift
+//and the mean-shift vector.
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector);
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_new_shift);
+
+//Function calculate_denominator launches the denominator kernel and copies the
+//result back to the host.
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator);
+
+#endif //SERIAL_GPU_UTILS_H
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
index 16b8710..04ff883 100644
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -1,28 +1,14 @@
 #include "meanshift_kernels.h"
 #include
 
-__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
-    // Each thread computes one element of output
-    // by accumulating results into cell_value
-    double cell_value = 0;
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix){
+    // each thread calculates one element of kernel_matrix
     int row = blockIdx.x * blockDim.x + threadIdx.x;
     int col = blockIdx.y * blockDim.y + threadIdx.y;
 
-    if (row + col < output.height * output.width){
-        for (int element_index = 0; element_index < matrix1.width; ++element_index){
-            cell_value += matrix1.elements[row * matrix1.width + element_index]
-                * matrix2.elements[element_index * matrix2.width + col];
-        }
-        output.elements[row * output.width + col] = cell_value;
-    }
-}
-
-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
-    , double deviation, Matrix kernel_matrix){
-    // Each thread calculates one element of kernel_matrix
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    int col = blockIdx.y * blockDim.y + threadIdx.y;
-    if (row * kernel_matrix.width + col > kernel_matrix.width * kernel_matrix.height){
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){
         return;
     }
 
@@ -48,17 +34,48 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
     }
 }
 
-__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
-
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
+    Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
+    // each thread computes one element of new_shift
+    // by accumulating results into cell_value
+    double cell_value = 0;
     int row = blockIdx.x * blockDim.x + threadIdx.x;
     int col = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (row * denominator.width + col > denominator.width * denominator.height){
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row * new_shift.width + col >= new_shift.width * new_shift.height){
         return;
     }
 
-    denominator.elements[col]=0;
-    denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col];
+    // calculates new_shift
+    // builds numerator by multiplying kernel_matrix and original_points
+    for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){
+        cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index]
+            * original_points.elements[element_index * original_points.width + col];
+    }
+    // new_shift elements are calculated by dividing with the denominator
+    new_shift.elements[row * new_shift.width + col] =
+        cell_value / denominator.elements[row];
+
+    // calculates mean-shift vector
+    mean_shift_vector.elements[row * new_shift.width + col] =
+        new_shift.elements[row * new_shift.width + col] -
+        shifted_points.elements[row * new_shift.width + col];
+}
+
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
+    // each thread computes one element of denominator
+    // by accumulating results into cell_value
+    double cell_value = 0;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row >= denominator.height){
+        return;
+    }
+    for (int column = 0; column < kernel_matrix.width; ++column){
+        cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
+    }
+    denominator.elements[row] = cell_value;
 }
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_kernels.h b/mean_shift_cuda/meanshift_kernels.h
index 0f9398d..d33641e 100644
--- a/mean_shift_cuda/meanshift_kernels.h
+++ b/mean_shift_cuda/meanshift_kernels.h
@@ -1,17 +1,18 @@
 #ifndef SERIAL_KERNELS_H /* Include guard */
 #define SERIAL_KERNELS_H
 
-typedef struct{
+typedef struct {
     int width;
     int height;
     double *elements;
 } Matrix;
 
-//Function multiply_kernel calculates the product of matrices 1 and 2 into output.
-__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix);
 
-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
-    , double deviation, Matrix kernel_matrix);
+//Function shift_points_kernel multiplies kernel_matrix by original_points, divides by
+//denominator and produces new_shift and the mean-shift vector.
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
+    Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
 
 __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
 
diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu
index f636dc7..60b6cf7 100644
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@@ -5,12 +5,10 @@
 #include
 
 #include "meanshift_utils.h"
-#include "meanshift_kernels.h"
+#include "meanshift_gpu_utils.h"
 
 #define OUTPUT_PREFIX "../output/output_"
 
-cudaDeviceProp device_properties;
-
 void get_args(int argc, char **argv, parameters *params){
     if (argc < 7) {
         printf("Usage: %s h e N D Pd Pl\nwhere:\n"
@@ -61,7 +59,7 @@ void get_args(int argc, char **argv, parameters *params){
 void init(double ***vectors, char **labels){
     int bytes_read = 0;
 
-    set_Gpu();
+    set_GPU();
 
     if (params.verbose){
         printf("Reading dataset and labels...\n");
@@ -72,7 +70,7 @@ void init(double ***vectors, char **labels){
     points_file = fopen(POINTS_FILENAME, "rb");
     if (points_file != NULL){
         // allocates memory for the array
-        (*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+        (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
         // reads vectors dataset from file
         for (int i=0; i= 2.0 are able to run the code
-        if (max_multiprocessors < this_device_properties.multiProcessorCount
-            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
-            // stores devices properties for later use
-            device_properties = this_device_properties;
-            max_multiprocessors = this_device_properties.multiProcessorCount;
-            max_device = device_index;
-        }
-    }
-    // sets the device
-    gpuErrchk( cudaSetDevice(max_device) );
-    if (params.verbose){
-        printf("Device chosen is \"%s\"\n"
-            "Device has %d multi processors and compute capability %d.%d\n"
-            "Max threads per block supported are %d\n\n"
-            , device_properties.name
-            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
-            , device_properties.maxThreadsPerBlock);
-    }
-}
-
-int meanshift(double 
**original_points, double ***shifted_points, int deviation - , parameters *opt){ - static int iteration = 0; - static double **mean_shift_vector, **kernel_matrix, *denominator; - - // allocates memory and copies original points on first iteration - if (iteration == 0 || (*shifted_points) == NULL){ - (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); - duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); - - // allocates memory for mean shift vector - mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); - // initializes elements of mean_shift_vector to inf - for (int i=0;i opt->epsilon) { - ++iteration; - meanshift(original_points, shifted_points, deviation, opt); - } - - if (iteration == 0){ - // cleans up allocations - free(mean_shift_vector[0]); - free(mean_shift_vector); - free(kernel_matrix[0]); - free(kernel_matrix); - free(denominator); - } - - return iteration; -} - - +// TODO check why there's is a difference in the norm calculate in matlab double norm(double **matrix, int rows, int cols){ double sum=0, temp_mul=0; for (int i=0; i>>(d_shifted_points, d_original_points - , deviation, d_kernel_matrix); - if (cudaGetLastError() != cudaSuccess){ - --requested_block_size; - } else { - block_size_too_big = false; - gpuErrchk( cudaDeviceSynchronize() ); - } - } while(block_size_too_big); - - if (first_iter && params.verbose){ - printf("calculate_kernel_matrix_kernel called with:\n"); - printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); - printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); - first_iter = false; - } - - size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); - gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements - , size, cudaMemcpyDeviceToHost) ); - - gpuErrchk( cudaFree(d_shifted_points.elements) ); - gpuErrchk( cudaFree(d_original_points.elements) ); - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); -} - - -void multiply(double **kernel_matrix, double **original_points, double ***new_shift){ - static bool first_iter = true; - - // allocates memory for kernel_matrix in GPU and copies the array - Matrix d_kernel_matrix; - d_kernel_matrix.width = NUMBER_OF_POINTS; - d_kernel_matrix.height = NUMBER_OF_POINTS; - int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); - gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) ); - gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]) - , size, cudaMemcpyHostToDevice) ); - - // allocates memory for original_points in GPU and copies the array - Matrix d_original_points; - d_original_points.width = DIMENSIONS; - d_original_points.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMalloc(&d_original_points.elements, size) ); - gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0]) - , size, cudaMemcpyHostToDevice) ); - - // allocates memory for new_shift in GPU - Matrix d_new_shift; - d_new_shift.width = DIMENSIONS; - d_new_shift.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMalloc(&d_new_shift.elements, size) ); - - // get max sizes supported from the device - int max_block_size = device_properties.maxThreadsPerBlock; - dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size) - , (d_new_shift.width < sqrt(max_block_size)) ? 
d_new_shift.width : sqrt(max_block_size)); - dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x - , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y); - - if (first_iter && params.verbose){ - printf("multiply_kernel called with:\n"); - printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); - printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); - first_iter = false; - } - - multiply_kernel<<>>(d_kernel_matrix, d_original_points, d_new_shift); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements - , size, cudaMemcpyDeviceToHost) ); - - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); - gpuErrchk( cudaFree(d_original_points.elements) ); - gpuErrchk( cudaFree(d_new_shift.elements) ); -} - -double calculateDistance(double *y, double *x){ - double sum = 0, dif; - for (int i=0; i>>(d_denominator_matrix, d_kernel_matrix); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - - size = NUMBER_OF_POINTS * sizeof(double); - double ** denominator = (double**)malloc(size); - gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements - ,size, cudaMemcpyDeviceToHost) ); - - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); - gpuErrchk( cudaFree(d_denominator_matrix.elements) ); - - return (*denominator); } \ No newline at end of file diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h index fbf4394..ce6c0cd 100644 --- a/mean_shift_cuda/meanshift_utils.h +++ b/mean_shift_cuda/meanshift_utils.h @@ -3,16 +3,6 @@ #include -//GPU error check snippet taken from: -//https://stackoverflow.com/a/14038590 -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ - if (code != cudaSuccess){ - fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - /* Structs */ typedef struct parameters { double epsilon; @@ -20,45 +10,17 @@ typedef struct parameters { bool display; } parameters; -/* Global variables */ -extern int DEVIATION; -extern int NUMBER_OF_POINTS; -extern int DIMENSIONS; -extern char* POINTS_FILENAME; -extern char* LABELS_FILENAME; -extern parameters params; -extern cudaDeviceProp device_properties; - //Function get_args parses command line arguments. void get_args(int argc, char **argv, parameters *params); //Function init reads the dataset and label arrays from the corresponding files. void init(double ***vectors, char **labels); -void set_Gpu(); - -//Function meanshift recursively shifts original points according to th -//mean-shift algorithm saving the result to shiftedPoints. Struct opt has user -//options, h is the desirable deviation. -int meanshift(double **original_points, double ***shifted_points, int h - , parameters *opt); - //Function norm returns the second norm of matrix of dimensions rowsXcols. double norm(double **matrix, int rows, int cols); -void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation - , double ***kernel_matrix); - -//Function multiply allocates memory in GPU, sends the data and calls the -//multiply kernel function. -void multiply(double **kernel_matrix, double **original_points - , double ***new_shift); - -//Function calculateDistance returns the distance between x and y vectors. 
-double calculateDistance(double *y, double *x); - -//Function alloc_2d_double allocates rows*cols bytes of continuous memory. -double **alloc_2d_double(int rows, int cols); +//Function alloc_double allocates rows*cols bytes of continuous memory. +double **alloc_double(int rows, int cols); //Function duplicate copies the values of source array to dest array. void duplicate(double **source, int rows, int cols, double ***dest); @@ -68,11 +30,6 @@ void print_matrix(double **array, int rows, int cols); //Function save_matrix prints matrix in a csv file with path/filename //"output/output_iteration". If a file already exists new lines are concatenated. -void save_matrix(double **matrix - , int iteration); - -//Function calculate_denominator allocates memory in GPU, sends the data and calls the -//denominator kernel function. -double * calculate_denominator(double **kernel_matrix); +void save_matrix(double **matrix, int iteration); #endif //SERIAL_UTILS_H \ No newline at end of file
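
Note: the three host-side wrappers added in meanshift_gpu_utils.cu (calculate_kernel_matrix, calculate_denominator and shift_points) share one launch strategy: start from the largest block size the chosen device reports, launch, and shrink the block whenever cudaGetLastError() reports a failed launch. The standalone CUDA sketch below illustrates only that retry pattern; the kernel name row_sum_kernel, its arguments and the flat row-major layout are illustrative stand-ins, not code from this patch.

#include <cuda_runtime.h>

// Stand-in kernel: sums each row of a rows x cols matrix, one thread per row
// (the same shape of work denominator_kernel does on the kernel matrix).
__global__ void row_sum_kernel(const double *in, double *out, int rows, int cols){
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) return;
    double sum = 0;
    for (int col = 0; col < cols; ++col)
        sum += in[row * cols + col];
    out[row] = sum;
}

// Launch helper mirroring the shrink-on-failure loop used by the new wrappers:
// try the device maximum, decrement until the launch is accepted.
void launch_row_sums(const double *d_in, double *d_out, int rows, int cols){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);      // device 0 assumed for the sketch
    int block = prop.maxThreadsPerBlock;    // first attempt: device maximum

    bool block_too_big = true;
    while (block_too_big && block > 0){
        dim3 dimBlock(block);
        dim3 dimGrid((rows + block - 1) / block);
        row_sum_kernel<<<dimGrid, dimBlock>>>(d_in, d_out, rows, cols);
        if (cudaGetLastError() != cudaSuccess){
            --block;                        // launch rejected: retry smaller
        } else {
            block_too_big = false;
            cudaDeviceSynchronize();        // wait before copying results back
        }
    }
}

The patch decrements the requested block size by one on each failure; halving it would converge faster, but the sketch keeps the patch's behaviour.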