diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift
new file mode 100644
index 0000000..dc21f7a
Binary files /dev/null and b/mean_shift_cuda/meanshift differ
diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
index 533b928..16b8710 100644
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -1,8 +1,24 @@
 #include "meanshift_kernels.h"
 #include 
 
-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
-        double deviation, Matrix kernel_matrix){
+__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
+    // Each thread computes one element of output
+    // by accumulating results into cell_value
+    double cell_value = 0;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (row + col < output.height * output.width){
+        for (int element_index = 0; element_index < matrix1.width; ++element_index){
+            cell_value += matrix1.elements[row * matrix1.width + element_index]
+                * matrix2.elements[element_index * matrix2.width + col];
+        }
+        output.elements[row * output.width + col] = cell_value;
+    }
+}
+
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
+    , double deviation, Matrix kernel_matrix){
     // Each thread calculates one element of kernel_matrix
     int row = blockIdx.x * blockDim.x + threadIdx.x;
     int col = blockIdx.y * blockDim.y + threadIdx.y;
@@ -32,44 +48,17 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
     }
 }
 
-__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
-        Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
-    // Each thread computes one element of new_shift
-    // by accumulating results into cell_value
-    double cell_value = 0;
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
     int col = blockIdx.y * blockDim.y + threadIdx.y;
 
-    // performs calculations only if indexes are within matrix bounds
-    //if (row + col < new_shift.height * new_shift.width){
-    if (row < new_shift.height){
-        // calculates new_shift
-        // builds nominator by multiplying kernel_matrix and original_points
-        for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){
-            cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index]
-                * original_points.elements[element_index * original_points.width + col];
-        }
-        // new_shift elements are calculated by dividing with the denominator
-        new_shift.elements[row * new_shift.width + col] =
-            cell_value / denominator.elements[row];
-        // calculates mean-shift vector
-        mean_shift_vector.elements[row * new_shift.width + col] =
-            new_shift.elements[row * new_shift.width + col] -
-            shifted_points.elements[row * new_shift.width + col];
+    if (row * denominator.width + col > denominator.width * denominator.height){
+        return;
     }
-}
 
-__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
-    // Each thread computes one element of denominator_kernel
-    // by accumulating results into cell_value
-    double cell_value = 0;
-    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    denominator.elements[col]=0;
+    denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col];
 
-    if (row < denominator.height){
-        for (int column = 0; column < kernel_matrix.width; ++column){
-            cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
-        }
-        denominator.elements[row] = cell_value;
-    }
 }
\ No newline at end of file
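A note on the new denominator_kernel above: every thread that shares a row performs an unsynchronized += on the same denominator.elements[row], while other threads may be zeroing denominator.elements[col] at the same time, so the accumulation can race. Below is a minimal race-free sketch that keeps the one-thread-per-row structure of the removed kernel; denominator_rowsum_kernel is a hypothetical name and Matrix is the struct from meanshift_kernels.h, so this is not code from the patch itself.

    // Sketch only: one thread per row sums that row of kernel_matrix into
    // denominator, so no two threads ever write the same output element.
    __global__ void denominator_rowsum_kernel(Matrix denominator, Matrix kernel_matrix){
        int row = blockIdx.x * blockDim.x + threadIdx.x;
        if (row >= kernel_matrix.height){
            return;
        }

        double sum = 0;
        for (int col = 0; col < kernel_matrix.width; ++col){
            sum += kernel_matrix.elements[row * kernel_matrix.width + col];
        }
        denominator.elements[row] = sum;
    }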
diff --git a/mean_shift_cuda/meanshift_kernels.h b/mean_shift_cuda/meanshift_kernels.h
index d33641e..0f9398d 100644
--- a/mean_shift_cuda/meanshift_kernels.h
+++ b/mean_shift_cuda/meanshift_kernels.h
@@ -1,18 +1,17 @@
 #ifndef SERIAL_KERNELS_H /* Include guard */
 #define SERIAL_KERNELS_H
 
-typedef struct {
+typedef struct{
     int width;
    int height;
     double *elements;
 } Matrix;
 
-__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
-        double deviation, Matrix kernel_matrix);
-
 //Function multiply_kernel calculates the product of matrices 1 and 2 into output.
-__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
-        Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
+__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
+
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
+    , double deviation, Matrix kernel_matrix);
 
 __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
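On the bounds check in multiply_kernel (declared in the header above and defined in meanshift_kernels.cu): the condition row + col < output.height * output.width still lets a thread with, say, row equal to output.height and col equal to 0 fall through, so out-of-range writes are possible whenever the grid overshoots the matrix. The sketch below shows the more common per-coordinate guard under the same row-major Matrix layout; guarded_multiply_kernel is a hypothetical name, not part of the patch.

    __global__ void guarded_multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
        int row = blockIdx.x * blockDim.x + threadIdx.x;
        int col = blockIdx.y * blockDim.y + threadIdx.y;

        // threads that map to no output element simply exit
        if (row >= output.height || col >= output.width){
            return;
        }

        double cell_value = 0;
        for (int k = 0; k < matrix1.width; ++k){
            cell_value += matrix1.elements[row * matrix1.width + k]
                * matrix2.elements[k * matrix2.width + col];
        }
        output.elements[row * output.width + col] = cell_value;
    }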
diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu
index e602ff6..f636dc7 100644
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@@ -5,6 +5,7 @@
 #include 
 
 #include "meanshift_utils.h"
+#include "meanshift_kernels.h"
 
 #define OUTPUT_PREFIX "../output/output_"
 
@@ -60,7 +61,7 @@ void get_args(int argc, char **argv, parameters *params){
 void init(double ***vectors, char **labels){
     int bytes_read = 0;
 
-    set_GPU();
+    set_Gpu();
 
     if (params.verbose){
         printf("Reading dataset and labels...\n");
@@ -126,7 +127,7 @@ void init(double ***vectors, char **labels){
 
 //Based on https://stackoverflow.com/a/28113186
 //Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
-void set_GPU(){
+void set_Gpu(){
     int devices_count = 0, max_multiprocessors = 0, max_device = 0;
 
     // gets devices count checking for errors like no devices or no drivers to check for
@@ -161,20 +162,11 @@ void set_GPU(){
 
 int meanshift(double **original_points, double ***shifted_points, int deviation
     , parameters *opt){
-    // host variables
-    int size = 0;
     static int iteration = 0;
-    static double **kernel_matrix, *denominator, **mean_shift_vector;
-    double **new_shift;
-
-    // device variables
-    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
-        d_mean_shift_vector;
-    Matrix d_new_shift;
+    static double **mean_shift_vector, **kernel_matrix, *denominator;
 
     // allocates memory and copies original points on first iteration
     if (iteration == 0 || (*shifted_points) == NULL){
-        // allocates memory for shifted points array and copies original points into it
         (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
         duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
 
@@ -190,41 +182,42 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
         // allocates memory for other arrays needed
         kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
         denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
-
-        // allocates corresponding memory in device
-        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
-            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
     }
 
+    // TODO move arrays to device and create global kernel for the iteration
     // finds pairwise distance matrix (inside radius)
     // [I, D] = rangesearch(x,y,h);
-    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
-        &kernel_matrix);
-
-    // calculates denominator
-    calculate_denominator(d_kernel_matrix, d_denominator, &denominator);
-
-    size = NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0])
-        , size, cudaMemcpyHostToDevice) );
+    calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
+
+//    // calculate denominator
+//    for (int i=0; i
[...]
-    d_original_points->width = DIMENSIONS;
-    d_original_points->height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
-    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+    // allocates memory for shifted_points in GPU and copies the array
+    Matrix d_shifted_points;
+    d_shifted_points.width = DIMENSIONS;
+    d_shifted_points.height = NUMBER_OF_POINTS;
+    int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
+    gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
         , size, cudaMemcpyHostToDevice) );
 
-    // allocates memory for shifted_points in GPU and copies the array
-    d_shifted_points->width = DIMENSIONS;
-    d_shifted_points->height = NUMBER_OF_POINTS;
-    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
-    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+    // allocates memory for original_points in GPU and copies the array
+    Matrix d_original_points;
+    d_original_points.width = DIMENSIONS;
+    d_original_points.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
+    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
         , size, cudaMemcpyHostToDevice) );
 
     // allocates memory for kernel_matrix in GPU
-    d_kernel_matrix->width = NUMBER_OF_POINTS;
-    d_kernel_matrix->height = NUMBER_OF_POINTS;
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
     size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
-
-    // allocates memory for denominator in GPU
-    d_denominator->width = 1;
-    d_denominator->height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
-
-    // allocates memory for mean_shift_vector in GPU
-    d_mean_shift_vector->width = DIMENSIONS;
-    d_mean_shift_vector->height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
-}
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
 
-void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
-    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){
-    int size;
-    static bool first_iter = true;
-    // gets max block size supported from the device
-    static int max_block_size = device_properties.maxThreadsPerBlock;
-    static int requested_block_size = (int)sqrt(max_block_size);
+    // get max sizes supported from the device
+    int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
+    int requested_block_size = max_block_size;
     bool block_size_too_big = true;
 
     dim3 dimBlock;
     dim3 dimGrid;
+
     do {
         dimBlock.x = requested_block_size;
         dimBlock.y = requested_block_size;
@@ -346,93 +323,65 @@ void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
     size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
     gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
         , size, cudaMemcpyDeviceToHost) );
-}
 
-void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){
-    int size;
-    static bool first_iter = true;
-    // gets max block size supported from the device
-    static int requested_block_size = device_properties.maxThreadsPerBlock;
-    bool block_size_too_big = true;
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+}
 
-    dim3 dimBlock;
-    dim3 dimGrid;
-    do {
-        dimBlock.x = requested_block_size;
-        dimBlock.y = 1;
-        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
-        dimGrid.y = 1;
-        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
-        if (cudaGetLastError() != cudaSuccess){
-            --requested_block_size;
-        } else {
-            block_size_too_big = false;
-            gpuErrchk( cudaDeviceSynchronize() );
-        }
-    } while(block_size_too_big);
-
-    if (first_iter && params.verbose){
-        printf("calculate_denominator called with:\n");
-        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
-        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-        first_iter = false;
-    }
+void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
+    static bool first_iter = true;
 
-    size = NUMBER_OF_POINTS * sizeof(double);
-    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements
         , size, cudaMemcpyDeviceToHost) );
-}
+    // allocates memory for kernel_matrix in GPU and copies the array
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
+    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
        , size, cudaMemcpyHostToDevice) );
 
-void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
-    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
-    double **original_points, double ***new_shift, double ***mean_shift_vector){
-    int size;
-    static bool first_iter = true;
-    // gets max block size supported from the device
-    static int max_block_size = device_properties.maxThreadsPerBlock;
-    static int requested_block_size = (int)(max_block_size / 2);
-    bool block_size_too_big = true;
+    // allocates memory for original_points in GPU and copies the array
+    Matrix d_original_points;
+    d_original_points.width = DIMENSIONS;
+    d_original_points.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
+    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
        , size, cudaMemcpyHostToDevice) );
 
-    dim3 dimBlock;
-    dim3 dimGrid;
-    do {
-        dimBlock.x = requested_block_size;
-        dimBlock.y = 2;
-        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
-        dimGrid.y = 1;
+    // allocates memory for new_shift in GPU
+    Matrix d_new_shift;
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
 
-        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
-            d_new_shift, d_denominator, d_mean_shift_vector);
-        if (cudaGetLastError() != cudaSuccess){
-            --requested_block_size;
-        } else {
-            block_size_too_big = false;
-            gpuErrchk( cudaDeviceSynchronize() );
-        }
-    } while(block_size_too_big);
+    // get max sizes supported from the device
+    int max_block_size = device_properties.maxThreadsPerBlock;
+    dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
+        , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
+    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
+        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
 
     if (first_iter && params.verbose){
-        printf("shift_points_kernel called with:\n");
+        printf("multiply_kernel called with:\n");
         printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
         printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
         first_iter = false;
     }
 
+    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
     size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
     gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
         , size, cudaMemcpyDeviceToHost) );
-    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
         , size, cudaMemcpyDeviceToHost) );
-}
 
-void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
-    Matrix d_new_shift){
-    // frees all memory previously allocated in device
-    gpuErrchk( cudaFree(d_original_points.elements) );
     gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-    //gpuErrchk( cudaFree(d_shifted_points.elements) );
-    gpuErrchk( cudaFree(d_denominator.elements) );
+    gpuErrchk( cudaFree(d_original_points.elements) );
     gpuErrchk( cudaFree(d_new_shift.elements) );
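The block dimensions in multiply above (and in calculate_denominator further down) are chosen by comparing an int against sqrt(max_block_size), which is a double, before the result is narrowed back into a dim3. A small sketch of the same clamp done in integer arithmetic follows; clamped_block is a hypothetical helper, not something the patch defines.

    #include <math.h>

    // Sketch: largest square block that fits within maxThreadsPerBlock, clamped
    // to the matrix extents (e.g. 1024 threads per block -> at most 32 x 32).
    static dim3 clamped_block(int height, int width, int max_threads_per_block){
        int side = (int)sqrt((double)max_threads_per_block);
        int x = (height < side) ? height : side;
        int y = (width < side) ? width : side;
        return dim3(x, y);
    }

A call such as clamped_block(NUMBER_OF_POINTS, DIMENSIONS, device_properties.maxThreadsPerBlock) would then feed the usual (extent + block - 1) / block grid computation used in the patch.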
@@ -485,4 +434,53 @@ void save_matrix(double **matrix, int iteration){
         }
         fprintf(file, "\n");
     }
+}
+
+double * calculate_denominator(double **kernel_matrix){
+    static bool first_iter = true;
+
+    // allocates memory for denominator_matrix in GPU
+    Matrix d_denominator_matrix;
+    d_denominator_matrix.width = NUMBER_OF_POINTS;
+    d_denominator_matrix.height = 1;
+    int size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
+
+    // allocates memory for kernel_matrix in GPU and copies the array
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
+    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // get max sizes supported from the device
+    int max_block_size = device_properties.maxThreadsPerBlock;
+    dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
+        , (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
+    dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
+        , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
+
+    if (first_iter && params.verbose){
+        printf("calculate_denominator called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+
+    size = NUMBER_OF_POINTS * sizeof(double);
+    double ** denominator = (double**)malloc(size);
+    gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
+        ,size, cudaMemcpyDeviceToHost) );
+
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_denominator_matrix.elements) );
+
+    return (*denominator);
 }
\ No newline at end of file
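One detail in the new calculate_denominator above: the host buffer is allocated as double ** denominator = (double**)malloc(size), and *denominator is then dereferenced as the cudaMemcpy destination before any host array has been stored in it. Since the function is declared in meanshift_utils.h to return double *, the copy-back would normally use a flat buffer. A sketch of that tail is shown below; host_denominator is a hypothetical name, and the snippet reuses the function's existing size, d_denominator_matrix, d_kernel_matrix, gpuErrchk and NUMBER_OF_POINTS.

    // Sketch only: copy the device row sums straight into a flat host array
    size = NUMBER_OF_POINTS * sizeof(double);
    double *host_denominator = (double *)malloc(size);
    gpuErrchk( cudaMemcpy(host_denominator, d_denominator_matrix.elements
        , size, cudaMemcpyDeviceToHost) );

    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_denominator_matrix.elements) );

    return host_denominator;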
diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h
index 0232116..fbf4394 100644
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@@ -2,7 +2,6 @@
 #define SERIAL_UTILS_H
 
 #include 
-#include "meanshift_kernels.h"
 
 //GPU error check snippet taken from:
 //https://stackoverflow.com/a/14038590
@@ -36,7 +35,7 @@ void get_args(int argc, char **argv, parameters *params);
 //Function init reads the dataset and label arrays from the corresponding files.
 void init(double ***vectors, char **labels);
 
-void set_GPU();
+void set_Gpu();
 
 //Function meanshift recursively shifts original points according to th
 //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
@@ -44,24 +43,16 @@ void set_GPU();
 int meanshift(double **original_points, double ***shifted_points, int h
     , parameters *opt);
 
-void init_device_memory(double **original_points, double **shifted_points,
-    Matrix *d_original_points, Matrix *d_shifted_points,
-    Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift);
-
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);
 
-void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
-    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix);
+void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
+    , double ***kernel_matrix);
 
 //Function multiply allocates memory in GPU, sends the data and calls the
 //multiply kernel function.
-void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
-    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
-    double **original_points, double ***new_shift, double ***mean_shift_vector);
-
-void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
-    Matrix d_new_shift);
+void multiply(double **kernel_matrix, double **original_points
+    , double ***new_shift);
 
 //Function calculateDistance returns the distance between x and y vectors.
 double calculateDistance(double *y, double *x);
@@ -82,6 +73,6 @@ void save_matrix(double **matrix
 
 //Function calculate_denominator allocates memory in GPU, sends the data and calls the
 //denominator kernel function.
-void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator);
+double * calculate_denominator(double **kernel_matrix);
 
 #endif //SERIAL_UTILS_H
\ No newline at end of file
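For reference, the gpuErrchk( ... ) wrapper used throughout meanshift_utils.cu is declared in this header, and its comment credits https://stackoverflow.com/a/14038590. That answer's pattern is roughly the following; the exact macro body in the repository may differ.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
        if (code != cudaSuccess){
            // report the CUDA error string plus the call site, optionally aborting
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }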