diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift deleted file mode 100644 index dc21f7a..0000000 Binary files a/mean_shift_cuda/meanshift and /dev/null differ diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu index 16b8710..3407613 100644 --- a/mean_shift_cuda/meanshift_kernels.cu +++ b/mean_shift_cuda/meanshift_kernels.cu @@ -1,24 +1,8 @@ #include "meanshift_kernels.h" #include -__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){ - // Each thread computes one element of output - // by accumulating results into cell_value - double cell_value = 0; - int row = blockIdx.x * blockDim.x + threadIdx.x; - int col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row + col < output.height * output.width){ - for (int element_index = 0; element_index < matrix1.width; ++element_index){ - cell_value += matrix1.elements[row * matrix1.width + element_index] - * matrix2.elements[element_index * matrix2.width + col]; - } - output.elements[row * output.width + col] = cell_value; - } -} - -__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points - , double deviation, Matrix kernel_matrix){ +__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points, + double deviation, Matrix kernel_matrix){ // Each thread calculates one element of kernel_matrix int row = blockIdx.x * blockDim.x + threadIdx.x; int col = blockIdx.y * blockDim.y + threadIdx.y; @@ -48,17 +32,43 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori } } +__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points, + Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){ + // Each thread computes one element of new_shift + // by accumulating results into cell_value + double cell_value = 0; + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + + // performs calculations only if indexes are within matrix bounds + //if (row + col < new_shift.height * new_shift.width){ + if (row < new_shift.height){ + // calculates new_shift + // builds nominator by multiplying kernel_matrix and original_points + for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){ + cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index] + * original_points.elements[element_index * original_points.width + col]; + } + // new_shift elements are calculated by dividing with the denominator + new_shift.elements[row * new_shift.width + col] = + cell_value / denominator.elements[row]; + + // calculates mean-shift vector + mean_shift_vector.elements[row * new_shift.width + col] = + new_shift.elements[row * new_shift.width + col] - + shifted_points.elements[row * new_shift.width + col]; + } +} + __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ int row = blockIdx.x * blockDim.x + threadIdx.x; int col = blockIdx.y * blockDim.y + threadIdx.y; - if (row * denominator.width + col > denominator.width * denominator.height){ return; } denominator.elements[col]=0; denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col]; - } \ No newline at end of file diff --git a/mean_shift_cuda/meanshift_kernels.h b/mean_shift_cuda/meanshift_kernels.h index 0f9398d..d33641e 100644 --- a/mean_shift_cuda/meanshift_kernels.h +++ b/mean_shift_cuda/meanshift_kernels.h @@ -1,17 +1,18 @@ #ifndef SERIAL_KERNELS_H /* Include guard */ #define SERIAL_KERNELS_H -typedef struct{ +typedef struct { int width; int height; double *elements; } Matrix; -//Function multiply_kernel calculates the product of matrices 1 and 2 into output. -__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output); +__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points, + double deviation, Matrix kernel_matrix); -__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points - , double deviation, Matrix kernel_matrix); +//Function multiply_kernel calculates the product of matrices 1 and 2 into output. +__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points, + Matrix new_shift, Matrix denominator, Matrix mean_shift_vector); __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix); diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu index f636dc7..ce08ad3 100644 --- a/mean_shift_cuda/meanshift_utils.cu +++ b/mean_shift_cuda/meanshift_utils.cu @@ -5,7 +5,6 @@ #include #include "meanshift_utils.h" -#include "meanshift_kernels.h" #define OUTPUT_PREFIX "../output/output_" @@ -61,7 +60,7 @@ void get_args(int argc, char **argv, parameters *params){ void init(double ***vectors, char **labels){ int bytes_read = 0; - set_Gpu(); + set_GPU(); if (params.verbose){ printf("Reading dataset and labels...\n"); @@ -127,7 +126,7 @@ void init(double ***vectors, char **labels){ //Based on https://stackoverflow.com/a/28113186 //Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU -void set_Gpu(){ +void set_GPU(){ int devices_count = 0, max_multiprocessors = 0, max_device = 0; // gets devices count checking for errors like no devices or no drivers to check for @@ -162,11 +161,20 @@ void set_Gpu(){ int meanshift(double **original_points, double ***shifted_points, int deviation , parameters *opt){ + // host variables + int size = 0; static int iteration = 0; - static double **mean_shift_vector, **kernel_matrix, *denominator; + static double **kernel_matrix, *denominator, **mean_shift_vector; + double **new_shift; + + // device variables + static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, + d_mean_shift_vector; + Matrix d_new_shift; // allocates memory and copies original points on first iteration if (iteration == 0 || (*shifted_points) == NULL){ + // allocates memory for shifted points array and copies original points into it (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); @@ -182,42 +190,49 @@ int meanshift(double **original_points, double ***shifted_points, int deviation // allocates memory for other arrays needed kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS); denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double)); + + // allocates corresponding memory in device + init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, + &d_kernel_matrix, &d_denominator, &d_mean_shift_vector); } - // TODO move arrays to device and create global kernel for the iteration // finds pairwise distance matrix (inside radius) // [I, D] = rangesearch(x,y,h); - calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix); - -// // calculate denominator -// for (int i=0; iwidth = DIMENSIONS; + d_original_points->height = NUMBER_OF_POINTS; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMalloc(&d_original_points.elements, size) ); - gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0]) + gpuErrchk( cudaMalloc(&(d_original_points->elements), size) ); + gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0]) + , size, cudaMemcpyHostToDevice) ); + + // allocates memory for shifted_points in GPU and copies the array + d_shifted_points->width = DIMENSIONS; + d_shifted_points->height = NUMBER_OF_POINTS; + size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double); + gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) ); + gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0]) , size, cudaMemcpyHostToDevice) ); // allocates memory for kernel_matrix in GPU - Matrix d_kernel_matrix; - d_kernel_matrix.width = NUMBER_OF_POINTS; - d_kernel_matrix.height = NUMBER_OF_POINTS; + d_kernel_matrix->width = NUMBER_OF_POINTS; + d_kernel_matrix->height = NUMBER_OF_POINTS; size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); - gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) ); + gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) ); - // get max sizes supported from the device - int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock); - int requested_block_size = max_block_size; + // allocates memory for denominator in GPU + d_denominator->width = 1; + d_denominator->height = NUMBER_OF_POINTS; + size = NUMBER_OF_POINTS * sizeof(double); + gpuErrchk( cudaMalloc(&(d_denominator->elements), size) ); + + // allocates memory for mean_shift_vector in GPU + d_mean_shift_vector->width = DIMENSIONS; + d_mean_shift_vector->height = NUMBER_OF_POINTS; + size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); + gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) ); +} + +void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, + Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){ + int size; + static bool first_iter = true; + // gets max block size supported from the device + int max_block_size = device_properties.maxThreadsPerBlock; + int requested_block_size = (int)sqrt(max_block_size); bool block_size_too_big = true; dim3 dimBlock; dim3 dimGrid; - do { dimBlock.x = requested_block_size; dimBlock.y = requested_block_size; @@ -323,65 +354,106 @@ void calculate_kernel_matrix(double **shifted_points, double **original_points, size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements , size, cudaMemcpyDeviceToHost) ); - - gpuErrchk( cudaFree(d_shifted_points.elements) ); - gpuErrchk( cudaFree(d_original_points.elements) ); - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); } - -void multiply(double **kernel_matrix, double **original_points, double ***new_shift){ +double * calculate_denominator(double **kernel_matrix){ static bool first_iter = true; + // allocates memory for denominator_matrix in GPU + Matrix d_denominator_matrix; + d_denominator_matrix.width = NUMBER_OF_POINTS; + d_denominator_matrix.height = 1; + int size = NUMBER_OF_POINTS * sizeof(double); + gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) ); + // allocates memory for kernel_matrix in GPU and copies the array Matrix d_kernel_matrix; d_kernel_matrix.width = NUMBER_OF_POINTS; d_kernel_matrix.height = NUMBER_OF_POINTS; - int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); + size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) ); gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]) - , size, cudaMemcpyHostToDevice) ); - - // allocates memory for original_points in GPU and copies the array - Matrix d_original_points; - d_original_points.width = DIMENSIONS; - d_original_points.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMalloc(&d_original_points.elements, size) ); - gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0]) - , size, cudaMemcpyHostToDevice) ); - - // allocates memory for new_shift in GPU - Matrix d_new_shift; - d_new_shift.width = DIMENSIONS; - d_new_shift.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMalloc(&d_new_shift.elements, size) ); + , size, cudaMemcpyHostToDevice) ); // get max sizes supported from the device int max_block_size = device_properties.maxThreadsPerBlock; - dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size) - , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size)); - dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x - , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y); + dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size) + , (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size)); + dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x + , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y); if (first_iter && params.verbose){ - printf("multiply_kernel called with:\n"); + printf("calculate_denominator called with:\n"); printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); first_iter = false; } - multiply_kernel<<>>(d_kernel_matrix, d_original_points, d_new_shift); + denominator_kernel<<>>(d_denominator_matrix, d_kernel_matrix); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); + + size = NUMBER_OF_POINTS * sizeof(double); + double ** denominator = (double**)malloc(size); + gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements + ,size, cudaMemcpyDeviceToHost) ); + + gpuErrchk( cudaFree(d_kernel_matrix.elements) ); + gpuErrchk( cudaFree(d_denominator_matrix.elements) ); + + return (*denominator); +} + +void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, + Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, + double **original_points, double ***new_shift, double ***mean_shift_vector){ + int size; + static bool first_iter = true; + // gets max block size supported from the device + int max_block_size = device_properties.maxThreadsPerBlock; + int requested_block_size = (int)sqrt(max_block_size); + bool block_size_too_big = true; + + dim3 dimBlock; + dim3 dimGrid; + do { + dimBlock.x = requested_block_size; + dimBlock.y = 2; + dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; + dimGrid.y = 1; + + shift_points_kernel<<>>(d_original_points, d_kernel_matrix, d_shifted_points, + d_new_shift, d_denominator, d_mean_shift_vector); + if (cudaGetLastError() != cudaSuccess){ + --requested_block_size; + } else { + block_size_too_big = false; + gpuErrchk( cudaDeviceSynchronize() ); + } + } while(block_size_too_big); + + if (first_iter && params.verbose){ + printf("shift_points_kernel called with:\n"); + printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); + printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); + first_iter = false; + } + size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements , size, cudaMemcpyDeviceToHost) ); + gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements + , size, cudaMemcpyDeviceToHost) ); +} - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); +void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, + Matrix d_new_shift){ + // frees all memory previously allocated in device gpuErrchk( cudaFree(d_original_points.elements) ); + gpuErrchk( cudaFree(d_kernel_matrix.elements) ); + //gpuErrchk( cudaFree(d_shifted_points.elements) ); + gpuErrchk( cudaFree(d_denominator.elements) ); gpuErrchk( cudaFree(d_new_shift.elements) ); } @@ -434,53 +506,4 @@ void save_matrix(double **matrix, int iteration){ } fprintf(file, "\n"); } -} - -double * calculate_denominator(double **kernel_matrix){ - static bool first_iter = true; - - // allocates memory for denominator_matrix in GPU - Matrix d_denominator_matrix; - d_denominator_matrix.width = NUMBER_OF_POINTS; - d_denominator_matrix.height = 1; - int size = NUMBER_OF_POINTS * sizeof(double); - gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) ); - - // allocates memory for kernel_matrix in GPU and copies the array - Matrix d_kernel_matrix; - d_kernel_matrix.width = NUMBER_OF_POINTS; - d_kernel_matrix.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); - gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) ); - gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]) - , size, cudaMemcpyHostToDevice) ); - - // get max sizes supported from the device - int max_block_size = device_properties.maxThreadsPerBlock; - dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size) - , (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size)); - dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x - , (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y); - - if (first_iter && params.verbose){ - printf("calculate_denominator called with:\n"); - printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); - printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); - first_iter = false; - } - - denominator_kernel<<>>(d_denominator_matrix, d_kernel_matrix); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - - size = NUMBER_OF_POINTS * sizeof(double); - double ** denominator = (double**)malloc(size); - gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements - ,size, cudaMemcpyDeviceToHost) ); - - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); - gpuErrchk( cudaFree(d_denominator_matrix.elements) ); - - return (*denominator); } \ No newline at end of file diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h index fbf4394..75f7c7f 100644 --- a/mean_shift_cuda/meanshift_utils.h +++ b/mean_shift_cuda/meanshift_utils.h @@ -2,6 +2,7 @@ #define SERIAL_UTILS_H #include +#include "meanshift_kernels.h" //GPU error check snippet taken from: //https://stackoverflow.com/a/14038590 @@ -35,7 +36,7 @@ void get_args(int argc, char **argv, parameters *params); //Function init reads the dataset and label arrays from the corresponding files. void init(double ***vectors, char **labels); -void set_Gpu(); +void set_GPU(); //Function meanshift recursively shifts original points according to th //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user @@ -43,16 +44,24 @@ void set_Gpu(); int meanshift(double **original_points, double ***shifted_points, int h , parameters *opt); +void init_device_memory(double **original_points, double **shifted_points, + Matrix *d_original_points, Matrix *d_shifted_points, + Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift); + //Function norm returns the second norm of matrix of dimensions rowsXcols. double norm(double **matrix, int rows, int cols); -void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation - , double ***kernel_matrix); +void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, + Matrix d_kernel_matrix, double deviation, double ***kernel_matrix); //Function multiply allocates memory in GPU, sends the data and calls the //multiply kernel function. -void multiply(double **kernel_matrix, double **original_points - , double ***new_shift); +void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, + Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, + double **original_points, double ***new_shift, double ***mean_shift_vector); + +void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, + Matrix d_new_shift); //Function calculateDistance returns the distance between x and y vectors. double calculateDistance(double *y, double *x);