diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift index 02e8444..dc21f7a 100644 Binary files a/mean_shift_cuda/meanshift and b/mean_shift_cuda/meanshift differ diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu index ff284b8..4115289 100644 --- a/mean_shift_cuda/meanshift_kernels.cu +++ b/mean_shift_cuda/meanshift_kernels.cu @@ -15,4 +15,35 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){ } output.elements[row * output.width + col] = cell_value; } +} + +__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points + , double deviation, Matrix kernel_matrix){ + // Each thread calculates one element of kernel_matrix + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + if (row * kernel_matrix.width + col > kernel_matrix.width * kernel_matrix.height){ + return; + } + + int dimensions = shifted_points.width; + + double sum = 0, dif; + for (int i=0; i>>(d_shifted_points, d_original_points + , deviation, d_kernel_matrix); + if (cudaGetLastError() != cudaSuccess){ + --requested_block_size; + } else { + block_size_too_big = false; + gpuErrchk( cudaDeviceSynchronize() ); + } + } while(block_size_too_big); + + if (first_iter && params.verbose){ + printf("calculate_kernel_matrix_kernel called with:\n"); + printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); + printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); + first_iter = false; + } + + size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); + gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements + , size, cudaMemcpyDeviceToHost) ); + + gpuErrchk( cudaFree(d_shifted_points.elements) ); + gpuErrchk( cudaFree(d_original_points.elements) ); + gpuErrchk( cudaFree(d_kernel_matrix.elements) ); +} + + void multiply(double **kernel_matrix, double **original_points, double ***new_shift){ - static bool firstIter = true; + static bool first_iter = true; - // allocates memory for kernel_matrix in GPU and copies the array - Matrix d_kernel_matrix; + // allocates memory for kernel_matrix in GPU and copies the array + Matrix d_kernel_matrix; d_kernel_matrix.width = NUMBER_OF_POINTS; d_kernel_matrix.height = NUMBER_OF_POINTS; int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) ); gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]) - , size, cudaMemcpyHostToDevice) ); + , size, cudaMemcpyHostToDevice) ); // allocates memory for original_points in GPU and copies the array Matrix d_original_points; @@ -290,24 +346,27 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&d_original_points.elements, size) ); gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0]) - , size, cudaMemcpyHostToDevice) ); + , size, cudaMemcpyHostToDevice) ); - // allocates memory for new_shift in GPU + // allocates memory for new_shift in GPU Matrix d_new_shift; d_new_shift.width = DIMENSIONS; d_new_shift.height = NUMBER_OF_POINTS; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&d_new_shift.elements, size) ); - dim3 dimBlock((d_new_shift.height < sqrt(BLOCK_SIZE)) ? d_new_shift.height : sqrt(BLOCK_SIZE) - , (d_new_shift.width < sqrt(BLOCK_SIZE)) ? d_new_shift.width : sqrt(BLOCK_SIZE)); + // get max sizes supported from the device + int max_block_size = device_properties.maxThreadsPerBlock; + dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size) + , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size)); dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y); - if (firstIter && params.verbose){ + if (first_iter && params.verbose){ + printf("multiply_kernel called with:\n"); printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); - firstIter = false; + first_iter = false; } multiply_kernel<<>>(d_kernel_matrix, d_original_points, d_new_shift); @@ -316,7 +375,7 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements - , size, cudaMemcpyDeviceToHost) ); + , size, cudaMemcpyDeviceToHost) ); gpuErrchk( cudaFree(d_kernel_matrix.elements) ); gpuErrchk( cudaFree(d_original_points.elements) ); diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h index ae15d02..9d80c14 100644 --- a/mean_shift_cuda/meanshift_utils.h +++ b/mean_shift_cuda/meanshift_utils.h @@ -7,8 +7,7 @@ //https://stackoverflow.com/a/14038590 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ - if (code != cudaSuccess) - { + if (code != cudaSuccess){ fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } @@ -42,15 +41,18 @@ void set_Gpu(); //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user //options, h is the desirable deviation. int meanshift(double **original_points, double ***shifted_points, int h - , parameters *opt); + , parameters *opt); //Function norm returns the second norm of matrix of dimensions rowsXcols. double norm(double **matrix, int rows, int cols); +void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation + , double ***kernel_matrix); + //Function multiply allocates memory in GPU, sends the data and calls the //multiply kernel function. void multiply(double **kernel_matrix, double **original_points - , double ***new_shift); + , double ***new_shift); //Function calculateDistance returns the distance between x and y vectors. double calculateDistance(double *y, double *x); @@ -67,6 +69,6 @@ void print_matrix(double **array, int rows, int cols); //Function save_matrix prints matrix in a csv file with path/filename //"output/output_iteration". If a file already exists new lines are concatenated. void save_matrix(double **matrix - , int iteration); + , int iteration); #endif //SERIAL_UTILS_H \ No newline at end of file