From 5084818391fc8937168d940d9332b2ddd16fd653 Mon Sep 17 00:00:00 2001 From: Apostolof Date: Fri, 26 Jan 2018 16:34:33 +0200 Subject: [PATCH] Fix calculate_denominator and denominator_kernel --- mean_shift_cuda/meanshift_kernels.cu | 15 +++--- mean_shift_cuda/meanshift_utils.cu | 79 ++++++++++------------------ mean_shift_cuda/meanshift_utils.h | 2 +- 3 files changed, 38 insertions(+), 58 deletions(-) diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu index 3407613..533b928 100644 --- a/mean_shift_cuda/meanshift_kernels.cu +++ b/mean_shift_cuda/meanshift_kernels.cu @@ -61,14 +61,15 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix } __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ - + // Each thread computes one element of denominator_kernel + // by accumulating results into cell_value + double cell_value = 0; int row = blockIdx.x * blockDim.x + threadIdx.x; - int col = blockIdx.y * blockDim.y + threadIdx.y; - if (row * denominator.width + col > denominator.width * denominator.height){ - return; + if (row < denominator.height){ + for (int column = 0; column < kernel_matrix.width; ++column){ + cell_value += kernel_matrix.elements[row * kernel_matrix.width + column]; + } + denominator.elements[row] = cell_value; } - - denominator.elements[col]=0; - denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col]; } \ No newline at end of file diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu index ce08ad3..e602ff6 100644 --- a/mean_shift_cuda/meanshift_utils.cu +++ b/mean_shift_cuda/meanshift_utils.cu @@ -202,15 +202,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation &kernel_matrix); // calculates denominator - for (int i=0; i>>(d_denominator, d_kernel_matrix); + if (cudaGetLastError() != cudaSuccess){ + --requested_block_size; + } else { + block_size_too_big = false; + gpuErrchk( cudaDeviceSynchronize() ); + } + } while(block_size_too_big); + if (first_iter && params.verbose){ printf("calculate_denominator called with:\n"); printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); @@ -389,20 +379,9 @@ double * calculate_denominator(double **kernel_matrix){ first_iter = false; } - denominator_kernel<<>>(d_denominator_matrix, d_kernel_matrix); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - size = NUMBER_OF_POINTS * sizeof(double); - double ** denominator = (double**)malloc(size); - gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements - ,size, cudaMemcpyDeviceToHost) ); - - gpuErrchk( cudaFree(d_kernel_matrix.elements) ); - gpuErrchk( cudaFree(d_denominator_matrix.elements) ); - - return (*denominator); + gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements + , size, cudaMemcpyDeviceToHost) ); } void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, @@ -411,8 +390,8 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi int size; static bool first_iter = true; // gets max block size supported from the device - int max_block_size = device_properties.maxThreadsPerBlock; - int requested_block_size = (int)sqrt(max_block_size); + static int max_block_size = device_properties.maxThreadsPerBlock; + static int requested_block_size = (int)(max_block_size / 2); bool block_size_too_big = true; dim3 dimBlock; @@ -420,7 +399,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi do { dimBlock.x = requested_block_size; dimBlock.y = 2; - dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; + dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x; dimGrid.y = 1; shift_points_kernel<<>>(d_original_points, d_kernel_matrix, d_shifted_points, diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h index 75f7c7f..0232116 100644 --- a/mean_shift_cuda/meanshift_utils.h +++ b/mean_shift_cuda/meanshift_utils.h @@ -82,6 +82,6 @@ void save_matrix(double **matrix //Function calculate_denominator allocates memory in GPU, sends the data and calls the //denominator kernel function. -double * calculate_denominator(double **kernel_matrix); +void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator); #endif //SERIAL_UTILS_H \ No newline at end of file