Browse Source

Fix calculate_denominator and denominator_kernel

master
Apostolos Fanakis 7 years ago
parent
commit
5084818391
  1. 15
      mean_shift_cuda/meanshift_kernels.cu
  2. 75
      mean_shift_cuda/meanshift_utils.cu
  3. 2
      mean_shift_cuda/meanshift_utils.h

15
mean_shift_cuda/meanshift_kernels.cu

@ -61,14 +61,15 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
} }
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
// Each thread computes one element of denominator_kernel
// by accumulating results into cell_value
double cell_value = 0;
int row = blockIdx.x * blockDim.x + threadIdx.x; int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row * denominator.width + col > denominator.width * denominator.height){ if (row < denominator.height){
return; for (int column = 0; column < kernel_matrix.width; ++column){
cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
}
denominator.elements[row] = cell_value;
} }
denominator.elements[col]=0;
denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col];
} }

75
mean_shift_cuda/meanshift_utils.cu

@ -202,15 +202,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
&kernel_matrix); &kernel_matrix);
// calculates denominator // calculates denominator
for (int i=0; i<NUMBER_OF_POINTS; i++){ calculate_denominator(d_kernel_matrix, d_denominator, &denominator);
double sum = 0;
for (int j=0; j<NUMBER_OF_POINTS; j++){
sum = sum + kernel_matrix[i][j];
}
denominator[i] = sum;
}
//calculate_denominator(kernel_matrix);
size = NUMBER_OF_POINTS * sizeof(double); size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0]) gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0])
@ -322,8 +314,8 @@ void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
int size; int size;
static bool first_iter = true; static bool first_iter = true;
// gets max block size supported from the device // gets max block size supported from the device
int max_block_size = device_properties.maxThreadsPerBlock; static int max_block_size = device_properties.maxThreadsPerBlock;
int requested_block_size = (int)sqrt(max_block_size); static int requested_block_size = (int)sqrt(max_block_size);
bool block_size_too_big = true; bool block_size_too_big = true;
dim3 dimBlock; dim3 dimBlock;
@ -356,31 +348,29 @@ void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
, size, cudaMemcpyDeviceToHost) ); , size, cudaMemcpyDeviceToHost) );
} }
double * calculate_denominator(double **kernel_matrix){ void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){
int size;
static bool first_iter = true; static bool first_iter = true;
// gets max block size supported from the device
static int requested_block_size = device_properties.maxThreadsPerBlock;
bool block_size_too_big = true;
// allocates memory for denominator_matrix in GPU dim3 dimBlock;
Matrix d_denominator_matrix; dim3 dimGrid;
d_denominator_matrix.width = NUMBER_OF_POINTS; do {
d_denominator_matrix.height = 1; dimBlock.x = requested_block_size;
int size = NUMBER_OF_POINTS * sizeof(double); dimBlock.y = 1;
gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) ); dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1;
// allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
, size, cudaMemcpyHostToDevice) );
// get max sizes supported from the device denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
int max_block_size = device_properties.maxThreadsPerBlock; if (cudaGetLastError() != cudaSuccess){
dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size) --requested_block_size;
, (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size)); } else {
dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x block_size_too_big = false;
, (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y); gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){ if (first_iter && params.verbose){
printf("calculate_denominator called with:\n"); printf("calculate_denominator called with:\n");
@ -389,20 +379,9 @@ double * calculate_denominator(double **kernel_matrix){
first_iter = false; first_iter = false;
} }
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
size = NUMBER_OF_POINTS * sizeof(double); size = NUMBER_OF_POINTS * sizeof(double);
double ** denominator = (double**)malloc(size); gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements
gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
, size, cudaMemcpyDeviceToHost) ); , size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_denominator_matrix.elements) );
return (*denominator);
} }
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
@ -411,8 +390,8 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
int size; int size;
static bool first_iter = true; static bool first_iter = true;
// gets max block size supported from the device // gets max block size supported from the device
int max_block_size = device_properties.maxThreadsPerBlock; static int max_block_size = device_properties.maxThreadsPerBlock;
int requested_block_size = (int)sqrt(max_block_size); static int requested_block_size = (int)(max_block_size / 2);
bool block_size_too_big = true; bool block_size_too_big = true;
dim3 dimBlock; dim3 dimBlock;
@ -420,7 +399,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
do { do {
dimBlock.x = requested_block_size; dimBlock.x = requested_block_size;
dimBlock.y = 2; dimBlock.y = 2;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1; dimGrid.y = 1;
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points, shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,

2
mean_shift_cuda/meanshift_utils.h

@ -82,6 +82,6 @@ void save_matrix(double **matrix
//Function calculate_denominator allocates memory in GPU, sends the data and calls the //Function calculate_denominator allocates memory in GPU, sends the data and calls the
//denominator kernel function. //denominator kernel function.
double * calculate_denominator(double **kernel_matrix); void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator);
#endif //SERIAL_UTILS_H #endif //SERIAL_UTILS_H
Loading…
Cancel
Save