Browse Source

Fix kernel dynamic memory allocation

master
Apostolos Fanakis 7 years ago
parent
commit
e4fe746939
  1. 25
      mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
  2. 12
      mean_shift_cuda_shared_mem/meanshift_kernels.cu

25
mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu

@ -122,18 +122,6 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
&tmp_w_memcpy_time);
w_memcpy_time += tmp_w_memcpy_time;
/*for (int row=0; row<2; ++row){
for (int col=0; col<2; ++col){
printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
}
}*/
/*if(is_first_recursion){
exit(0);
}*/
// frees previously shifted points, they're now garbage
free((*shifted_points)[0]);
gpuErrchk( cudaFree(d_shifted_points.elements) );
@ -328,13 +316,16 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
do {
/*dimBlock.x = requested_block_size;
dimBlock.y = d_new_shift.width;*/
dimBlock.x = 2;
dimBlock.y = 2;
dimBlock.x = min(d_new_shift.width, d_new_shift.height);
dimBlock.y = min(d_new_shift.width, d_new_shift.height);
dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1;
dimGrid.y = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
int shared_memory_size = dimBlock.x * 2 * sizeof(double);
//Kernel <<<numBlocks, threadsPerBlock, sharedMemory>>> (count_a, count_b);
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
d_new_shift, d_denominator, d_mean_shift_vector);
shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {

12
mean_shift_cuda_shared_mem/meanshift_kernels.cu

@ -74,11 +74,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
// each thread block computes one sub-matrix sub_new_shift of new_shift
Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
// shared memory used to store sub_kernel_matrix and sub_original_points respectively
__shared__ double *s_sub_kernel_matrix;
s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
__shared__ double *s_sub_original_points;
s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
// dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
// respectively
extern __shared__ double joined_shared_memory[];
// first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
// for s_sub_original_points
double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
// loops over all the sub-matrices of kernel_matrix and original_points that are required to
// compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results

Loading…
Cancel
Save