Browse Source

Fix shared memory implementation

master
Apostolos Fanakis 7 years ago
parent
commit
64c24ec313
  1. 2
      mean_shift_cuda_shared_mem/meanshift.cu
  2. 16
      mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
  3. 29
      mean_shift_cuda_shared_mem/meanshift_kernels.cu

2
mean_shift_cuda_shared_mem/meanshift.cu

@ -21,7 +21,7 @@ int main(int argc, char **argv){
char *labels; char *labels;
params.epsilon = 0.0001; params.epsilon = 0.0001;
params.verbose = false; params.verbose = true;
params.display = true; params.display = true;
//get_args(argc, argv, &params); //commented out while in development //get_args(argc, argv, &params); //commented out while in development
init(&vectors, &labels); init(&vectors, &labels);

16
mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu

@ -113,6 +113,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
// allocates corresponding memory in device // allocates corresponding memory in device
d_new_shift.width = DIMENSIONS; d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS; d_new_shift.height = NUMBER_OF_POINTS;
d_new_shift.stride = d_new_shift.width;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
@ -121,17 +122,17 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
&tmp_w_memcpy_time); &tmp_w_memcpy_time);
w_memcpy_time += tmp_w_memcpy_time; w_memcpy_time += tmp_w_memcpy_time;
for (int row=0; row<2; ++row){ /*for (int row=0; row<2; ++row){
for (int col=0; col<2; ++col){ for (int col=0; col<2; ++col){
printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]); printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]); printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]); printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
} }
} }*/
if(is_first_recursion){ /*if(is_first_recursion){
exit(0); exit(0);
} }*/
// frees previously shifted points, they're now garbage // frees previously shifted points, they're now garbage
free((*shifted_points)[0]); free((*shifted_points)[0]);
@ -195,6 +196,7 @@ void init_device_memory(double **original_points, double **shifted_points,
// allocates memory for original_points in GPU and copies the array // allocates memory for original_points in GPU and copies the array
d_original_points->width = DIMENSIONS; d_original_points->width = DIMENSIONS;
d_original_points->height = NUMBER_OF_POINTS; d_original_points->height = NUMBER_OF_POINTS;
d_original_points->stride = d_original_points->width;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_original_points->elements), size) ); gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0]) gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
@ -203,6 +205,7 @@ void init_device_memory(double **original_points, double **shifted_points,
// allocates memory for shifted_points in GPU and copies the array // allocates memory for shifted_points in GPU and copies the array
d_shifted_points->width = DIMENSIONS; d_shifted_points->width = DIMENSIONS;
d_shifted_points->height = NUMBER_OF_POINTS; d_shifted_points->height = NUMBER_OF_POINTS;
d_shifted_points->stride = d_shifted_points->width;
size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double); size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) ); gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0]) gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
@ -211,18 +214,21 @@ void init_device_memory(double **original_points, double **shifted_points,
// allocates memory for kernel_matrix in GPU // allocates memory for kernel_matrix in GPU
d_kernel_matrix->width = NUMBER_OF_POINTS; d_kernel_matrix->width = NUMBER_OF_POINTS;
d_kernel_matrix->height = NUMBER_OF_POINTS; d_kernel_matrix->height = NUMBER_OF_POINTS;
d_kernel_matrix->stride = d_kernel_matrix->width;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) ); gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
// allocates memory for denominator in GPU // allocates memory for denominator in GPU
d_denominator->width = 1; d_denominator->width = 1;
d_denominator->height = NUMBER_OF_POINTS; d_denominator->height = NUMBER_OF_POINTS;
d_denominator->stride = d_denominator->width;
size = NUMBER_OF_POINTS * sizeof(double); size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_denominator->elements), size) ); gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
// allocates memory for mean_shift_vector in GPU // allocates memory for mean_shift_vector in GPU
d_mean_shift_vector->width = DIMENSIONS; d_mean_shift_vector->width = DIMENSIONS;
d_mean_shift_vector->height = NUMBER_OF_POINTS; d_mean_shift_vector->height = NUMBER_OF_POINTS;
d_mean_shift_vector->stride = d_mean_shift_vector->width;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) ); gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
} }
@ -324,7 +330,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
dimBlock.y = d_new_shift.width;*/ dimBlock.y = d_new_shift.width;*/
dimBlock.x = 2; dimBlock.x = 2;
dimBlock.y = 2; dimBlock.y = 2;
dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x; dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1; dimGrid.y = 1;
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points, shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,

29
mean_shift_cuda_shared_mem/meanshift_kernels.cu

@ -67,14 +67,11 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
int col = threadIdx.y; int col = threadIdx.y;
// performs calculations only if thread's indexes are within matrix bounds // performs calculations only if thread's indexes are within matrix bounds
//if (row * new_shift.width + col >= new_shift.width * new_shift.height){
/*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >=
new_shift.width * new_shift.height){*/
if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){ if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
return; return;
} }
// Each thread block computes one sub-matrix sub_new_shift of C // each thread block computes one sub-matrix sub_new_shift of C
Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE); Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
// shared memory used to store sub_kernel_matrix and sub_original_points respectively // shared memory used to store sub_kernel_matrix and sub_original_points respectively
@ -84,7 +81,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
// loops over all the sub-matrices of kernel_matrix and original_points that are required to // loops over all the sub-matrices of kernel_matrix and original_points that are required to
//compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) { for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) {
// gets sub-matrix sub_kernel_matrix of kernel_matrix // gets sub-matrix sub_kernel_matrix of kernel_matrix
@ -104,8 +101,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
// multiplies sub_kernel_matrix and sub_original_points // multiplies sub_kernel_matrix and sub_original_points
for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] * cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
s_sub_original_points[element_index * sub_original_points.stride + col]; s_sub_original_points[element_index * BLOCK_SIZE + col];
} }
// synchronizes to make sure that the preceding computation is done before loading two new // synchronizes to make sure that the preceding computation is done before loading two new
@ -114,21 +111,15 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
} }
// new_shift elements are calculated by dividing with the denominator // new_shift elements are calculated by dividing with the denominator
int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride;
int cell_col = block_col * BLOCK_SIZE + col;
//sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row];
sub_new_shift.elements[row * sub_new_shift.stride + col] = sub_new_shift.elements[row * sub_new_shift.stride + col] =
cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
// calculates mean-shift vector
/*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride
+ (block_col * BLOCK_SIZE + col)] =
sub_new_shift.elements[row * sub_new_shift.stride + col] -
shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride
+ (block_col * BLOCK_SIZE + col)];*/
/*free(s_sub_kernel_matrix); int cell_row = block_row * BLOCK_SIZE + row;
free(s_sub_original_points);*/ int cell_col = block_col * BLOCK_SIZE + col;
mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] =
sub_new_shift.elements[row * sub_new_shift.stride + col] -
shifted_points.elements[cell_row * shifted_points.stride + cell_col];
} }
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is // Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
@ -138,7 +129,7 @@ __device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
Matrix Asub; Matrix Asub;
Asub.width = BLOCK_SIZE; Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE; Asub.height = BLOCK_SIZE;
Asub.stride = BLOCK_SIZE; Asub.stride = A.stride;
Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]); Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
return Asub; return Asub;
} }
Loading…
Cancel
Save