From 64c24ec313d548111126920920dc7b00a1e61a55 Mon Sep 17 00:00:00 2001 From: Apostolof Date: Sun, 28 Jan 2018 16:02:13 +0200 Subject: [PATCH] Fix shared memory implementation --- mean_shift_cuda_shared_mem/meanshift.cu | 2 +- .../meanshift_gpu_utils.cu | 16 +++++++--- .../meanshift_kernels.cu | 31 +++++++------------ 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/mean_shift_cuda_shared_mem/meanshift.cu b/mean_shift_cuda_shared_mem/meanshift.cu index 37b971a..3b33b84 100644 --- a/mean_shift_cuda_shared_mem/meanshift.cu +++ b/mean_shift_cuda_shared_mem/meanshift.cu @@ -21,7 +21,7 @@ int main(int argc, char **argv){ char *labels; params.epsilon = 0.0001; - params.verbose = false; + params.verbose = true; params.display = true; //get_args(argc, argv, ¶ms); //commented out while in development init(&vectors, &labels); diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu index 7f18758..8bc80ed 100644 --- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu +++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu @@ -113,6 +113,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) // allocates corresponding memory in device d_new_shift.width = DIMENSIONS; d_new_shift.height = NUMBER_OF_POINTS; + d_new_shift.stride = d_new_shift.width; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); @@ -121,17 +122,17 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) &tmp_w_memcpy_time); w_memcpy_time += tmp_w_memcpy_time; - for (int row=0; row<2; ++row){ + /*for (int row=0; row<2; ++row){ for (int col=0; col<2; ++col){ printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]); printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]); printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]); } - } + }*/ - if(is_first_recursion){ + /*if(is_first_recursion){ exit(0); - } + }*/ // frees previously shifted points, they're now garbage free((*shifted_points)[0]); @@ -195,6 +196,7 @@ void init_device_memory(double **original_points, double **shifted_points, // allocates memory for original_points in GPU and copies the array d_original_points->width = DIMENSIONS; d_original_points->height = NUMBER_OF_POINTS; + d_original_points->stride = d_original_points->width; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&(d_original_points->elements), size) ); gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0]) @@ -203,6 +205,7 @@ void init_device_memory(double **original_points, double **shifted_points, // allocates memory for shifted_points in GPU and copies the array d_shifted_points->width = DIMENSIONS; d_shifted_points->height = NUMBER_OF_POINTS; + d_shifted_points->stride = d_shifted_points->width; size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) ); gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0]) @@ -211,18 +214,21 @@ void init_device_memory(double **original_points, double **shifted_points, // allocates memory for kernel_matrix in GPU d_kernel_matrix->width = NUMBER_OF_POINTS; d_kernel_matrix->height = NUMBER_OF_POINTS; + d_kernel_matrix->stride = d_kernel_matrix->width; size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) ); // allocates memory for denominator in GPU d_denominator->width = 1; d_denominator->height = NUMBER_OF_POINTS; + d_denominator->stride = d_denominator->width; size = NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&(d_denominator->elements), size) ); // allocates memory for mean_shift_vector in GPU d_mean_shift_vector->width = DIMENSIONS; d_mean_shift_vector->height = NUMBER_OF_POINTS; + d_mean_shift_vector->stride = d_mean_shift_vector->width; size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) ); } @@ -324,7 +330,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi dimBlock.y = d_new_shift.width;*/ dimBlock.x = 2; dimBlock.y = 2; - dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x; + dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x; dimGrid.y = 1; shift_points_kernel<<>>(d_original_points, d_kernel_matrix, d_shifted_points, diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu index f8078fb..99368b6 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu @@ -66,15 +66,12 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix int row = threadIdx.x; int col = threadIdx.y; - // performs calculations only if thread's indexes are within matrix bounds - //if (row * new_shift.width + col >= new_shift.width * new_shift.height){ - /*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >= - new_shift.width * new_shift.height){*/ + // performs calculations only if thread's indexes are within matrix bounds if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){ return; } - // Each thread block computes one sub-matrix sub_new_shift of C + // each thread block computes one sub-matrix sub_new_shift of C Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE); // shared memory used to store sub_kernel_matrix and sub_original_points respectively @@ -84,7 +81,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); // loops over all the sub-matrices of kernel_matrix and original_points that are required to - //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results + // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) { // gets sub-matrix sub_kernel_matrix of kernel_matrix @@ -104,8 +101,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix // multiplies sub_kernel_matrix and sub_original_points for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ - cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] * - s_sub_original_points[element_index * sub_original_points.stride + col]; + cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] * + s_sub_original_points[element_index * BLOCK_SIZE + col]; } // synchronizes to make sure that the preceding computation is done before loading two new @@ -114,21 +111,15 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix } // new_shift elements are calculated by dividing with the denominator - int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride; - int cell_col = block_col * BLOCK_SIZE + col; - //sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row]; sub_new_shift.elements[row * sub_new_shift.stride + col] = cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; - // calculates mean-shift vector - /*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride - + (block_col * BLOCK_SIZE + col)] = - sub_new_shift.elements[row * sub_new_shift.stride + col] - - shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride - + (block_col * BLOCK_SIZE + col)];*/ - /*free(s_sub_kernel_matrix); - free(s_sub_original_points);*/ + int cell_row = block_row * BLOCK_SIZE + row; + int cell_col = block_col * BLOCK_SIZE + col; + mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] = + sub_new_shift.elements[row * sub_new_shift.stride + col] - + shifted_points.elements[cell_row * shifted_points.stride + cell_col]; } // Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is @@ -138,7 +129,7 @@ __device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){ Matrix Asub; Asub.width = BLOCK_SIZE; Asub.height = BLOCK_SIZE; - Asub.stride = BLOCK_SIZE; + Asub.stride = A.stride; Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]); return Asub; } \ No newline at end of file