From 9779116930e2f9f9c9000500bcaf353ca554d2d5 Mon Sep 17 00:00:00 2001 From: Apostolof Date: Thu, 1 Feb 2018 02:16:07 +0200 Subject: [PATCH] Add asymmetric block sizes, occupancy improvement --- .../meanshift_gpu_utils.cu | 9 ++-- .../meanshift_kernels.cu | 51 ++++++++++--------- .../meanshift_kernels.h | 3 +- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu index 6566df4..6e6437d 100644 --- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu +++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu @@ -320,15 +320,18 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi dim3 dimBlock; dim3 dimGrid; do { - dimBlock.x = requested_block_size; - dimBlock.y = requested_block_size; + /*dimBlock.x = requested_block_size; + dimBlock.y = requested_block_size;*/ + //to do + dimBlock.x = 240; + dimBlock.y = 2; dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x; dimGrid.y = (d_new_shift.width + dimBlock.y - 1) / dimBlock.y; // size for kernel's dynamically allocated array // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double // the arrays needed in kernel are two - shared_memory_size = (int)(dimBlock.x * dimBlock.x * sizeof(double) * 2); + shared_memory_size = (int)(dimBlock.x * dimBlock.y * sizeof(double) * 2); shift_points_kernel<<>>(d_original_points, d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector); diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu index 9c2d948..2181327 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu @@ -55,7 +55,8 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){ - int BLOCK_SIZE = blockDim.x; + int ROW_BLOCK_SIZE = blockDim.x; + int COLUMN_BLOCK_SIZE = blockDim.y; int block_row = blockIdx.x; int block_col = blockIdx.y; @@ -67,52 +68,56 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix int col = threadIdx.y; // performs calculations only if thread's indexes are within matrix bounds - if ((BLOCK_SIZE * block_row + row) > new_shift.height || - (BLOCK_SIZE * block_col + col) > new_shift.width){ + if ((ROW_BLOCK_SIZE * block_row + row) > new_shift.height || + (COLUMN_BLOCK_SIZE * block_col + col) > new_shift.width){ return; } // each thread block computes one sub-matrix sub_new_shift of C - Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, BLOCK_SIZE); + Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, ROW_BLOCK_SIZE, + COLUMN_BLOCK_SIZE); // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points // respectively extern __shared__ double joined_shared_memory[]; // makes sure enough memory has been allocated - joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0; + joined_shared_memory[ROW_BLOCK_SIZE * COLUMN_BLOCK_SIZE * 2 - 1] = 0; // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used // for s_sub_original_points double *s_sub_kernel_matrix = (double *)joined_shared_memory; - double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]; + double *s_sub_original_points = (double *)&joined_shared_memory[ROW_BLOCK_SIZE * + COLUMN_BLOCK_SIZE]; // loops over all the sub-matrices of kernel_matrix and original_points that are required to // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results for (int sub_matrix_index = 0; - sub_matrix_index < ((kernel_matrix.width + BLOCK_SIZE - 1) / BLOCK_SIZE); + sub_matrix_index < ((kernel_matrix.width + COLUMN_BLOCK_SIZE - 1) / COLUMN_BLOCK_SIZE); ++sub_matrix_index) { // gets sub-matrix sub_kernel_matrix of kernel_matrix - Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE); + Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index, + ROW_BLOCK_SIZE, COLUMN_BLOCK_SIZE); // gets sub-matrix sub_original_points of original_points - Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE); + Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col, + COLUMN_BLOCK_SIZE, ROW_BLOCK_SIZE); // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared - //memory, each thread loads one element of each sub-matrix - s_sub_kernel_matrix[row * BLOCK_SIZE + col] = + // memory, each thread loads one element of each sub-matrix + s_sub_kernel_matrix[row * COLUMN_BLOCK_SIZE + col] = sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col]; - s_sub_original_points[row * BLOCK_SIZE + col] = + s_sub_original_points[row * COLUMN_BLOCK_SIZE + col] = sub_original_points.elements[row * sub_original_points.stride + col]; // synchronizes to make sure the sub-matrices are loaded before starting the computation __syncthreads(); - int curr_col_begin = sub_matrix_index * BLOCK_SIZE; + int curr_col_begin = sub_matrix_index * COLUMN_BLOCK_SIZE; // multiplies sub_kernel_matrix and sub_original_points - for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ + for (int element_index = 0; element_index < COLUMN_BLOCK_SIZE; ++element_index){ if (curr_col_begin + element_index < kernel_matrix.width){ - cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] * - s_sub_original_points[element_index * BLOCK_SIZE + col]; + cell_value += s_sub_kernel_matrix[row * COLUMN_BLOCK_SIZE + element_index] * + s_sub_original_points[element_index * COLUMN_BLOCK_SIZE + col]; } } @@ -123,10 +128,10 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix // new_shift elements are calculated by dividing with the denominator sub_new_shift.elements[row * sub_new_shift.stride + col] = - cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; + cell_value / denominator.elements[block_row * ROW_BLOCK_SIZE + row]; - int cell_row = block_row * BLOCK_SIZE + row; - int cell_col = block_col * BLOCK_SIZE + col; + int cell_row = block_row * ROW_BLOCK_SIZE + row; + int cell_col = block_col * COLUMN_BLOCK_SIZE + col; mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] = sub_new_shift.elements[row * sub_new_shift.stride + col] - shifted_points.elements[cell_row * shifted_points.stride + cell_col]; @@ -134,11 +139,11 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix // gets the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is located col sub-matrices to the right // and row sub-matrices down from the upper-left corner of A -__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE){ +__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE, int COLUMN_BLOCK_SIZE){ Matrix Asub; - Asub.width = BLOCK_SIZE; - Asub.height = BLOCK_SIZE; + Asub.width = COLUMN_BLOCK_SIZE; + Asub.height = ROW_BLOCK_SIZE; Asub.stride = A.stride; - Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]); + Asub.elements = &(A.elements[A.stride * ROW_BLOCK_SIZE * row + COLUMN_BLOCK_SIZE * col]); return Asub; } \ No newline at end of file diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.h b/mean_shift_cuda_shared_mem/meanshift_kernels.h index 16e9f95..db93809 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.h +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h @@ -24,6 +24,7 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix); __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector); -__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE); +__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE, + int COLUMN_BLOCK_SIZE); #endif //SERIAL_KERNELS_H \ No newline at end of file