|
@ -55,7 +55,8 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ |
|
|
|
|
|
|
|
|
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, |
|
|
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, |
|
|
Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){ |
|
|
Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){ |
|
|
int BLOCK_SIZE = blockDim.x; |
|
|
int ROW_BLOCK_SIZE = blockDim.x; |
|
|
|
|
|
int COLUMN_BLOCK_SIZE = blockDim.y; |
|
|
int block_row = blockIdx.x; |
|
|
int block_row = blockIdx.x; |
|
|
int block_col = blockIdx.y; |
|
|
int block_col = blockIdx.y; |
|
|
|
|
|
|
|
@ -67,52 +68,56 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix |
|
|
int col = threadIdx.y; |
|
|
int col = threadIdx.y; |
|
|
|
|
|
|
|
|
// performs calculations only if thread's indexes are within matrix bounds |
|
|
// performs calculations only if thread's indexes are within matrix bounds |
|
|
if ((BLOCK_SIZE * block_row + row) > new_shift.height || |
|
|
if ((ROW_BLOCK_SIZE * block_row + row) > new_shift.height || |
|
|
(BLOCK_SIZE * block_col + col) > new_shift.width){ |
|
|
(COLUMN_BLOCK_SIZE * block_col + col) > new_shift.width){ |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// each thread block computes one sub-matrix sub_new_shift of C |
|
|
// each thread block computes one sub-matrix sub_new_shift of C |
|
|
Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, BLOCK_SIZE); |
|
|
Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, ROW_BLOCK_SIZE, |
|
|
|
|
|
COLUMN_BLOCK_SIZE); |
|
|
|
|
|
|
|
|
// dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points |
|
|
// dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points |
|
|
// respectively |
|
|
// respectively |
|
|
extern __shared__ double joined_shared_memory[]; |
|
|
extern __shared__ double joined_shared_memory[]; |
|
|
// makes sure enough memory has been allocated |
|
|
// makes sure enough memory has been allocated |
|
|
joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0; |
|
|
joined_shared_memory[ROW_BLOCK_SIZE * COLUMN_BLOCK_SIZE * 2 - 1] = 0; |
|
|
|
|
|
|
|
|
// first part of the allocated memory is used for s_sub_kernel_matrix and second part is used |
|
|
// first part of the allocated memory is used for s_sub_kernel_matrix and second part is used |
|
|
// for s_sub_original_points |
|
|
// for s_sub_original_points |
|
|
double *s_sub_kernel_matrix = (double *)joined_shared_memory; |
|
|
double *s_sub_kernel_matrix = (double *)joined_shared_memory; |
|
|
double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]; |
|
|
double *s_sub_original_points = (double *)&joined_shared_memory[ROW_BLOCK_SIZE * |
|
|
|
|
|
COLUMN_BLOCK_SIZE]; |
|
|
|
|
|
|
|
|
// loops over all the sub-matrices of kernel_matrix and original_points that are required to |
|
|
// loops over all the sub-matrices of kernel_matrix and original_points that are required to |
|
|
// compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results |
|
|
// compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results |
|
|
for (int sub_matrix_index = 0; |
|
|
for (int sub_matrix_index = 0; |
|
|
sub_matrix_index < ((kernel_matrix.width + BLOCK_SIZE - 1) / BLOCK_SIZE); |
|
|
sub_matrix_index < ((kernel_matrix.width + COLUMN_BLOCK_SIZE - 1) / COLUMN_BLOCK_SIZE); |
|
|
++sub_matrix_index) { |
|
|
++sub_matrix_index) { |
|
|
|
|
|
|
|
|
// gets sub-matrix sub_kernel_matrix of kernel_matrix |
|
|
// gets sub-matrix sub_kernel_matrix of kernel_matrix |
|
|
Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE); |
|
|
Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index, |
|
|
|
|
|
ROW_BLOCK_SIZE, COLUMN_BLOCK_SIZE); |
|
|
// gets sub-matrix sub_original_points of original_points |
|
|
// gets sub-matrix sub_original_points of original_points |
|
|
Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE); |
|
|
Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col, |
|
|
|
|
|
COLUMN_BLOCK_SIZE, ROW_BLOCK_SIZE); |
|
|
|
|
|
|
|
|
// loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared |
|
|
// loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared |
|
|
//memory, each thread loads one element of each sub-matrix |
|
|
// memory, each thread loads one element of each sub-matrix |
|
|
s_sub_kernel_matrix[row * BLOCK_SIZE + col] = |
|
|
s_sub_kernel_matrix[row * COLUMN_BLOCK_SIZE + col] = |
|
|
sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col]; |
|
|
sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col]; |
|
|
s_sub_original_points[row * BLOCK_SIZE + col] = |
|
|
s_sub_original_points[row * COLUMN_BLOCK_SIZE + col] = |
|
|
sub_original_points.elements[row * sub_original_points.stride + col]; |
|
|
sub_original_points.elements[row * sub_original_points.stride + col]; |
|
|
|
|
|
|
|
|
// synchronizes to make sure the sub-matrices are loaded before starting the computation |
|
|
// synchronizes to make sure the sub-matrices are loaded before starting the computation |
|
|
__syncthreads(); |
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
int curr_col_begin = sub_matrix_index * BLOCK_SIZE; |
|
|
int curr_col_begin = sub_matrix_index * COLUMN_BLOCK_SIZE; |
|
|
// multiplies sub_kernel_matrix and sub_original_points |
|
|
// multiplies sub_kernel_matrix and sub_original_points |
|
|
for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ |
|
|
for (int element_index = 0; element_index < COLUMN_BLOCK_SIZE; ++element_index){ |
|
|
if (curr_col_begin + element_index < kernel_matrix.width){ |
|
|
if (curr_col_begin + element_index < kernel_matrix.width){ |
|
|
cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] * |
|
|
cell_value += s_sub_kernel_matrix[row * COLUMN_BLOCK_SIZE + element_index] * |
|
|
s_sub_original_points[element_index * BLOCK_SIZE + col]; |
|
|
s_sub_original_points[element_index * COLUMN_BLOCK_SIZE + col]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
@ -123,10 +128,10 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix |
|
|
|
|
|
|
|
|
// new_shift elements are calculated by dividing with the denominator |
|
|
// new_shift elements are calculated by dividing with the denominator |
|
|
sub_new_shift.elements[row * sub_new_shift.stride + col] = |
|
|
sub_new_shift.elements[row * sub_new_shift.stride + col] = |
|
|
cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; |
|
|
cell_value / denominator.elements[block_row * ROW_BLOCK_SIZE + row]; |
|
|
|
|
|
|
|
|
int cell_row = block_row * BLOCK_SIZE + row; |
|
|
int cell_row = block_row * ROW_BLOCK_SIZE + row; |
|
|
int cell_col = block_col * BLOCK_SIZE + col; |
|
|
int cell_col = block_col * COLUMN_BLOCK_SIZE + col; |
|
|
mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] = |
|
|
mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] = |
|
|
sub_new_shift.elements[row * sub_new_shift.stride + col] - |
|
|
sub_new_shift.elements[row * sub_new_shift.stride + col] - |
|
|
shifted_points.elements[cell_row * shifted_points.stride + cell_col]; |
|
|
shifted_points.elements[cell_row * shifted_points.stride + cell_col]; |
|
@ -134,11 +139,11 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix |
|
|
|
|
|
|
|
|
// gets the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is located col sub-matrices to the right |
|
|
// gets the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is located col sub-matrices to the right |
|
|
// and row sub-matrices down from the upper-left corner of A |
|
|
// and row sub-matrices down from the upper-left corner of A |
|
|
__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE){ |
|
|
__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE, int COLUMN_BLOCK_SIZE){ |
|
|
Matrix Asub; |
|
|
Matrix Asub; |
|
|
Asub.width = BLOCK_SIZE; |
|
|
Asub.width = COLUMN_BLOCK_SIZE; |
|
|
Asub.height = BLOCK_SIZE; |
|
|
Asub.height = ROW_BLOCK_SIZE; |
|
|
Asub.stride = A.stride; |
|
|
Asub.stride = A.stride; |
|
|
Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]); |
|
|
Asub.elements = &(A.elements[A.stride * ROW_BLOCK_SIZE * row + COLUMN_BLOCK_SIZE * col]); |
|
|
return Asub; |
|
|
return Asub; |
|
|
} |
|
|
} |