@@ -78,11 +78,18 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    // dynamically allocated shared memory used to store sub_kernel_matrix and
    // sub_original_points, respectively
    extern __shared__ double joined_shared_memory[];
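    // (CUDA permits only a single extern __shared__ array per kernel, so both sub-matrices
    // share this one buffer and are carved out of it by offset below)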

    // the first part of the allocated memory is used for s_sub_kernel_matrix and the second
    // part is used for s_sub_original_points
    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
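
    // minimal host-side sketch (hypothetical launch site, not part of this diff): the two
    // buffers above only exist if the kernel is launched with a matching byte count as the
    // third launch-configuration argument, e.g.
    //     size_t smem_bytes = 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(double);
    //     shift_points_kernel<<<grid_dim, block_dim, smem_bytes>>>(original_points,
    //                                                              kernel_matrix, ...);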

    // cancel execution if the kernel was launched with too little dynamic shared memory
    // (note: sizeof(s_sub_kernel_matrix) only yields the size of the pointer itself, so the
    // actual allocation size is read from the %dynamic_smem_size PTX special register instead)
    unsigned int dynamic_smem_bytes;
    asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(dynamic_smem_bytes));
    if (dynamic_smem_bytes < 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(double)){
        __threadfence();
        asm("trap;");
    }
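
    // (asm("trap;") aborts the entire kernel; host code can detect the abort through the
    // error returned by the next synchronizing call, e.g. a hypothetical check such as
    //     if (cudaDeviceSynchronize() != cudaSuccess) { /* handle aborted kernel */ }
    // placed after the launch)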

    // loops over all the sub-matrices of kernel_matrix and original_points that are required
    // to compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the
    // results
    for (int sub_matrix_index = 0;
@@ -104,10 +111,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
        // synchronizes to make sure the sub-matrices are loaded before starting the computation
        __syncthreads();

        int curr_col_begin = sub_matrix_index * BLOCK_SIZE;
        // multiplies sub_kernel_matrix and sub_original_points, skipping elements whose
        // column falls past the edge of kernel_matrix (the last sub-matrix in a row may be
        // only partially filled)
        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
            if (curr_col_begin + element_index < kernel_matrix.width){
                cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
                              s_sub_original_points[element_index * BLOCK_SIZE + col];
            }
        }
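
        // worked example (assuming, say, BLOCK_SIZE == 16 and kernel_matrix.width == 40):
        // for sub_matrix_index == 2, curr_col_begin == 32, so only element_index 0..7 pass
        // the guard, and indices 8..15 (columns 40..47, outside the matrix) are skipped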

        // synchronizes to make sure that the preceding computation is done before loading two new