diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu index 8bc80ed..2acbfea 100644 --- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu +++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu @@ -122,18 +122,6 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) &tmp_w_memcpy_time); w_memcpy_time += tmp_w_memcpy_time; - /*for (int row=0; row<2; ++row){ - for (int col=0; col<2; ++col){ - printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]); - printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]); - printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]); - } - }*/ - - /*if(is_first_recursion){ - exit(0); - }*/ - // frees previously shifted points, they're now garbage free((*shifted_points)[0]); gpuErrchk( cudaFree(d_shifted_points.elements) ); @@ -328,13 +316,16 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi do { /*dimBlock.x = requested_block_size; dimBlock.y = d_new_shift.width;*/ - dimBlock.x = 2; - dimBlock.y = 2; + dimBlock.x = min(d_new_shift.width, d_new_shift.height); + dimBlock.y = min(d_new_shift.width, d_new_shift.height); dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x; - dimGrid.y = 1; + dimGrid.y = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x; + + int shared_memory_size = dimBlock.x * 2 * sizeof(double); + //Kernel <<>> (count_a, count_b); - shift_points_kernel<<>>(d_original_points, d_kernel_matrix, d_shifted_points, - d_new_shift, d_denominator, d_mean_shift_vector); + shift_points_kernel<<>>(d_original_points, + d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector); if (cudaGetLastError() != cudaSuccess){ --requested_block_size; } else { diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu index 99368b6..09dada6 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu @@ -74,11 +74,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix // each thread block computes one sub-matrix sub_new_shift of C Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE); - // shared memory used to store sub_kernel_matrix and sub_original_points respectively - __shared__ double *s_sub_kernel_matrix; - s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); - __shared__ double *s_sub_original_points; - s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); + // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points + // respectively + extern __shared__ double joined_shared_memory[]; + // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used + // for s_sub_original_points + double *s_sub_kernel_matrix = &(joined_shared_memory[0]); + double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]); // loops over all the sub-matrices of kernel_matrix and original_points that are required to // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results