Fix kernel dynamic memory allocation

7 years ago · e4fe746939
2 changed files with 15 additions and 22 deletions
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -122,18 +122,6 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
        &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;
    /*for (int row=0; row<2; ++row){
        for (int col=0; col<2; ++col){
            printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
            printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
            printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
        }
    }*/
    /*if(is_first_recursion){
        exit(0);
    }*/
    // frees previously shifted points, they're now garbage
    free((*shifted_points)[0]);
    gpuErrchk( cudaFree(d_shifted_points.elements) );
@@ -328,13 +316,16 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
    do {
        /*dimBlock.x = requested_block_size;
        dimBlock.y = d_new_shift.width;*/
-        dimBlock.x = 2;
+        dimBlock.x = min(d_new_shift.width, d_new_shift.height);
-        dimBlock.y = 2;
+        dimBlock.y = min(d_new_shift.width, d_new_shift.height);
        dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
-        dimGrid.y = 1;
+        dimGrid.y = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
        int shared_memory_size = dimBlock.x * 2 * sizeof(double);
        //Kernel <<<numBlocks, threadsPerBlock, sharedMemory>>> (count_a, count_b);
-        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+        shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
-            d_new_shift, d_denominator, d_mean_shift_vector);
+            d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
        if (cudaGetLastError() != cudaSuccess){
            --requested_block_size;
        } else {
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -74,11 +74,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    // each thread block computes one sub-matrix sub_new_shift of C
    Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
-    // shared memory used to store sub_kernel_matrix and sub_original_points respectively
+    // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
-    __shared__ double *s_sub_kernel_matrix;
+    // respectively
-    s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
+    extern __shared__ double joined_shared_memory[];
-    __shared__ double *s_sub_original_points;
+    // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
-    s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
+    // for s_sub_original_points
    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
    // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results