Fix shared memory version for odd number of points

8 years ago · 736b71fded
3 changed files with 15 additions and 16 deletions
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@@ -5,10 +5,10 @@
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"

-int DEVIATION = 31000;
-int NUMBER_OF_POINTS = 5000;
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 587;
 int DIMENSIONS = 2;
-char* POINTS_FILENAME = "../data/s4";
+const char *POINTS_FILENAME = "../data/X.bin";
 const char *LABELS_FILENAME = "../data/L.bin";
 parameters params;

--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -40,6 +40,9 @@ void set_GPU(){
    }
    // sets the device
    gpuErrchk( cudaSetDevice(max_device) );
+    // lastly sets shared memory bank size to 8 bytes since data are represented by doubles
+    gpuErrchk( cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) );
+
    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
@@ -324,8 +327,8 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi

        // size for kernel's dynamically allocated array
        // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double
-        // the arrays nedded in kernel are two
-        shared_memory_size = dimBlock.x * dimBlock.x * sizeof(double) * 2;
+        // the arrays needed in kernel are two
+        shared_memory_size = (int)(dimBlock.x * dimBlock.x * sizeof(double) * 2);

        shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
            d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -55,7 +55,7 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){

 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
-    int BLOCK_SIZE = blockDim.y;
+    int BLOCK_SIZE = blockDim.x;
    int block_row = blockIdx.x;
    int block_col = blockIdx.y;

@@ -67,8 +67,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    int col = threadIdx.y;

    // performs calculations only if thread's indexes are within matrix bounds
-    if ((BLOCK_SIZE * block_row + row) >= new_shift.height ||
-        (BLOCK_SIZE * block_col + col) >= new_shift.width){
+    if ((BLOCK_SIZE * block_row + row) > new_shift.height ||
+        (BLOCK_SIZE * block_col + col) > new_shift.width){
        return;
    }

@@ -78,17 +78,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
    // respectively
    extern __shared__ double joined_shared_memory[];
+    // makes sure enough memory has been allocated
+    joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0;

    // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
    // for s_sub_original_points
-    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
-    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
-
-    // cancel execution if allocation failed
-    if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){
-        __threadfence();
-        asm("trap;");
-    }
+    double *s_sub_kernel_matrix = (double *)joined_shared_memory;
+    double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE];

    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
    // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results