From 64c24ec313d548111126920920dc7b00a1e61a55 Mon Sep 17 00:00:00 2001
From: Apostolof <apotwohd@gmail.com>
Date: Sun, 28 Jan 2018 16:02:13 +0200
Subject: [PATCH] Fix shared memory implementation

---
 mean_shift_cuda_shared_mem/meanshift.cu       |  2 +-
 .../meanshift_gpu_utils.cu                    | 16 +++++++---
 .../meanshift_kernels.cu                      | 31 +++++++------------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/mean_shift_cuda_shared_mem/meanshift.cu b/mean_shift_cuda_shared_mem/meanshift.cu
index 37b971a..3b33b84 100644
--- a/mean_shift_cuda_shared_mem/meanshift.cu
+++ b/mean_shift_cuda_shared_mem/meanshift.cu
@@ -21,7 +21,7 @@ int main(int argc, char **argv){
     char *labels;
 
     params.epsilon = 0.0001;
-    params.verbose = false;
+    params.verbose = true;
     params.display = true;
     //get_args(argc, argv, &params); //commented out while in development
     init(&vectors, &labels);
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
index 7f18758..8bc80ed 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -113,6 +113,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
     // allocates corresponding memory in device
     d_new_shift.width = DIMENSIONS;
     d_new_shift.height = NUMBER_OF_POINTS;
+    d_new_shift.stride = d_new_shift.width;
     size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
 
@@ -121,17 +122,17 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
         &tmp_w_memcpy_time);
     w_memcpy_time += tmp_w_memcpy_time;
 
-    for (int row=0; row<2; ++row){
+    /*for (int row=0; row<2; ++row){
         for (int col=0; col<2; ++col){
             printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
             printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
             printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
         }
-    }
+    }*/
 
-    if(is_first_recursion){
+    /*if(is_first_recursion){
         exit(0);
-    }
+    }*/
 
     // frees previously shifted points, they're now garbage
     free((*shifted_points)[0]);
@@ -195,6 +196,7 @@ void init_device_memory(double **original_points, double **shifted_points,
     // allocates memory for original_points in GPU and copies the array
     d_original_points->width = DIMENSIONS;
     d_original_points->height = NUMBER_OF_POINTS;
+    d_original_points->stride = d_original_points->width;
     size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
     gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
@@ -203,6 +205,7 @@ void init_device_memory(double **original_points, double **shifted_points,
     // allocates memory for shifted_points in GPU and copies the array
     d_shifted_points->width = DIMENSIONS;
     d_shifted_points->height = NUMBER_OF_POINTS;
+    d_shifted_points->stride = d_shifted_points->width;
     size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
     gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
@@ -211,18 +214,21 @@ void init_device_memory(double **original_points, double **shifted_points,
     // allocates memory for kernel_matrix in GPU
     d_kernel_matrix->width = NUMBER_OF_POINTS;
     d_kernel_matrix->height = NUMBER_OF_POINTS;
+    d_kernel_matrix->stride = d_kernel_matrix->width;
     size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
 
     // allocates memory for denominator in GPU
     d_denominator->width = 1;
     d_denominator->height = NUMBER_OF_POINTS;
+    d_denominator->stride = d_denominator->width;
     size = NUMBER_OF_POINTS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
 
     // allocates memory for mean_shift_vector in GPU
     d_mean_shift_vector->width = DIMENSIONS;
     d_mean_shift_vector->height = NUMBER_OF_POINTS;
+    d_mean_shift_vector->stride = d_mean_shift_vector->width;
     size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
     gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
 }
@@ -324,7 +330,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
         dimBlock.y = d_new_shift.width;*/
         dimBlock.x = 2;
         dimBlock.y = 2;
-        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
         dimGrid.y = 1;
 
         shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index f8078fb..99368b6 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -66,15 +66,12 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int row = threadIdx.x;
     int col = threadIdx.y;
 
-        // performs calculations only if thread's indexes are within matrix bounds
-    //if (row * new_shift.width + col >= new_shift.width * new_shift.height){
-    /*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >=
-        new_shift.width * new_shift.height){*/
+    // performs calculations only if thread's indexes are within matrix bounds
     if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
         return;
     }
 
-    // Each thread block computes one sub-matrix sub_new_shift of C
+    // each thread block computes one sub-matrix sub_new_shift of C
     Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
 
     // shared memory used to store sub_kernel_matrix and sub_original_points respectively
@@ -84,7 +81,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
 
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
-    //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
+    // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
     for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) {
 
         // gets sub-matrix sub_kernel_matrix of kernel_matrix
@@ -104,8 +101,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
 
         // multiplies sub_kernel_matrix and sub_original_points
         for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
-            cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] *
-                s_sub_original_points[element_index * sub_original_points.stride + col];
+            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
+                s_sub_original_points[element_index * BLOCK_SIZE + col];
         }
 
         // synchronizes to make sure that the preceding computation is done before loading two new
@@ -114,21 +111,15 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     }
 
     // new_shift elements are calculated by dividing with the denominator
-    int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride;
-    int cell_col = block_col * BLOCK_SIZE + col;
-    //sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row];
     sub_new_shift.elements[row * sub_new_shift.stride + col] =
         cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
 
-    // calculates mean-shift vector
-    /*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride
-        + (block_col * BLOCK_SIZE + col)] =
-        sub_new_shift.elements[row * sub_new_shift.stride + col] -
-        shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride
-        + (block_col * BLOCK_SIZE + col)];*/
 
-    /*free(s_sub_kernel_matrix);
-    free(s_sub_original_points);*/
+    int cell_row = block_row * BLOCK_SIZE + row;
+    int cell_col = block_col * BLOCK_SIZE + col;
+    mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] =
+        sub_new_shift.elements[row * sub_new_shift.stride + col] -
+        shifted_points.elements[cell_row * shifted_points.stride + cell_col];
 }
 
 // Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
@@ -138,7 +129,7 @@ __device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
     Matrix Asub;
     Asub.width = BLOCK_SIZE;
     Asub.height = BLOCK_SIZE;
-    Asub.stride = BLOCK_SIZE;
+    Asub.stride = A.stride;
     Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
     return Asub;
 }
\ No newline at end of file