From 9779116930e2f9f9c9000500bcaf353ca554d2d5 Mon Sep 17 00:00:00 2001
From: Apostolof <apotwohd@gmail.com>
Date: Thu, 1 Feb 2018 02:16:07 +0200
Subject: [PATCH] Add asymmetric block sizes, occupancy improvement

---
 .../meanshift_gpu_utils.cu                    |  9 ++--
 .../meanshift_kernels.cu                      | 51 ++++++++++---------
 .../meanshift_kernels.h                       |  3 +-
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
index 6566df4..6e6437d 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -320,15 +320,18 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
     dim3 dimBlock;
     dim3 dimGrid;
     do {
-        dimBlock.x = requested_block_size;
-        dimBlock.y = requested_block_size;
+        /*dimBlock.x = requested_block_size;
+        dimBlock.y = requested_block_size;*/
+        //to do
+        dimBlock.x = 240;
+        dimBlock.y = 2;
         dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
         dimGrid.y = (d_new_shift.width + dimBlock.y - 1) / dimBlock.y;
 
         // size for kernel's dynamically allocated array
         // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double
         // the arrays needed in kernel are two
-        shared_memory_size = (int)(dimBlock.x * dimBlock.x * sizeof(double) * 2);
+        shared_memory_size = (int)(dimBlock.x * dimBlock.y * sizeof(double) * 2);
 
         shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
             d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index 9c2d948..2181327 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -55,7 +55,8 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
 
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
     Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
-    int BLOCK_SIZE = blockDim.x;
+    int ROW_BLOCK_SIZE = blockDim.x;
+    int COLUMN_BLOCK_SIZE = blockDim.y;
     int block_row = blockIdx.x;
     int block_col = blockIdx.y;
 
@@ -67,52 +68,56 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int col = threadIdx.y;
 
     // performs calculations only if thread's indexes are within matrix bounds
-    if ((BLOCK_SIZE * block_row + row) > new_shift.height ||
-        (BLOCK_SIZE * block_col + col) > new_shift.width){
+    if ((ROW_BLOCK_SIZE * block_row + row) > new_shift.height ||
+        (COLUMN_BLOCK_SIZE * block_col + col) > new_shift.width){
         return;
     }
 
     // each thread block computes one sub-matrix sub_new_shift of C
-    Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, BLOCK_SIZE);
+    Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, ROW_BLOCK_SIZE,
+        COLUMN_BLOCK_SIZE);
 
     // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
     // respectively
     extern __shared__ double joined_shared_memory[];
     // makes sure enough memory has been allocated
-    joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0;
+    joined_shared_memory[ROW_BLOCK_SIZE * COLUMN_BLOCK_SIZE * 2 - 1] = 0;
 
     // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
     // for s_sub_original_points
     double *s_sub_kernel_matrix = (double *)joined_shared_memory;
-    double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE];
+    double *s_sub_original_points = (double *)&joined_shared_memory[ROW_BLOCK_SIZE *
+        COLUMN_BLOCK_SIZE];
 
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
     for (int sub_matrix_index = 0;
-            sub_matrix_index < ((kernel_matrix.width + BLOCK_SIZE - 1) / BLOCK_SIZE);
+            sub_matrix_index < ((kernel_matrix.width + COLUMN_BLOCK_SIZE - 1) / COLUMN_BLOCK_SIZE);
             ++sub_matrix_index) {
 
         // gets sub-matrix sub_kernel_matrix of kernel_matrix
-        Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE);
+        Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index,
+            ROW_BLOCK_SIZE, COLUMN_BLOCK_SIZE);
         // gets sub-matrix sub_original_points of original_points
-        Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE);
+        Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col,
+            COLUMN_BLOCK_SIZE, ROW_BLOCK_SIZE);
 
         // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared
-        //memory, each thread loads one element of each sub-matrix
-        s_sub_kernel_matrix[row * BLOCK_SIZE + col] =
+        // memory, each thread loads one element of each sub-matrix
+        s_sub_kernel_matrix[row * COLUMN_BLOCK_SIZE + col] =
             sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col];
-        s_sub_original_points[row * BLOCK_SIZE + col] =
+        s_sub_original_points[row * COLUMN_BLOCK_SIZE + col] =
             sub_original_points.elements[row * sub_original_points.stride + col];
 
         // synchronizes to make sure the sub-matrices are loaded before starting the computation
         __syncthreads();
 
-        int curr_col_begin = sub_matrix_index * BLOCK_SIZE;
+        int curr_col_begin = sub_matrix_index * COLUMN_BLOCK_SIZE;
         // multiplies sub_kernel_matrix and sub_original_points
-        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
+        for (int element_index = 0; element_index < COLUMN_BLOCK_SIZE; ++element_index){
             if (curr_col_begin + element_index < kernel_matrix.width){
-                cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
-                    s_sub_original_points[element_index * BLOCK_SIZE + col];
+                cell_value += s_sub_kernel_matrix[row * COLUMN_BLOCK_SIZE + element_index] *
+                    s_sub_original_points[element_index * COLUMN_BLOCK_SIZE + col];
             }
         }
 
@@ -123,10 +128,10 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
 
     // new_shift elements are calculated by dividing with the denominator
     sub_new_shift.elements[row * sub_new_shift.stride + col] =
-        cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
+        cell_value / denominator.elements[block_row * ROW_BLOCK_SIZE + row];
 
-    int cell_row = block_row * BLOCK_SIZE + row;
-    int cell_col = block_col * BLOCK_SIZE + col;
+    int cell_row = block_row * ROW_BLOCK_SIZE + row;
+    int cell_col = block_col * COLUMN_BLOCK_SIZE + col;
     mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] =
         sub_new_shift.elements[row * sub_new_shift.stride + col] -
         shifted_points.elements[cell_row * shifted_points.stride + cell_col];
@@ -134,11 +139,11 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
 
 // gets the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is located col sub-matrices to the right
 // and row sub-matrices down from the upper-left corner of A
-__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE){
+__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE, int COLUMN_BLOCK_SIZE){
     Matrix Asub;
-    Asub.width = BLOCK_SIZE;
-    Asub.height = BLOCK_SIZE;
+    Asub.width = COLUMN_BLOCK_SIZE;
+    Asub.height = ROW_BLOCK_SIZE;
     Asub.stride = A.stride;
-    Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
+    Asub.elements = &(A.elements[A.stride * ROW_BLOCK_SIZE * row + COLUMN_BLOCK_SIZE * col]);
     return Asub;
 }
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.h b/mean_shift_cuda_shared_mem/meanshift_kernels.h
index 16e9f95..db93809 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.h
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h
@@ -24,6 +24,7 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
     Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
 
-__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE);
+__device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE,
+    int COLUMN_BLOCK_SIZE);
 
 #endif //SERIAL_KERNELS_H
\ No newline at end of file