Fix shared memory implementation

9 years ago · 64c24ec313
3 changed files with 23 additions and 26 deletions
--- a/mean_shift_cuda_shared_mem/meanshift.cu
+++ b/mean_shift_cuda_shared_mem/meanshift.cu
@ -21,7 +21,7 @@ int main(int argc, char **argv){
    char *labels;
    params.epsilon = 0.0001;
-    params.verbose = false;
+    params.verbose = true;
    params.display = true;
    //get_args(argc, argv, &params); //commented out while in development
    init(&vectors, &labels);
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@ -113,6 +113,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
    // allocates corresponding memory in device
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    d_new_shift.stride = d_new_shift.width;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
@ -121,17 +122,17 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
        &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;
-    for (int row=0; row<2; ++row){
+    /*for (int row=0; row<2; ++row){
        for (int col=0; col<2; ++col){
            printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
            printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
            printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
        }
-    }
+    }*/
-    if(is_first_recursion){
+    /*if(is_first_recursion){
        exit(0);
-    }
+    }*/
    // frees previously shifted points, they're now garbage
    free((*shifted_points)[0]);
@ -195,6 +196,7 @@ void init_device_memory(double **original_points, double **shifted_points,
    // allocates memory for original_points in GPU and copies the array
    d_original_points->width = DIMENSIONS;
    d_original_points->height = NUMBER_OF_POINTS;
    d_original_points->stride = d_original_points->width;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
@ -203,6 +205,7 @@ void init_device_memory(double **original_points, double **shifted_points,
    // allocates memory for shifted_points in GPU and copies the array
    d_shifted_points->width = DIMENSIONS;
    d_shifted_points->height = NUMBER_OF_POINTS;
    d_shifted_points->stride = d_shifted_points->width;
    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
@ -211,18 +214,21 @@ void init_device_memory(double **original_points, double **shifted_points,
    // allocates memory for kernel_matrix in GPU
    d_kernel_matrix->width = NUMBER_OF_POINTS;
    d_kernel_matrix->height = NUMBER_OF_POINTS;
    d_kernel_matrix->stride = d_kernel_matrix->width;
    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
    // allocates memory for denominator in GPU
    d_denominator->width = 1;
    d_denominator->height = NUMBER_OF_POINTS;
    d_denominator->stride = d_denominator->width;
    size = NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
    // allocates memory for mean_shift_vector in GPU
    d_mean_shift_vector->width = DIMENSIONS;
    d_mean_shift_vector->height = NUMBER_OF_POINTS;
    d_mean_shift_vector->stride = d_mean_shift_vector->width;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
 }
@ -324,7 +330,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
        dimBlock.y = d_new_shift.width;*/
        dimBlock.x = 2;
        dimBlock.y = 2;
-        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;
        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@ -67,14 +67,11 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    int col = threadIdx.y;
    // performs calculations only if thread's indexes are within matrix bounds
    //if (row * new_shift.width + col >= new_shift.width * new_shift.height){
    /*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >=
        new_shift.width * new_shift.height){*/
    if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
        return;
    }
-    // Each thread block computes one sub-matrix sub_new_shift of C
+    // each thread block computes one sub-matrix sub_new_shift of C
    Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);
    // shared memory used to store sub_kernel_matrix and sub_original_points respectively
@ -84,7 +81,7 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
-    //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
+    // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
    for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) {
        // gets sub-matrix sub_kernel_matrix of kernel_matrix
@ -104,8 +101,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
        // multiplies sub_kernel_matrix and sub_original_points
        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
-            cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] *
+            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
-                s_sub_original_points[element_index * sub_original_points.stride + col];
+                s_sub_original_points[element_index * BLOCK_SIZE + col];
        }
        // synchronizes to make sure that the preceding computation is done before loading two new
@ -114,21 +111,15 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
    }
    // new_shift elements are calculated by dividing with the denominator
    int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride;
    int cell_col = block_col * BLOCK_SIZE + col;
    //sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row];
    sub_new_shift.elements[row * sub_new_shift.stride + col] =
        cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
    // calculates mean-shift vector
    /*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride
        + (block_col * BLOCK_SIZE + col)] =
        sub_new_shift.elements[row * sub_new_shift.stride + col] -
        shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride
        + (block_col * BLOCK_SIZE + col)];*/
-    /*free(s_sub_kernel_matrix);
+    int cell_row = block_row * BLOCK_SIZE + row;
-    free(s_sub_original_points);*/
+    int cell_col = block_col * BLOCK_SIZE + col;
    mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] =
        sub_new_shift.elements[row * sub_new_shift.stride + col] -
        shifted_points.elements[cell_row * shifted_points.stride + cell_col];
 }
 // Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
@ -138,7 +129,7 @@ __device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
-    Asub.stride = BLOCK_SIZE;
+    Asub.stride = A.stride;
    Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
    return Asub;
 }