Add calculate_kernel_matrix_kernel

9 years ago · 2ffa434bdc
5 changed files with 129 additions and 34 deletions
--- a/mean_shift_cuda/meanshift
+++ b/mean_shift_cuda/meanshift
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@ -16,3 +16,34 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
        output.elements[row * output.width + col] = cell_value;
    }
 }
+
+// Fills kernel_matrix with Gaussian kernel weights. Each thread calculates one element of
+// kernel_matrix: thread (row, col) compares row `row` of shifted_points against row `col`
+// of original_points.
+//
+// Parameters:
+//   shifted_points  - row-major matrix, one point per row, shifted_points.width coordinates
+//                     per point
+//   original_points - matrix read with the same row stride (shifted_points.width)
+//   deviation       - kernel bandwidth; non-zero weights are exp(-distance^2 / (2 * deviation^2))
+//   kernel_matrix   - output matrix of kernel_matrix.height x kernel_matrix.width, row-major
+//
+// Launch layout: 2D launch where the x axis indexes rows and the y axis indexes columns of
+// kernel_matrix. The grid may overshoot the matrix; surplus threads return immediately.
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
+    , double deviation, Matrix kernel_matrix){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    // Guard each axis separately. Testing only the flattened index would let a thread with
+    // col >= width alias into a following row (and read past original_points), and would
+    // let the index equal to width * height write one element past the buffer.
+    if (row >= kernel_matrix.height || col >= kernel_matrix.width){
+        return;
+    }
+
+    int dimensions = shifted_points.width;
+
+    // Euclidean distance between shifted point `row` and original point `col`
+    double sum = 0;
+    for (int i=0; i<dimensions; i++){
+        double dif = shifted_points.elements[row * dimensions + i]
+            - original_points.elements[col * dimensions + i];
+        sum += dif * dif;
+    }
+    double distance = sqrt(sum);
+
+    // NOTE(review): the cut-off compares a distance against the *squared* deviation. This
+    // mirrors the host loop this kernel replaces, so it is kept as is - confirm it is intended.
+    double deviation_square = deviation*deviation;
+    double weight = 0;
+    if (distance < deviation_square){
+        // computes kernel matrix
+        double exponent = ((-1)*(distance * distance))/(2*(deviation_square));
+        weight = exp(exponent);
+    }
+    // Diagonal elements get an extra 1 added to their weight
+    if (row == col){
+        weight += 1;
+    }
+
+    // Single store of the final value instead of a write followed by a read-modify-write
+    size_t index = (size_t)row * kernel_matrix.width + col;
+    kernel_matrix.elements[index] = weight;
+}
--- a/mean_shift_cuda/meanshift_kernels.h
+++ b/mean_shift_cuda/meanshift_kernels.h
@ -10,4 +10,7 @@ typedef struct{
 //Function multiply_kernel calculates the product of matrices 1 and 2 into output.
 __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);

+//Function calculate_kernel_matrix_kernel calculates one element of kernel_matrix per thread
+//from the rows of shifted_points and original_points and the given deviation.
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
+    , double deviation, Matrix kernel_matrix);
+
 #endif //SERIAL_KERNELS_H
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@ -8,7 +8,6 @@
 #include "meanshift_kernels.h"

 #define OUTPUT_PREFIX "../output/output_"
-int BLOCK_SIZE = 16;

 cudaDeviceProp device_properties;

@ -151,14 +150,13 @@ void set_Gpu(){
    }
    // sets the device
    gpuErrchk( cudaSetDevice(max_device) );
-    BLOCK_SIZE = device_properties.maxThreadsPerBlock;
    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
-            "Setting BLOCK_SIZE to max threads per block supported (%d)\n\n"
+            "Max threads per block supported are %d\n\n"
            , device_properties.name
            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
-            , BLOCK_SIZE);
+            , device_properties.maxThreadsPerBlock);
    }
 }

@ -188,23 +186,11 @@ int meanshift(double **original_points, double ***shifted_points, int deviation

    // finds pairwise distance matrix (inside radius)
    // [I, D] = rangesearch(x,y,h);
+    calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
+    // calculate denominator
    for (int i=0; i<NUMBER_OF_POINTS; i++){
        double sum = 0;
        for (int j=0; j<NUMBER_OF_POINTS; j++){
-            double distance = calculateDistance((*shifted_points)[i]
-                , original_points[j]);
-
-            double deviation_square = deviation*deviation;
-            if (distance < deviation_square){
-                // computes kernel matrix
-                double pow = ((-1)*(distance * distance))/(2*(deviation_square));
-                kernel_matrix[i][j] = exp(pow);
-            } else {
-                kernel_matrix[i][j] = 0;
-            }
-            if (i == j){
-                kernel_matrix[i][j] += 1;
-            }
            sum = sum + kernel_matrix[i][j];
        }
        denominator[i] = sum;
@ -271,8 +257,78 @@ double norm(double **matrix, int rows, int cols){
    return norm;
 }

+// Computes the kernel matrix on the GPU: uploads shifted_points and original_points,
+// launches calculate_kernel_matrix_kernel and copies the NUMBER_OF_POINTS x NUMBER_OF_POINTS
+// result back into *kernel_matrix.
+//
+// Parameters:
+//   shifted_points  - host matrix of NUMBER_OF_POINTS rows by DIMENSIONS columns
+//   original_points - host matrix of NUMBER_OF_POINTS rows by DIMENSIONS columns
+//   deviation       - forwarded unchanged to the kernel
+//   kernel_matrix   - pointer to the host matrix that receives the result
+//
+// NOTE(review): every transfer starts at &matrix[0][0] and spans the whole matrix, which
+// assumes each host matrix is backed by one contiguous allocation - confirm with the allocator.
+//
+// Every CUDA failure is reported through gpuErrchk, which prints the error and exits.
+void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
+    , double ***kernel_matrix){
+    static bool first_iter = true;
+
+    // Byte counts are kept in size_t so that large point sets do not overflow an int
+    size_t points_size = (size_t)(NUMBER_OF_POINTS) * (size_t)(DIMENSIONS) * sizeof(double);
+    size_t kernel_matrix_size = (size_t)(NUMBER_OF_POINTS) * (size_t)(NUMBER_OF_POINTS)
+        * sizeof(double);
+
+    // allocates memory for shifted_points in GPU and copies the array
+    Matrix d_shifted_points;
+    d_shifted_points.width = DIMENSIONS;
+    d_shifted_points.height = NUMBER_OF_POINTS;
+    gpuErrchk( cudaMalloc(&d_shifted_points.elements, points_size) );
+    gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
+        , points_size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for original_points in GPU and copies the array
+    Matrix d_original_points;
+    d_original_points.width = DIMENSIONS;
+    d_original_points.height = NUMBER_OF_POINTS;
+    gpuErrchk( cudaMalloc(&d_original_points.elements, points_size) );
+    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
+        , points_size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for kernel_matrix in GPU
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_matrix_size) );
+
+    // Start from the largest square block the device reports. Clamp to 1 so that the grid
+    // computation below never divides by zero.
+    int requested_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
+    if (requested_block_size < 1){
+        requested_block_size = 1;
+    }
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = requested_block_size;
+        // x covers the rows and y the columns of kernel_matrix (ceil-division)
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
+
+        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
+            , deviation, d_kernel_matrix);
+        cudaError_t launch_status = cudaGetLastError();
+        if (launch_status == cudaSuccess){
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        } else if (requested_block_size > 1){
+            // launch was rejected, retry with a smaller block
+            --requested_block_size;
+        } else {
+            // Even a 1x1 block failed, so block size is not the problem. Report the error
+            // instead of shrinking to 0 (division by zero) or retrying forever.
+            gpuErrchk( launch_status );
+        }
+    } while(block_size_too_big);
+    
+    if (first_iter && params.verbose){
+        printf("calculate_kernel_matrix_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    // copies the result back to the host
+    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
+        , kernel_matrix_size, cudaMemcpyDeviceToHost) );
+
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+}
+
+
 void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
-    static bool firstIter = true;
+    static bool first_iter = true;

    // allocates memory for kernel_matrix in GPU and copies the array
    Matrix d_kernel_matrix;
@ -299,15 +355,18 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );

-    dim3 dimBlock((d_new_shift.height < sqrt(BLOCK_SIZE)) ? d_new_shift.height : sqrt(BLOCK_SIZE)
-        , (d_new_shift.width < sqrt(BLOCK_SIZE)) ? d_new_shift.width : sqrt(BLOCK_SIZE));
+    // get max sizes supported from the device
+    int max_block_size = device_properties.maxThreadsPerBlock;
+    dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
+        , (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);

-    if (firstIter && params.verbose){
+    if (first_iter && params.verbose){
+        printf("multiply_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-        firstIter = false;
+        first_iter = false;
    }

    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@ -7,8 +7,7 @@
 //https://stackoverflow.com/a/14038590
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
-   if (code != cudaSuccess) 
-   {
+   if (code != cudaSuccess){
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
@ -47,6 +46,9 @@ int meanshift(double **original_points, double ***shifted_points, int h
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);

+//Function calculate_kernel_matrix allocates memory in GPU, sends the data, calls the
+//calculate_kernel_matrix_kernel function and copies the result into kernel_matrix.
+void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
+    , double ***kernel_matrix);
+
 //Function multiply allocates memory in GPU, sends the data and calls the 
 //multiply kernel function.
 void multiply(double **kernel_matrix, double **original_points