2D memory alloc, norm refactored as __device__ function

8 years ago · 26869bc13a
1 changed files with 96 additions and 17 deletions
--- a/meanshift.c
+++ b/meanshift.c
@ -1,8 +1,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <math.h>
 #include <float.h>
 #include "serial_declarations.h"
 #define N       512
 int NUMBER_OF_POINTS = 600;
 int DIMENSIONS = 2;
@ -15,6 +18,19 @@ double seq_time;
 int meanshift(double **original_points, double ***shifted_points, int h
        , parameters *opt, int iteration);
 __device__ double norm(double **matrix, int rows, int cols){
    double sum=0, temp_mul=0;
    for (int i=0; i<rows; i++) {
        for (int j=0; j<cols; j++) {
            temp_mul = matrix[i][j] * matrix[i][j];
            sum = sum + temp_mul;
        }
    }
    double norm = sqrt(sum);
    return norm;
 }
 int main(int argc, char **argv){
    int h = 1;
@ -74,6 +90,8 @@ int main(int argc, char **argv){
    //save_matrix(shifted_points, iterations);
 }
 int meanshift(double **original_points, double ***shifted_points, int h
        , parameters *opt, int iteration){
@ -102,33 +120,40 @@ int meanshift(double **original_points, double ***shifted_points, int h
    double * d_kernel_matrix;
-    cudaMalloc(&d_kernel_matrix, NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double));
+    size_t pitch_kernel_matrix;
    cudaMallocPitch(&d_kernel_matrix, &pitch_kernel_matrix,
                    NUMBER_OF_POINTS * sizeof(double), NUMBER_OF_POINTS);
    double * d_denominator;
    cudaMalloc(&d_denominator, NUMBER_OF_POINTS * sizeof(double));
    double * d_new_shift;
-    cudaMalloc(&d_new_shift, NUMBER_OF_POINTS * DIMENSIONS * sizeof(double));
+    size_t pitch_new_shift;
    cudaMallocPitch(&d_new_shift, &pitch_new_shift,
                    NUMBER_OF_POINTS * sizeof(double), DIMENSIONS);
 //    (*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
 //    duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
    double * d_shifted_points;
-    cudaMalloc(&d_shifted_points, NUMBER_OF_POINTS * DIMENSIONS * sizeof(double));
+    size_t pitch_shifted_points;
    cudaMallocPitch(&d_shifted_points, &pitch_shifted_points,
                    NUMBER_OF_POINTS * sizeof(double), DIMENSIONS);
 //    double **mean_shift_vector;
 //    mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
    double * d_mean_shift_vector;
-    cudaMalloc(&d_mean_shift_vector, NUMBER_OF_POINTS * DIMENSIONS * sizeof(double));
+    size_t pitch_mean_shift_vector;
    cudaMallocPitch(&d_mean_shift_vector, &pitch_mean_shift_vector,
                    NUMBER_OF_POINTS * sizeof(double), DIMENSIONS);
-    //Copy vectors from host memory to device memory
+    cudaMemcpy2D(d_shifted_points, NUMBER_OF_POINTS * sizeof(double), *shifted_points,
-    cudaMemcpy(d_shifted_points, *shifted_points, NUMBER_OF_POINTS * DIMENSIONS * sizeof(double),
+                 pitch_shifted_points, NUMBER_OF_POINTS * sizeof(double),
-               cudaMemcpyHostToDevice);
+                 DIMENSIONS, cudaMemcpyHostToDevice);
-    // y[i][j] == d_y[COLUMNS*i + j]
+
-    cudaMemcpy(d_mean_shift_vector, mean_shift_vector, NUMBER_OF_POINTS * DIMENSIONS * sizeof(double),
+    cudaMemcpy2D(d_mean_shift_vector, NUMBER_OF_POINTS * sizeof(double), *mean_shift_vector,
-               cudaMemcpyHostToDevice);
+                 pitch_mean_shift_vector, NUMBER_OF_POINTS * sizeof(double),
                 DIMENSIONS, cudaMemcpyHostToDevice);
    // TODO REFACTOR AS A KERNEL
-    // find pairwise distance matrix (inside radius)
+
    // [I, D] = rangesearch(x,y,h);
    for (int i=0; i<NUMBER_OF_POINTS; i++){
        double sum = 0;
        for (int j=0; j<NUMBER_OF_POINTS; j++){
@ -191,4 +216,58 @@ int meanshift(double **original_points, double ***shifted_points, int h
    }
    return iteration;
-}
+}
 /**
 __global__ int iteration(double * kernel_matrix, double * denominator,
            double * new_shift, double *shifted_points, double mean_shift_vector,
            int NUMBER_OF_POINTS, int DIMENSIONS, int h){
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    for (i = 0; i < NUMBER_OF_POINTS; i++) {
        double sum = 0;
        for (int j = 0; j < NUMBER_OF_POINTS; j++) {
            double dist_sum = 0;
            for (int p = 0; p < DIMENSIONS; p++) {
                double dif = ((*shifted_points)[i])[p] - (original_points[j])[p];
                dist_sum += dif * dif;
            }
            double dist = sqrt(dist_sum);
            if (dist < h * h) {
                kernel_matrix[i][j] = dist * dist;
                // compute kernel matrix
                double pow = ((-1) * (kernel_matrix[i][j])) / (2 * (h * h));
                kernel_matrix[i][j] = exp(pow);
            } else {
                kernel_matrix[i][j] = 0;
            }
            if (i == j) {
                kernel_matrix[i][j] += 1;
            }
            sum = sum + kernel_matrix[i][j];
        }
        denominator[i] = sum;
        // build nominator
        for (int j = 0; j < DIMENSIONS; j++) {
            new_shift[i][j] = 0;
            for (int k = 0; k < NUMBER_OF_POINTS; k++) {
                new_shift[i][j] += kernel_matrix[i][k] * original_points[k][j];
            }
            // divide element-wise
            new_shift[i][j] = new_shift[i][j] / denominator[i];
            // calculate mean-shift vector at the same time
            mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
        }
    }
    // frees previously shifted points, they're now garbage
    free((*shifted_points)[0]);
    // updates shifted points pointer to the new array address
    shifted_points = &new_shift;
 }
 */