Fix multiply, Add error checking for CUDA calls

7 years ago · dd81bf36bb
5 changed files with 72 additions and 47 deletions
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@ -5,7 +5,8 @@ SHELL := /bin/bash

 CC = /usr/local/cuda/bin/nvcc
 RM = rm -f
-CFLAGS= -arch=sm_21 -lm -O0 -I.
+HOST_COMPILER = -ccbin gcc
+CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets
 OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
 DEPS = meanshift_utils.h meanshift_kernels.h

@ -22,7 +23,7 @@ all: $(EXECUTABLES)
 # DEPENDENCIES (HEADERS)

 %.o: %.cu $(DEPS)
-	$(CC) -x cu $(CFLAGS) -dc $< -o $@
+	$(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@

 .PRECIOUS: $(EXECUTABLES) $(OBJ)

@ -30,7 +31,7 @@ all: $(EXECUTABLES)
 # EXECUTABLE (MAIN)

 $(EXECUTABLES): $(OBJ)
-	$(CC) $(CFLAGS) $(OBJ) -o $@
+	$(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@

 clean:
 	$(RM) *.o *~ $(EXECUTABLES)
--- a/mean_shift_cuda/meanshift
+++ b/mean_shift_cuda/meanshift
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@ -5,15 +5,17 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
    // Each thread computes one element of output
    // by accumulating results into cell_value
    double cell_value = 0;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    // the x launch dimension walks the rows of output, y walks its columns
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;

+    // per-axis guard: a thread outside output in either direction must not
+    // read the inputs or write a cell
    if (row < output.height && col < output.width){
        for (int element_index = 0; element_index < matrix1.width; ++element_index){
            cell_value += matrix1.elements[row * matrix1.width + element_index]
                * matrix2.elements[element_index * matrix2.width + col];
        }
-        printf("%f\n", cell_value);
        output.elements[row * output.width + col] = cell_value;
    }
 }
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@ -8,7 +8,7 @@
 #include "meanshift_kernels.h"

 #define OUTPUT_PREFIX "../output/output_"
-#define BLOCK_SIZE 16
+#define BLOCK_SIZE 8

 void get_args(int argc, char **argv){
    if (argc != 6) {
@ -139,45 +139,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation

    // creates new y vector
    double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-//==============================================================================

    // builds nominator
-    /*multiply(kernel_matrix, original_points, new_shift);*/
-
-    Matrix d_kernel_matrix;
-    d_kernel_matrix.width = NUMBER_OF_POINTS;
-    d_kernel_matrix.height = NUMBER_OF_POINTS;
-    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    cudaMalloc(&d_kernel_matrix.elements, size);
-    cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]), size, cudaMemcpyHostToDevice);
-
-    Matrix d_original_points;
-    d_original_points.width = DIMENSIONS;
-    d_original_points.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    cudaMalloc(&d_original_points.elements, size);
-    cudaMemcpy(d_original_points.elements, &(original_points[0][0]), size, cudaMemcpyHostToDevice);
-
-    Matrix d_new_shift;
-    d_new_shift.width = DIMENSIONS;
-    d_new_shift.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    cudaMalloc(&d_new_shift.elements, size);
-
-    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
-    dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
-
-    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points
-        , d_new_shift);
-
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    cudaMemcpy(&(new_shift[0][0]), d_new_shift.elements, size, cudaMemcpyDeviceToHost);
-
-    cudaFree(d_kernel_matrix.elements);
-    cudaFree(d_original_points.elements);
-    cudaFree(d_new_shift.elements);
-
-//==============================================================================
+    multiply(kernel_matrix, original_points, &new_shift);

    // divides element-wise
    for (int i=0; i<NUMBER_OF_POINTS; i++){
@ -230,6 +194,51 @@ double norm(double **matrix, int rows, int cols){
    return norm;
 }

+// Computes new_shift = kernel_matrix x original_points on the GPU.
+//   kernel_matrix   : NUMBER_OF_POINTS x NUMBER_OF_POINTS host matrix
+//   original_points : NUMBER_OF_POINTS x DIMENSIONS host matrix
+//   new_shift       : receives the NUMBER_OF_POINTS x DIMENSIONS product
+// Each host matrix is copied as one block starting at its [0][0] element, so
+// its rows must be stored contiguously. Any CUDA failure aborts via gpuErrchk.
+void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
+    // nothing to compute (and no valid launch shape) for an empty problem
+    if (NUMBER_OF_POINTS <= 0 || DIMENSIONS <= 0) return;
+
+    // byte counts are kept in size_t: they overflow int for large point sets
+    const size_t points = NUMBER_OF_POINTS, dims = DIMENSIONS;
+    const size_t kernel_bytes = points * points * sizeof(double);
+    const size_t points_bytes = points * dims * sizeof(double);
+
+    // allocates memory for kernel_matrix in GPU and copies the array
+    Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );
+    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
+        , kernel_bytes, cudaMemcpyHostToDevice) );
+
+    // allocates memory for original_points in GPU and copies the array
+    Matrix d_original_points;
+    d_original_points.width = DIMENSIONS;
+    d_original_points.height = NUMBER_OF_POINTS;
+    gpuErrchk( cudaMalloc(&d_original_points.elements, points_bytes) );
+    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
+        , points_bytes, cudaMemcpyHostToDevice) );
+
+    // allocates memory for new_shift in GPU
+    Matrix d_new_shift;
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    gpuErrchk( cudaMalloc(&d_new_shift.elements, points_bytes) );
+
+    // Launch shape: a block spans every column (y) and as many rows (x) as
+    // evenly divide NUMBER_OF_POINTS, so the grid tiles the output exactly and
+    // every launched thread maps to a valid cell for any problem size.
+    // A shape the device cannot run (e.g. too many columns for one block) is
+    // reported by the error check that follows the launch.
+    const int max_threads_per_block = 256;
+    int rows_per_block = max_threads_per_block / DIMENSIONS;
+    if (rows_per_block < 1) rows_per_block = 1;
+    while (NUMBER_OF_POINTS % rows_per_block != 0) --rows_per_block;
+    dim3 dimBlock(rows_per_block, DIMENSIONS);
+    dim3 dimGrid(NUMBER_OF_POINTS / rows_per_block, 1);
+
+    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
+    // the first check catches a rejected launch, the second a fault raised
+    // while the kernel was running
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    // copies the product back into the caller's matrix
+    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+        , points_bytes, cudaMemcpyDeviceToHost) );
+
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_new_shift.elements) );
+}
+
 double calculateDistance(double *y, double *x){
    double sum = 0, dif;
    for (int i=0; i<DIMENSIONS; i++){
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@ -3,6 +3,17 @@

 #include <stdbool.h>

+//GPU error check snippet taken from:
+//https://stackoverflow.com/a/14038590
+//gpuErrchk wraps a CUDA runtime call: when the call returns anything other
+//than cudaSuccess it prints the error string together with the file and line
+//of the call and, unless abort is false, exits with the error code.
+//The do/while(0) wrapper makes the macro expand to a single statement, so
+//"if (x) gpuErrchk(call); else ..." parses as intended.
+#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
 extern int DEVIATION;
 extern int NUMBER_OF_POINTS;
 extern int DIMENSIONS;
@ -30,6 +41,11 @@ int meanshift(double **original_points, double ***shifted_points, int h
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);

+//Function multiply allocates memory in GPU, sends the data, calls the
+//multiply kernel function, copies the product back into new_shift and
+//frees the GPU memory.
+void multiply(double **kernel_matrix, double **original_points
+	, double ***new_shift);
+
 //Function calculateDistance returns the distance between x and y vectors.
 double calculateDistance(double *y, double *x);