Fix multiply, Add error checking for CUDA calls

8 years ago · dd81bf36bb
5 changed files with 72 additions and 47 deletions
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@ -5,9 +5,10 @@ SHELL := /bin/bash
 CC = /usr/local/cuda/bin/nvcc
 RM = rm -f
-CFLAGS= -arch=sm_21 -lm -O0 -I.
+HOST_COMPILER = -ccbin gcc
-OBJ=meanshift.o meanshift_utils.o meanshift_kernels.o
+CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets
-DEPS=meanshift_utils.h meanshift_kernels.h
+OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
 DEPS = meanshift_utils.h meanshift_kernels.h
 # ==========================================
 # TARGETS
@ -22,7 +23,7 @@ all: $(EXECUTABLES)
 # DEPENDENCIES (HEADERS)
 %.o: %.cu $(DEPS)
-	$(CC) -x cu $(CFLAGS) -dc $< -o $@
+	$(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@
 .PRECIOUS: $(EXECUTABLES) $(OBJ)
@ -30,7 +31,7 @@ all: $(EXECUTABLES)
 # EXECUTABLE (MAIN)
 $(EXECUTABLES): $(OBJ)
-	$(CC) $(CFLAGS) $(OBJ) -o $@
+	$(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@
 clean:
 	$(RM) *.o *~ $(EXECUTABLES)
--- a/mean_shift_cuda/meanshift
+++ b/mean_shift_cuda/meanshift
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@ -5,15 +5,14 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
    // Each thread computes one element of output
    // by accumulating results into cell_value
    double cell_value = 0;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    // The y dimension of the launch selects the output column and the
+    // x dimension selects the output row.
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
-    if (row < output.height && col < output.width){
+    // Each index is checked against its own extent. Comparing row + col with
+    // height * width lets threads whose row or col is out of range through,
+    // and they would then read and write outside the matrices.
+    if (row < output.height && col < output.width){
        for (int element_index = 0; element_index < matrix1.width; ++element_index){
            cell_value += matrix1.elements[row * matrix1.width + element_index]
                * matrix2.elements[element_index * matrix2.width + col];
        }
        printf("%f\n", cell_value);
        output.elements[row * output.width + col] = cell_value;
    }
 }
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@ -8,7 +8,7 @@
 #include "meanshift_kernels.h"
 #define OUTPUT_PREFIX "../output/output_"
-#define BLOCK_SIZE 16
+#define BLOCK_SIZE 8
 void get_args(int argc, char **argv){
    if (argc != 6) {
@ -139,45 +139,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
    // creates new y vector
    double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
 //==============================================================================
    // builds nominator
-    /*multiply(kernel_matrix, original_points, new_shift);*/
+    multiply(kernel_matrix, original_points, &new_shift);
    Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
    d_kernel_matrix.height = NUMBER_OF_POINTS;
    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    cudaMalloc(&d_kernel_matrix.elements, size);
    cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]), size, cudaMemcpyHostToDevice);
    Matrix d_original_points;
    d_original_points.width = DIMENSIONS;
    d_original_points.height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    cudaMalloc(&d_original_points.elements, size);
    cudaMemcpy(d_original_points.elements, &(original_points[0][0]), size, cudaMemcpyHostToDevice);
    Matrix d_new_shift;
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    cudaMalloc(&d_new_shift.elements, size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points
        , d_new_shift);
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    cudaMemcpy(&(new_shift[0][0]), d_new_shift.elements, size, cudaMemcpyDeviceToHost);
    cudaFree(d_kernel_matrix.elements);
    cudaFree(d_original_points.elements);
    cudaFree(d_new_shift.elements);
 //==============================================================================
    // divides element-wise
    for (int i=0; i<NUMBER_OF_POINTS; i++){
@ -230,6 +194,51 @@ double norm(double **matrix, int rows, int cols){
    return norm;
 }
 // Multiplies kernel_matrix (NUMBER_OF_POINTS x NUMBER_OF_POINTS) by
 // original_points (NUMBER_OF_POINTS x DIMENSIONS) on the GPU and copies the
 // NUMBER_OF_POINTS x DIMENSIONS product into *new_shift.
 //
 // Each host matrix is transferred as one flat buffer starting at &m[0][0], so
 // the rows are assumed to be stored contiguously (TODO confirm against
 // alloc_2d_double). Any CUDA failure terminates the program via gpuErrchk.
 void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
     // Byte counts are computed in size_t; int arithmetic overflows for large
     // point counts.
     const size_t points = (size_t) NUMBER_OF_POINTS;
     const size_t dims = (size_t) DIMENSIONS;
     const size_t kernel_bytes = points * points * sizeof(double);
     const size_t points_bytes = points * dims * sizeof(double);

     // allocates memory for kernel_matrix in GPU and copies the array
     Matrix d_kernel_matrix;
     d_kernel_matrix.width = NUMBER_OF_POINTS;
     d_kernel_matrix.height = NUMBER_OF_POINTS;
     gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, kernel_bytes) );
     gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
         , kernel_bytes, cudaMemcpyHostToDevice) );

     // allocates memory for original_points in GPU and copies the array
     Matrix d_original_points;
     d_original_points.width = DIMENSIONS;
     d_original_points.height = NUMBER_OF_POINTS;
     gpuErrchk( cudaMalloc(&d_original_points.elements, points_bytes) );
     gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
         , points_bytes, cudaMemcpyHostToDevice) );

     // allocates memory for new_shift in GPU
     Matrix d_new_shift;
     d_new_shift.width = DIMENSIONS;
     d_new_shift.height = NUMBER_OF_POINTS;
     gpuErrchk( cudaMalloc(&d_new_shift.elements, points_bytes) );

     // multiply_kernel takes the output row from the x dimension and the output
     // column from the y dimension, so the grid is sized by ceiling division to
     // cover NUMBER_OF_POINTS rows and DIMENSIONS columns. For 600 points in 2
     // dimensions this reproduces the 60 x 1 grid that used to be hard-coded.
     // When the sizes are not multiples of the block shape, the last blocks hold
     // surplus threads that the kernel has to reject with its own bounds check.
     const unsigned int rows_per_block = 10;
     const unsigned int cols_per_block = 2;
     dim3 dimBlock(rows_per_block, cols_per_block);
     dim3 dimGrid(((unsigned int) NUMBER_OF_POINTS + dimBlock.x - 1) / dimBlock.x
         , ((unsigned int) DIMENSIONS + dimBlock.y - 1) / dimBlock.y);

     multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
     // The first check reports an invalid launch configuration, the second one
     // reports faults raised while the kernel was running.
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );

     // copies the product back to the host
     gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
         , points_bytes, cudaMemcpyDeviceToHost) );

     gpuErrchk( cudaFree(d_kernel_matrix.elements) );
     gpuErrchk( cudaFree(d_original_points.elements) );
     gpuErrchk( cudaFree(d_new_shift.elements) );
 }
 double calculateDistance(double *y, double *x){
    double sum = 0, dif;
    for (int i=0; i<DIMENSIONS; i++){
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@ -3,6 +3,17 @@
 #include <stdbool.h>
 //GPU error check snippet taken from:
 //https://stackoverflow.com/a/14038590
 //
 //gpuErrchk wraps an expression yielding a cudaError_t and forwards it to
 //gpuAssert together with the call site's file and line. The do/while(0)
 //wrapper makes the macro a single statement, so it stays correct inside an
 //unbraced if/else.
 #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
 //gpuAssert prints the CUDA error string, file and line to stderr when code is
 //not cudaSuccess and, unless abort is false, exits with code as exit status.
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
    if (code != cudaSuccess) 
    {
       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
       if (abort) exit(code);
    }
 }
 extern int DEVIATION;
 extern int NUMBER_OF_POINTS;
 extern int DIMENSIONS;
@ -30,6 +41,11 @@ int meanshift(double **original_points, double ***shifted_points, int h
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);
 //Function multiply copies kernel_matrix and original_points to the GPU,
 //launches the multiply kernel on them and copies the resulting matrix back
 //into *new_shift. Device buffers are allocated and freed inside the call;
 //every CUDA call is checked with gpuErrchk, which exits on failure.
 void multiply(double **kernel_matrix, double **original_points
 	, double ***new_shift);
 //Function calculateDistance returns the distance between x and y vectors.
 double calculateDistance(double *y, double *x);