diff --git a/mean_shift_cuda/Makefile b/mean_shift_cuda/Makefile index a931e40..b244f3d 100644 --- a/mean_shift_cuda/Makefile +++ b/mean_shift_cuda/Makefile @@ -5,9 +5,10 @@ SHELL := /bin/bash CC = /usr/local/cuda/bin/nvcc RM = rm -f -CFLAGS= -arch=sm_21 -lm -O0 -I. -OBJ=meanshift.o meanshift_utils.o meanshift_kernels.o -DEPS=meanshift_utils.h meanshift_kernels.h +HOST_COMPILER = -ccbin gcc +CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets +OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o +DEPS = meanshift_utils.h meanshift_kernels.h # ========================================== # TARGETS @@ -22,7 +23,7 @@ all: $(EXECUTABLES) # DEPENDENCIES (HEADERS) %.o: %.cu $(DEPS) - $(CC) -x cu $(CFLAGS) -dc $< -o $@ + $(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@ .PRECIOUS: $(EXECUTABLES) $(OBJ) @@ -30,7 +31,7 @@ all: $(EXECUTABLES) # EXECUTABLE (MAIN) $(EXECUTABLES): $(OBJ) - $(CC) $(CFLAGS) $(OBJ) -o $@ + $(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@ clean: $(RM) *.o *~ $(EXECUTABLES) \ No newline at end of file diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift new file mode 100644 index 0000000..44369bc Binary files /dev/null and b/mean_shift_cuda/meanshift differ diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu index 79fb49a..69ebe70 100644 --- a/mean_shift_cuda/meanshift_kernels.cu +++ b/mean_shift_cuda/meanshift_kernels.cu @@ -5,15 +5,14 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){ // Each thread computes one element of output // by accumulating results into cell_value double cell_value = 0; - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < output.height && col < output.width){ + if (row + col < output.height * output.width){ for (int element_index = 0; element_index < matrix1.width; ++element_index){ cell_value += matrix1.elements[row * matrix1.width + element_index] * matrix2.elements[element_index * matrix2.width + col]; } - printf("%f\n", cell_value); output.elements[row * output.width + col] = cell_value; } } \ No newline at end of file diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu index e791067..6b2f30e 100644 --- a/mean_shift_cuda/meanshift_utils.cu +++ b/mean_shift_cuda/meanshift_utils.cu @@ -8,7 +8,7 @@ #include "meanshift_kernels.h" #define OUTPUT_PREFIX "../output/output_" -#define BLOCK_SIZE 16 +#define BLOCK_SIZE 8 void get_args(int argc, char **argv){ if (argc != 6) { @@ -139,45 +139,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation // creates new y vector double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); -//============================================================================== // builds nominator - /*multiply(kernel_matrix, original_points, new_shift);*/ - - Matrix d_kernel_matrix; - d_kernel_matrix.width = NUMBER_OF_POINTS; - d_kernel_matrix.height = NUMBER_OF_POINTS; - int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); - cudaMalloc(&d_kernel_matrix.elements, size); - cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]), size, cudaMemcpyHostToDevice); - - Matrix d_original_points; - d_original_points.width = DIMENSIONS; - d_original_points.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - cudaMalloc(&d_original_points.elements, size); - cudaMemcpy(d_original_points.elements, &(original_points[0][0]), size, cudaMemcpyHostToDevice); - - Matrix d_new_shift; - d_new_shift.width = DIMENSIONS; - d_new_shift.height = NUMBER_OF_POINTS; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - cudaMalloc(&d_new_shift.elements, size); - - dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); - dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y); - - multiply_kernel<<>>(d_kernel_matrix, d_original_points - , d_new_shift); - - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - cudaMemcpy(&(new_shift[0][0]), d_new_shift.elements, size, cudaMemcpyDeviceToHost); - - cudaFree(d_kernel_matrix.elements); - cudaFree(d_original_points.elements); - cudaFree(d_new_shift.elements); - -//============================================================================== + multiply(kernel_matrix, original_points, &new_shift); // divides element-wise for (int i=0; i>>(d_kernel_matrix, d_original_points, d_new_shift); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + + size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); + gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements + , size, cudaMemcpyDeviceToHost) ); + + gpuErrchk( cudaFree(d_kernel_matrix.elements) ); + gpuErrchk( cudaFree(d_original_points.elements) ); + gpuErrchk( cudaFree(d_new_shift.elements) ); +} + double calculateDistance(double *y, double *x){ double sum = 0, dif; for (int i=0; i +//GPU error check snippet taken from: +//https://stackoverflow.com/a/14038590 +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + extern int DEVIATION; extern int NUMBER_OF_POINTS; extern int DIMENSIONS; @@ -30,6 +41,11 @@ int meanshift(double **original_points, double ***shifted_points, int h //Function norm returns the second norm of matrix of dimensions rowsXcols. double norm(double **matrix, int rows, int cols); +//Function multiply allocates memory in GPU, sends the data and calls the +//multiply kernel function. +void multiply(double **kernel_matrix, double **original_points + , double ***new_shift); + //Function calculateDistance returns the distance between x and y vectors. double calculateDistance(double *y, double *x);