Browse Source

Fix multiply, Add error checking for CUDA calls

master
Apostolos Fanakis 7 years ago
parent
commit
dd81bf36bb
  1. 7
      mean_shift_cuda/Makefile
  2. BIN
      mean_shift_cuda/meanshift
  3. 7
      mean_shift_cuda/meanshift_kernels.cu
  4. 85
      mean_shift_cuda/meanshift_utils.cu
  5. 16
      mean_shift_cuda/meanshift_utils.h

7
mean_shift_cuda/Makefile

@ -5,7 +5,8 @@ SHELL := /bin/bash
CC = /usr/local/cuda/bin/nvcc
RM = rm -f
CFLAGS= -arch=sm_21 -lm -O0 -I.
HOST_COMPILER = -ccbin gcc
CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets
OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
DEPS = meanshift_utils.h meanshift_kernels.h
@ -22,7 +23,7 @@ all: $(EXECUTABLES)
# DEPENDENCIES (HEADERS)
%.o: %.cu $(DEPS)
$(CC) -x cu $(CFLAGS) -dc $< -o $@
$(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@
.PRECIOUS: $(EXECUTABLES) $(OBJ)
@ -30,7 +31,7 @@ all: $(EXECUTABLES)
# EXECUTABLE (MAIN)
$(EXECUTABLES): $(OBJ)
$(CC) $(CFLAGS) $(OBJ) -o $@
$(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@
clean:
$(RM) *.o *~ $(EXECUTABLES)

BIN
mean_shift_cuda/meanshift

Binary file not shown.

7
mean_shift_cuda/meanshift_kernels.cu

@ -5,15 +5,14 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
// Each thread computes one element of output
// by accumulating results into cell_value
double cell_value = 0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row < output.height && col < output.width){
if (row + col < output.height * output.width){
for (int element_index = 0; element_index < matrix1.width; ++element_index){
cell_value += matrix1.elements[row * matrix1.width + element_index]
* matrix2.elements[element_index * matrix2.width + col];
}
printf("%f\n", cell_value);
output.elements[row * output.width + col] = cell_value;
}
}

85
mean_shift_cuda/meanshift_utils.cu

@ -8,7 +8,7 @@
#include "meanshift_kernels.h"
#define OUTPUT_PREFIX "../output/output_"
#define BLOCK_SIZE 16
#define BLOCK_SIZE 8
void get_args(int argc, char **argv){
if (argc != 6) {
@ -139,45 +139,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
// creates new y vector
double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
//==============================================================================
// builds nominator
/*multiply(kernel_matrix, original_points, new_shift);*/
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
cudaMalloc(&d_kernel_matrix.elements, size);
cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]), size, cudaMemcpyHostToDevice);
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
cudaMalloc(&d_original_points.elements, size);
cudaMemcpy(d_original_points.elements, &(original_points[0][0]), size, cudaMemcpyHostToDevice);
Matrix d_new_shift;
d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
cudaMalloc(&d_new_shift.elements, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points
, d_new_shift);
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
cudaMemcpy(&(new_shift[0][0]), d_new_shift.elements, size, cudaMemcpyDeviceToHost);
cudaFree(d_kernel_matrix.elements);
cudaFree(d_original_points.elements);
cudaFree(d_new_shift.elements);
//==============================================================================
multiply(kernel_matrix, original_points, &new_shift);
// divides element-wise
for (int i=0; i<NUMBER_OF_POINTS; i++){
@ -230,6 +194,51 @@ double norm(double **matrix, int rows, int cols){
return norm;
}
void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
// allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for original_points in GPU and copies the array
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for new_shift in GPU
Matrix d_new_shift;
d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
//dim3 dimBlock(16, 16);
//dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
dim3 dimBlock(10, 2);
dim3 dimGrid(60, 1);
multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_new_shift.elements) );
}
double calculateDistance(double *y, double *x){
double sum = 0, dif;
for (int i=0; i<DIMENSIONS; i++){

16
mean_shift_cuda/meanshift_utils.h

@ -3,6 +3,17 @@
#include <stdbool.h>
//GPU error check snippet taken from:
//https://stackoverflow.com/a/14038590
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
extern int DEVIATION;
extern int NUMBER_OF_POINTS;
extern int DIMENSIONS;
@ -30,6 +41,11 @@ int meanshift(double **original_points, double ***shifted_points, int h
//Function norm returns the second norm of matrix of dimensions rowsXcols.
double norm(double **matrix, int rows, int cols);
//Function multiply allocates memory in GPU, sends the data and calls the
//multiply kernel function.
void multiply(double **kernel_matrix, double **original_points
, double ***new_shift);
//Function calculateDistance returns the distance between x and y vectors.
double calculateDistance(double *y, double *x);

Loading…
Cancel
Save