diff --git a/mean_shift_cuda/Makefile b/mean_shift_cuda/Makefile index b244f3d..b2fb2c8 100644 --- a/mean_shift_cuda/Makefile +++ b/mean_shift_cuda/Makefile @@ -3,13 +3,19 @@ SHELL := /bin/bash # ============================================ # COMMANDS -CC = /usr/local/cuda/bin/nvcc -RM = rm -f +CC = nvcc HOST_COMPILER = -ccbin gcc -CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets +CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets +C_FLAGS = -lm -O3 -I. + +COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS) +LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS) + OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o DEPS = meanshift_utils.h meanshift_kernels.h +RM = rm -f + # ========================================== # TARGETS @@ -23,7 +29,7 @@ all: $(EXECUTABLES) # DEPENDENCIES (HEADERS) %.o: %.cu $(DEPS) - $(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@ + $(CC) $(COMPILE_FLAGS) $< -o $@ .PRECIOUS: $(EXECUTABLES) $(OBJ) @@ -31,7 +37,7 @@ all: $(EXECUTABLES) # EXECUTABLE (MAIN) $(EXECUTABLES): $(OBJ) - $(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@ + $(CC) $(LINK_FLAGS) $(OBJ) -o $@ clean: $(RM) *.o *~ $(EXECUTABLES) \ No newline at end of file diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift index 44369bc..02e8444 100644 Binary files a/mean_shift_cuda/meanshift and b/mean_shift_cuda/meanshift differ diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu index f312238..778d081 100644 --- a/mean_shift_cuda/meanshift.cu +++ b/mean_shift_cuda/meanshift.cu @@ -9,30 +9,38 @@ int NUMBER_OF_POINTS = 600; int DIMENSIONS = 2; char* POINTS_FILENAME = "../data/X.bin"; char* LABELS_FILENAME = "../data/L.bin"; +parameters params; struct timeval startwtime, endwtime; double seq_time; int main(int argc, char **argv){ + int iterations; double **vectors, **shifted_points; char *labels; - parameters params; - //get_args(argc, argv); commented out while in development - init(&vectors, &labels, ¶ms); + params.epsilon = 0.0001; + params.verbose = true; + params.display = true; + + //get_args(argc, argv, ¶ms); //commented out while in development + init(&vectors, &labels); //save_matrix(vectors, 0); // tic gettimeofday (&startwtime, NULL); - int iterations = meanshift(vectors, &shifted_points, DEVIATION, ¶ms); - printf("Total iterations = %d\n", iterations); + iterations = meanshift(vectors, &shifted_points, DEVIATION, ¶ms); // toc gettimeofday (&endwtime, NULL); seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec); - printf("%s wall clock time = %f\n","Mean Shift", seq_time); + + if (params.verbose){ + printf("\nTotal number of iterations = %d\n", iterations); + printf("%s wall clock time = %f\n","Mean Shift", seq_time); + } //TODO write output points to file -> plot later //save_matrix(shifted_points, iterations); diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu index 69ebe70..ff284b8 100644 --- a/mean_shift_cuda/meanshift_kernels.cu +++ b/mean_shift_cuda/meanshift_kernels.cu @@ -5,8 +5,8 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){ // Each thread computes one element of output // by accumulating results into cell_value double cell_value = 0; - int col = blockIdx.y * blockDim.y + threadIdx.y; int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; if (row + col < output.height * output.width){ for (int element_index = 0; element_index < matrix1.width; ++element_index){ diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu index 6b2f30e..754c5e5 100644 --- a/mean_shift_cuda/meanshift_utils.cu +++ b/mean_shift_cuda/meanshift_utils.cu @@ -8,28 +8,66 @@ #include "meanshift_kernels.h" #define OUTPUT_PREFIX "../output/output_" -#define BLOCK_SIZE 8 - -void get_args(int argc, char **argv){ - if (argc != 6) { - printf("Usage: %s h N D Pd Pl\nwhere:\n", argv[0]); - printf("\th is the variance\n"); - printf("\tN is the the number of points\n"); - printf("\tD is the number of dimensions of each point\n"); - printf("\tPd is the path of the dataset file\n"); - printf("\tPl is the path of the labels file\n"); +int BLOCK_SIZE = 16; + +cudaDeviceProp device_properties; + +void get_args(int argc, char **argv, parameters *params){ + if (argc < 7) { + printf("Usage: %s h e N D Pd Pl\nwhere:\n" + "\th is the variance\n" + "\te is the min distance, between two points, that is taken into account in computations\n" + "\tN is the the number of points\n" + "\tD is the number of dimensions of each point\n" + "\tPd is the path of the dataset file\n" + "\tPl is the path of the labels file\n" + "\n\t--verbose | -v is an optional flag to enable execution information output" + "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]); exit(1); } DEVIATION = atoi(argv[1]); - NUMBER_OF_POINTS = atoi(argv[2]); - DIMENSIONS = atoi(argv[3]); - POINTS_FILENAME = argv[4]; - LABELS_FILENAME = argv[5]; + params->epsilon = atof(argv[2]); + NUMBER_OF_POINTS = atoi(argv[3]); + DIMENSIONS = atoi(argv[4]); + POINTS_FILENAME = argv[5]; + LABELS_FILENAME = argv[6]; + params->verbose = false; + params->display = false; + + if (argc > 7){ + for (int index=7; indexverbose = true; + } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){ + params->display = true; + } else { + printf("Couldn't parse argument %d: %s\n", index, argv[index]); + exit(EXIT_FAILURE); + } + } + } + + /*printf("DEVIATION = %d\n" + "epsilon = %f\n" + "NUMBER_OF_POINTS = %d\n" + "DIMENSIONS = %d\n" + "POINTS_FILENAME = %s\n" + "LABELS_FILENAME = %s\n" + "verbose = %d\n" + "display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME + , LABELS_FILENAME, params->verbose, params->display);*/ } -void init(double ***vectors, char **labels, parameters *params){ +void init(double ***vectors, char **labels){ int bytes_read = 0; + + set_Gpu(); + + if (params.verbose){ + printf("Reading dataset and labels...\n"); + } + // initializes vectors FILE *points_file; points_file = fopen(POINTS_FILENAME, "rb"); @@ -83,10 +121,45 @@ void init(double ***vectors, char **labels, parameters *params){ } fclose(labels_file); - // MEAN SHIFT OPTIONS - params->epsilon = 0.0001; - params->verbose = false; - params->display = false; + if (params.verbose){ + printf("Done.\n\n"); + } +} + +//Based on https://stackoverflow.com/a/28113186 +//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU +void set_Gpu(){ + int devices_count = 0, max_multiprocessors = 0, max_device = 0; + + // gets devices count checking for errors like no devices or no drivers to check for + // devices available + gpuErrchk( cudaGetDeviceCount(&devices_count) ); + for(int device_index = 0; device_index < devices_count; ++device_index){ + // gets current index device's properties + cudaDeviceProp this_device_properties; + gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) ); + + // stores best available device's index + // only devices with compute capability >= 2.0 are able to run the code + if (max_multiprocessors < this_device_properties.multiProcessorCount + && this_device_properties.major >= 2 && this_device_properties.minor >= 0){ + // stores devices properties for later use + device_properties = this_device_properties; + max_multiprocessors = this_device_properties.multiProcessorCount; + max_device = device_index; + } + } + // sets the device + gpuErrchk( cudaSetDevice(max_device) ); + BLOCK_SIZE = device_properties.maxThreadsPerBlock; + if (params.verbose){ + printf("Device chosen is \"%s\"\n" + "Device has %d multi processors and compute capability %d.%d\n" + "Setting BLOCK_SIZE to max threads per block supported (%d)\n\n" + , device_properties.name + , device_properties.multiProcessorCount, device_properties.major, device_properties.minor + , BLOCK_SIZE); + } } int meanshift(double **original_points, double ***shifted_points, int deviation @@ -157,11 +230,15 @@ int meanshift(double **original_points, double ***shifted_points, int deviation // updates shifted points pointer to the new array address shifted_points = &new_shift; - save_matrix((*shifted_points), iteration); + if (params.display){ + save_matrix((*shifted_points), iteration); + } // calculates norm of the new mean shift vector double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS); - printf("Iteration n. %d, error %f \n", iteration, current_norm); + if (params.verbose){ + printf("Iteration n. %d, error %f \n", iteration, current_norm); + } /** iterates until convergence **/ if (current_norm > opt->epsilon) { @@ -170,7 +247,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation } if (iteration == 0){ - // cleans up this iteration's allocations + // cleans up allocations free(mean_shift_vector[0]); free(mean_shift_vector); free(kernel_matrix[0]); @@ -195,6 +272,8 @@ double norm(double **matrix, int rows, int cols){ } void multiply(double **kernel_matrix, double **original_points, double ***new_shift){ + static bool firstIter = true; + // allocates memory for kernel_matrix in GPU and copies the array Matrix d_kernel_matrix; d_kernel_matrix.width = NUMBER_OF_POINTS; @@ -220,11 +299,16 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); gpuErrchk( cudaMalloc(&d_new_shift.elements, size) ); - //dim3 dimBlock(16, 16); - //dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y); + dim3 dimBlock((d_new_shift.height < sqrt(BLOCK_SIZE)) ? d_new_shift.height : sqrt(BLOCK_SIZE) + , (d_new_shift.width < sqrt(BLOCK_SIZE)) ? d_new_shift.width : sqrt(BLOCK_SIZE)); + dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x + , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y); - dim3 dimBlock(10, 2); - dim3 dimGrid(60, 1); + if (firstIter && params.verbose){ + printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); + printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); + firstIter = false; + } multiply_kernel<<>>(d_kernel_matrix, d_original_points, d_new_shift); gpuErrchk( cudaPeekAtLastError() ); diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h index a09de91..ae15d02 100644 --- a/mean_shift_cuda/meanshift_utils.h +++ b/mean_shift_cuda/meanshift_utils.h @@ -14,23 +14,29 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t } } -extern int DEVIATION; -extern int NUMBER_OF_POINTS; -extern int DIMENSIONS; -extern char* POINTS_FILENAME; -extern char* LABELS_FILENAME; - +/* Structs */ typedef struct parameters { double epsilon; bool verbose; bool display; } parameters; +/* Global variables */ +extern int DEVIATION; +extern int NUMBER_OF_POINTS; +extern int DIMENSIONS; +extern char* POINTS_FILENAME; +extern char* LABELS_FILENAME; +extern parameters params; +extern cudaDeviceProp device_properties; + //Function get_args parses command line arguments. -void get_args(int argc, char **argv); +void get_args(int argc, char **argv, parameters *params); //Function init reads the dataset and label arrays from the corresponding files. -void init(double ***vectors, char **labels, parameters *params); +void init(double ***vectors, char **labels); + +void set_Gpu(); //Function meanshift recursively shifts original points according to th //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user