From 649e1d7850a5db26146e38b5145c3550341511e6 Mon Sep 17 00:00:00 2001 From: Apostolof Date: Sun, 28 Jan 2018 15:19:29 +0200 Subject: [PATCH] Shared memory implementation init --- mean_shift_cuda_shared_mem/Makefile | 43 ++ mean_shift_cuda_shared_mem/meanshift.cu | 45 +++ .../meanshift_gpu_utils.cu | 370 ++++++++++++++++++ .../meanshift_gpu_utils.h | 58 +++ .../meanshift_kernels.cu | 144 +++++++ .../meanshift_kernels.h | 29 ++ mean_shift_cuda_shared_mem/meanshift_utils.cu | 165 ++++++++ mean_shift_cuda_shared_mem/meanshift_utils.h | 35 ++ 8 files changed, 889 insertions(+) create mode 100644 mean_shift_cuda_shared_mem/Makefile create mode 100644 mean_shift_cuda_shared_mem/meanshift.cu create mode 100644 mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu create mode 100644 mean_shift_cuda_shared_mem/meanshift_gpu_utils.h create mode 100644 mean_shift_cuda_shared_mem/meanshift_kernels.cu create mode 100644 mean_shift_cuda_shared_mem/meanshift_kernels.h create mode 100644 mean_shift_cuda_shared_mem/meanshift_utils.cu create mode 100644 mean_shift_cuda_shared_mem/meanshift_utils.h diff --git a/mean_shift_cuda_shared_mem/Makefile b/mean_shift_cuda_shared_mem/Makefile new file mode 100644 index 0000000..f7ff628 --- /dev/null +++ b/mean_shift_cuda_shared_mem/Makefile @@ -0,0 +1,43 @@ +SHELL := /bin/bash + +# ============================================ +# COMMANDS + +CC = nvcc +HOST_COMPILER = -ccbin gcc +CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets -lcublas +C_FLAGS = -lm -O3 -I. + +COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS) +LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS) + +OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o +DEPS = meanshift_utils.h meanshift_kernels.h + +RM = rm -f + +# ========================================== +# TARGETS + +EXECUTABLES = meanshift + +.PHONY: all clean + +all: $(EXECUTABLES) + +# ========================================== +# DEPENDENCIES (HEADERS) + +%.o: %.cu $(DEPS) + $(CC) $(COMPILE_FLAGS) $< -o $@ + +.PRECIOUS: $(EXECUTABLES) $(OBJ) + +# ========================================== +# EXECUTABLE (MAIN) + +$(EXECUTABLES): $(OBJ) + $(CC) $(LINK_FLAGS) $(OBJ) -o $@ + +clean: + $(RM) *.o *~ $(EXECUTABLES) \ No newline at end of file diff --git a/mean_shift_cuda_shared_mem/meanshift.cu b/mean_shift_cuda_shared_mem/meanshift.cu new file mode 100644 index 0000000..37b971a --- /dev/null +++ b/mean_shift_cuda_shared_mem/meanshift.cu @@ -0,0 +1,45 @@ +#include +#include +#include + +#include "meanshift_utils.h" +#include "meanshift_gpu_utils.h" + +int DEVIATION = 1; +int NUMBER_OF_POINTS = 600; +int DIMENSIONS = 2; +const char *POINTS_FILENAME = "../data/X.bin"; +const char *LABELS_FILENAME = "../data/L.bin"; +parameters params; + +struct timeval startwtime, endwtime; +double seq_time; + +int main(int argc, char **argv){ + int recursions = 0; + double **vectors, **shifted_points; + char *labels; + + params.epsilon = 0.0001; + params.verbose = false; + params.display = true; + //get_args(argc, argv, ¶ms); //commented out while in development + init(&vectors, &labels); + + // tic + gettimeofday (&startwtime, NULL); + recursions = meanshift(vectors, &shifted_points, DEVIATION); + + // toc + gettimeofday (&endwtime, NULL); + seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec); + + + printf("\nTotal number of recursions = %d\n", recursions); + printf("%s wall clock time = %f\n","Mean Shift", seq_time); + + free(vectors[0]); + free(vectors); + 
free(shifted_points[0]);
+    free(shifted_points);
+}
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
new file mode 100644
index 0000000..7f18758
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -0,0 +1,370 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <cublas_v2.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+cudaDeviceProp device_properties;
+
+struct timeval start_w_time, end_w_time;
+double seq;
+
+//Based on:
+// https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
+void set_GPU(){
+    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
+
+    // gets the devices count, checking for errors like no devices or no drivers available
+    gpuErrchk( cudaGetDeviceCount(&devices_count) );
+    for(int device_index = 0; device_index < devices_count; ++device_index){
+        // gets current index device's properties
+        cudaDeviceProp this_device_properties;
+        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
+
+        // stores best available device's index
+        // only devices with compute capability >= 2.0 are able to run the code
+        if (max_multiprocessors < this_device_properties.multiProcessorCount
+            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
+            // stores device's properties for later use
+            device_properties = this_device_properties;
+            max_multiprocessors = this_device_properties.multiProcessorCount;
+            max_device = device_index;
+        }
+    }
+    // sets the device
+    gpuErrchk( cudaSetDevice(max_device) );
+    if (params.verbose){
+        printf("Device chosen is \"%s\"\n"
+            "Device has %d multi processors and compute capability %d.%d\n"
+            "Max threads per block supported are %d\n\n"
+            , device_properties.name
+            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
+            , device_properties.maxThreadsPerBlock);
+    }
+}
+
+int meanshift(double **original_points, double ***shifted_points, int deviation){
+    // host variables
+    int size = 0;
+    static int recursion = 0;
+    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
+    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
+    bool is_first_recursion = false;
+
+    // device variables
+    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
+        d_mean_shift_vector;
+    Matrix d_new_shift;
+
+    // allocates memory and copies original points on first recursion
+    if (recursion == 0 || (*shifted_points) == NULL){
+        is_first_recursion = true;
+        // allocates memory for shifted points array and copies original points into it
+        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
+
+        // allocates memory for mean shift vector
+        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // initializes elements of mean_shift_vector to inf
+        for (int i=0;i<NUMBER_OF_POINTS;i++){
+            for (int j=0;j<DIMENSIONS;j++){
+                mean_shift_vector[i][j] = DBL_MAX;
+            }
+        }
+
+        // allocates memory for the kernel matrix in the host
+        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
+
+        // allocates the corresponding arrays in the device and copies the points to the GPU
+        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
+            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
+    }
+
+    // calculates the kernel matrix for the current point positions
+    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
+        &kernel_matrix, &tmp_w_memcpy_time);
+    w_memcpy_time += tmp_w_memcpy_time;
+
+    // calculates the denominator of the fraction that produces the new positions
+    calculate_denominator(d_kernel_matrix, d_denominator);
+
+    // allocates memory for the new (shifted) point positions in host and device
+    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    d_new_shift.stride = DIMENSIONS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
+
+    // shifts the points and calculates the new mean shift vector
+    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
+        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
+        &tmp_w_memcpy_time);
+    w_memcpy_time += tmp_w_memcpy_time;
+
+    // frees the previously shifted points and keeps the new ones
+    free((*shifted_points)[0]);
+    free(*shifted_points);
+    (*shifted_points) = new_shift;
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
+    d_shifted_points.elements = d_new_shift.elements;
+
+    if (params.display){
+        save_matrix((*shifted_points), recursion);
+    }
+
+    // calculates the norm of the mean shift vector using cuBLAS
+    cublasHandle_t handle;
+    cublasCreate(&handle);
+    cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements, 1,
+        &current_norm);
+    cublasDestroy(handle);
+
+    if (params.verbose){
+        printf("Recursion n. %d, error %f\n", recursion, current_norm);
+    }
+
+    // iterates until the mean shift vector norm falls below epsilon
+    if (current_norm > params.epsilon) {
+        ++recursion;
+        meanshift(original_points, shifted_points, deviation);
+    }
+
+    if (is_first_recursion){
+        if (params.verbose){
+            printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
+        }
+
+        // cleans up allocations
+        free(mean_shift_vector[0]);
+        free(mean_shift_vector);
+        free(kernel_matrix[0]);
+        free(kernel_matrix);
+
+        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
+    }
+
+    return recursion;
+}
+
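The convergence test above stops the recursion once the Euclidean norm of the mean shift vector drops to params.epsilon or below. As a point of reference only (not part of this patch), the same norm can be computed on the host from the copy that shift_points writes back, assuming the NUMBER_OF_POINTS by DIMENSIONS layout produced by alloc_double; the helper name below is illustrative:

/* illustrative host-side equivalent of the device-side norm; requires <math.h> */
double host_mean_shift_norm(double **mean_shift_vector, int number_of_points, int dimensions){
    double sum = 0;
    for (int i = 0; i < number_of_points; ++i){
        for (int j = 0; j < dimensions; ++j){
            sum += mean_shift_vector[i][j] * mean_shift_vector[i][j];
        }
    }
    return sqrt(sum);
}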
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    Matrix *d_denominator, Matrix *d_mean_shift_vector){
+    int size;
+
+    // allocates memory for original_points in GPU and copies the array
+    // (stride is set to the row width so that GetSubMatrix addresses global memory correctly)
+    d_original_points->width = DIMENSIONS;
+    d_original_points->height = NUMBER_OF_POINTS;
+    d_original_points->stride = DIMENSIONS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for shifted_points in GPU and copies the array
+    d_shifted_points->width = DIMENSIONS;
+    d_shifted_points->height = NUMBER_OF_POINTS;
+    d_shifted_points->stride = DIMENSIONS;
+    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
+    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+        , size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for kernel_matrix in GPU
+    d_kernel_matrix->width = NUMBER_OF_POINTS;
+    d_kernel_matrix->height = NUMBER_OF_POINTS;
+    d_kernel_matrix->stride = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
+
+    // allocates memory for denominator in GPU
+    d_denominator->width = 1;
+    d_denominator->height = NUMBER_OF_POINTS;
+    d_denominator->stride = 1;
+    size = NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
+
+    // allocates memory for mean_shift_vector in GPU
+    d_mean_shift_vector->width = DIMENSIONS;
+    d_mean_shift_vector->height = NUMBER_OF_POINTS;
+    d_mean_shift_vector->stride = DIMENSIONS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
+}
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)sqrt(max_block_size);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = requested_block_size;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
+
+        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
+            , deviation, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("calculate_kernel_matrix_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+
+    // tic
+    gettimeofday (&start_w_time, NULL);
+
+    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
+        , size, cudaMemcpyDeviceToHost) );
+
+    // toc
+    gettimeofday (&end_w_time, NULL);
+    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
+        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
+}
+
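The cudaMemcpy calls above and in init_device_memory treat each host matrix as a single flat, row-major block, so passing &(array[0][0]) is only valid because alloc_double backs all rows with one contiguous allocation. A minimal sketch of that layout assumption (the helper name is illustrative, not from the patch):

/* illustrative check of the contiguous row-major layout assumed by the host/device copies */
#include <assert.h>

void check_contiguous_layout(double **array, int rows, int cols){
    double *flat = &(array[0][0]);
    for (int i = 0; i < rows; ++i){
        for (int j = 0; j < cols; ++j){
            assert(&(array[i][j]) == &(flat[i * cols + j]));
        }
    }
}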
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int requested_block_size = device_properties.maxThreadsPerBlock;
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        dimBlock.x = requested_block_size;
+        dimBlock.y = 1;
+        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("calculate_denominator called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+}
+
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector,
+    double *w_memcpy_time){
+    int size;
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)(max_block_size / d_new_shift.width);
+    bool block_size_too_big = true;
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        /*dimBlock.x = requested_block_size;
+        dimBlock.y = d_new_shift.width;*/
+        dimBlock.x = 2;
+        dimBlock.y = 2;
+        dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+            d_new_shift, d_denominator, d_mean_shift_vector);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("shift_points_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+
+    // tic
+    gettimeofday (&start_w_time, NULL);
+
+    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+        , size, cudaMemcpyDeviceToHost) );
+    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
+        , size, cudaMemcpyDeviceToHost) );
+
+    // toc
+    gettimeofday (&end_w_time, NULL);
+    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
+        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
+}
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_shifted_points){
+    // frees all memory previously allocated in device
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_denominator.elements) );
+    gpuErrchk( cudaFree(d_shifted_points.elements) );
+}
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
new file mode 100644
index 0000000..83f9784
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
@@ -0,0 +1,58 @@
+#ifndef SERIAL_GPU_UTILS_H /* Include guard */
+#define SERIAL_GPU_UTILS_H
+
+#include "meanshift_kernels.h"
+
+//GPU error check snippet taken from:
+// https://stackoverflow.com/a/14038590
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+    if (code != cudaSuccess){
+        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+        if (abort) exit(code);
+    }
+}
+
+/* Global variables */
+extern int DEVIATION;
+extern int NUMBER_OF_POINTS;
+extern int DIMENSIONS;
+extern const char* POINTS_FILENAME;
+extern const char* LABELS_FILENAME;
+extern Parameters params;
+extern cudaDeviceProp device_properties;
+
+//Function set_GPU parses available GPU devices, selects the one with the most multi-processors for
+//usage and stores its properties in the global struct device_properties
+void set_GPU();
+
+//Function meanshift recursively shifts the original points according to the mean-shift algorithm,
+//saving the result to shifted_points; h is the desirable deviation
+int meanshift(double **original_points, double ***shifted_points, int h);
+
+//Function init_device_memory allocates memory for the necessary arrays in the device
+void init_device_memory(double **original_points, double **shifted_points,
+    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+    Matrix *d_denominator, Matrix *d_mean_shift_vector);
+
+//Function calculate_kernel_matrix is a wrapper for the call of the corresponding kernel
+//"calculate_kernel_matrix_kernel" that calculates the kernel matrix
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time);
+
+//Function calculate_denominator is a wrapper for the call of the corresponding kernel
+//"denominator_kernel" that calculates the denominator of the shifted points fraction
+void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator);
+
+//Function shift_points is a wrapper for the call of the corresponding kernel
+//"shift_points_kernel" that shifts the positions of all points
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift, double ***mean_shift_vector,
+    double *w_memcpy_time);
+
+//Function free_device_memory frees the device's previously allocated memory
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+    Matrix d_shifted_points);
+
+#endif //SERIAL_GPU_UTILS_H
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
new file mode 100644
index 0000000..f8078fb
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -0,0 +1,144 @@
+#include "meanshift_kernels.h"
+#include <stdlib.h>
+#include <math.h>
+
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix){
+    // each thread calculates one element of kernel_matrix
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){
+        return;
+    }
+
+    int dimensions = shifted_points.width;
+    // calculate distance
+    double sum = 0, dif;
+    for (int i=0; i<dimensions; i++){
+        dif = shifted_points.elements[row * dimensions + i]
+            - original_points.elements[col * dimensions + i];
+        sum += dif * dif;
+    }
+    double distance = sqrt(sum);
+
+    // computes the Gaussian kernel value for points within range, zero for the rest
+    double deviation_square = deviation * deviation;
+    if (distance < deviation_square){
+        kernel_matrix.elements[row * kernel_matrix.width + col] =
+            exp(-(distance * distance) / (2 * deviation_square));
+    } else {
+        kernel_matrix.elements[row * kernel_matrix.width + col] = 0;
+    }
+    // a point is considered a neighbor of itself
+    if (row == col){
+        kernel_matrix.elements[row * kernel_matrix.width + col] += 1;
+    }
+}
+
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
+    // each thread computes one element of denominator by accumulating a row of kernel_matrix
+    double cell_value = 0;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // performs calculations only if thread's indexes are within matrix bounds
+    if (row >= denominator.height){
+        return;
+    }
+
+    for (int column = 0; column < kernel_matrix.width; ++column){
+        cell_value += 
kernel_matrix.elements[row * kernel_matrix.width + column]; + } + denominator.elements[row] = cell_value; +} + +__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, + Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){ + int BLOCK_SIZE = blockDim.y; + int block_row = blockIdx.x; + int block_col = blockIdx.y; + + // each thread computes one element of new_shift by accumulating results into cell_value + double cell_value = 0; + + // Thread row and column within sub_new_shift + int row = threadIdx.x; + int col = threadIdx.y; + + // performs calculations only if thread's indexes are within matrix bounds + //if (row * new_shift.width + col >= new_shift.width * new_shift.height){ + /*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >= + new_shift.width * new_shift.height){*/ + if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){ + return; + } + + // Each thread block computes one sub-matrix sub_new_shift of C + Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE); + + // shared memory used to store sub_kernel_matrix and sub_original_points respectively + __shared__ double *s_sub_kernel_matrix; + s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); + __shared__ double *s_sub_original_points; + s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); + + // loops over all the sub-matrices of kernel_matrix and original_points that are required to + //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results + for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) { + + // gets sub-matrix sub_kernel_matrix of kernel_matrix + Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE); + // gets sub-matrix sub_original_points of original_points + Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE); + + // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared + //memory, each thread loads one element of each sub-matrix + s_sub_kernel_matrix[row * BLOCK_SIZE + col] = + sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col]; + s_sub_original_points[row * BLOCK_SIZE + col] = + sub_original_points.elements[row * sub_original_points.stride + col]; + + // synchronizes to make sure the sub-matrices are loaded before starting the computation + __syncthreads(); + + // multiplies sub_kernel_matrix and sub_original_points + for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ + cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] * + s_sub_original_points[element_index * sub_original_points.stride + col]; + } + + // synchronizes to make sure that the preceding computation is done before loading two new + // sub-matrices of kernel_matrix and original_points in the next iteration + __syncthreads(); + } + + // new_shift elements are calculated by dividing with the denominator + int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride; + int cell_col = block_col * BLOCK_SIZE + col; + //sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row]; + sub_new_shift.elements[row * sub_new_shift.stride + col] = + cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; + + // calculates mean-shift vector + 
/*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride +
+        (block_col * BLOCK_SIZE + col)] =
+        sub_new_shift.elements[row * sub_new_shift.stride + col] -
+        shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride +
+        (block_col * BLOCK_SIZE + col)];*/
+
+    /*free(s_sub_kernel_matrix);
+    free(s_sub_original_points);*/
+}
+
+// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
+// located col sub-matrices to the right and row sub-matrices down
+// from the upper-left corner of A
+__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
+    Matrix Asub;
+    Asub.width = BLOCK_SIZE;
+    Asub.height = BLOCK_SIZE;
+    Asub.stride = BLOCK_SIZE;
+    Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
+    return Asub;
+}
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.h b/mean_shift_cuda_shared_mem/meanshift_kernels.h
new file mode 100644
index 0000000..58836e6
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h
@@ -0,0 +1,29 @@
+#ifndef SERIAL_KERNELS_H /* Include guard */
+#define SERIAL_KERNELS_H
+
+/* Structures */
+
+//Matrix is used to describe matrices
+typedef struct {
+    int width;
+    int height;
+    int stride;
+    double *elements;
+} Matrix;
+
+//Kernel calculate_kernel_matrix_kernel calculates the current kernel matrix
+__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
+    double deviation, Matrix kernel_matrix);
+
+//Kernel denominator_kernel calculates the sum in the denominator of the fraction used to find new
+//(shifted) positions of the points
+__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
+
+//Kernel shift_points_kernel shifts the positions of all points and calculates the new mean shift
+//vector according to the new point array
+__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
+    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
+
+__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE);
+
+#endif //SERIAL_KERNELS_H
\ No newline at end of file
diff --git a/mean_shift_cuda_shared_mem/meanshift_utils.cu b/mean_shift_cuda_shared_mem/meanshift_utils.cu
new file mode 100644
index 0000000..6cef7dd
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.cu
@@ -0,0 +1,165 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "meanshift_utils.h"
+#include "meanshift_gpu_utils.h"
+
+#define OUTPUT_PREFIX "../output/output_"
+
+void get_args(int argc, char **argv, parameters *params){
+    if (argc < 7) {
+        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
+            "\th is the deviation\n"
+            "\te is the min distance, between two points, that is taken into account in computations\n"
+            "\tN is the number of points\n"
+            "\tD is the number of dimensions of each point\n"
+            "\tPd is the path of the dataset file\n"
+            "\tPl is the path of the labels file\n"
+            "\n\t--verbose | -v is an optional flag to enable execution information output"
+            "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]);
+        exit(1);
+    }
+
+    DEVIATION = atoi(argv[1]);
+    params->epsilon = atof(argv[2]);
+    NUMBER_OF_POINTS = atoi(argv[3]);
+    DIMENSIONS = atoi(argv[4]);
+    POINTS_FILENAME = argv[5];
+    LABELS_FILENAME = argv[6];
+    params->verbose = false;
+    params->display = false;
+
+    if (argc > 7){
+        for (int index=7; index<argc; ++index){
+            if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
+                params->verbose = true;
+            } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
+                params->display = true;
+            } else {
+                printf("Couldn't parse argument %d: %s\n", index, argv[index]);
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+
+    /*printf("DEVIATION = %d\n"
+        "epsilon = %f\n"
+        "NUMBER_OF_POINTS = %d\n"
+        "DIMENSIONS = %d\n"
+        "POINTS_FILENAME = %s\n"
+        "LABELS_FILENAME = %s\n"
+        "verbose = %d\n"
+        "display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME
+        , LABELS_FILENAME, params->verbose, params->display);*/
+}
+
+void init(double ***vectors, char **labels){
+    int bytes_read = 0;
+
+    set_GPU();
+
+    if (params.verbose){
+        printf("Reading dataset and labels...\n");
+    }
+
+    // initializes vectors
+    FILE *points_file;
+    points_file = fopen(POINTS_FILENAME, "rb");
+    if (points_file != NULL){
+        // allocates memory for the array
+        (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
+        // reads vectors dataset from file
+        for (int i=0; i<NUMBER_OF_POINTS; i++){
+            bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
+            if ( bytes_read != DIMENSIONS ){
+                if(feof(points_file)){
+                    printf("Premature end of file reached.\n");
+                } else{
+                    printf("Error reading points file.");
+                }
+                fclose(points_file);
+                exit(EXIT_FAILURE);
+            }
+        }
+    } else {
+        printf("Error opening dataset file.\n");
+        exit(EXIT_FAILURE);
+    }
+    fclose(points_file);
+
+    // initializes labels
+    // NOTE : labels are stored as <class 'numpy.uint8'>
+    // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
+    FILE *labels_file;
+    labels_file = fopen(LABELS_FILENAME, "rb");
+    if (labels_file != NULL){
+        // gets number of labels
+        fseek(labels_file, 0L, SEEK_END);
+        long int pos = ftell(labels_file);
+        rewind(labels_file);
+        int label_elements = pos / sizeof(char);
+
+        // allocates memory for the array
+        *labels = (char*)malloc(label_elements * sizeof(char));
+        fseek(labels_file, 0L, SEEK_SET);
+        bytes_read = fread((*labels), sizeof(char), label_elements, labels_file);
+        if ( bytes_read != label_elements ){
+            if(feof(labels_file)){
+                printf("Premature end of file reached.\n");
+            } else{
+                printf("Error reading labels file.");
+            }
+            fclose(labels_file);
+            exit(EXIT_FAILURE);
+        }
+    }
+    fclose(labels_file);
+
+    if (params.verbose){
+        printf("Done.\n\n");
+    }
+}
+
+double **alloc_double(int rows, int cols) {
+    double *data = (double *) malloc(rows*cols*sizeof(double));
+    double **array = (double **) malloc(rows*sizeof(double*));
+    for (int i=0; i<rows; i++){
+        array[i] = &(data[cols * i]);
+    }
+    return array;
+}
+
+void duplicate(double **source, int rows, int cols, double ***dest){
+    for (int i=0; i<rows; i++){
+        for (int j=0; j<cols; j++){
+            (*dest)[i][j] = source[i][j];
+        }
+    }
+}
+
+void print_matrix(double **array, int rows, int cols){
+    for (int i=0; i<rows; i++){
+        for (int j=0; j<cols; j++){
+            printf("%f ", array[i][j]);
+        }
+        printf("\n");
+    }
+}
+
+void save_matrix(double **matrix, int iteration){
+    char filename[50];
+    snprintf(filename, sizeof(filename), "%s%d", OUTPUT_PREFIX, iteration);
+    FILE *file = fopen(filename, "a");
+    for (int row = 0; row < NUMBER_OF_POINTS; ++row){
+        for (int col = 0; col < DIMENSIONS; ++col){
+            fprintf(file, "%f", matrix[row][col]);
+            if (col != DIMENSIONS - 1){
+                fprintf(file, ",");
+            }
+        }
+        fprintf(file, "\n");
+    }
+    fclose(file);
+}
diff --git a/mean_shift_cuda_shared_mem/meanshift_utils.h b/mean_shift_cuda_shared_mem/meanshift_utils.h
new file mode 100644
--- /dev/null
+++ b/mean_shift_cuda_shared_mem/meanshift_utils.h
@@ -0,0 +1,35 @@
+#ifndef SERIAL_UTILS_H /* Include guard */
+#define SERIAL_UTILS_H
+
+#include <stdbool.h>
+
+/* Structures */
+
+//Parameters is used to store session specific variables in an orderly way
+typedef struct parameters {
+    double epsilon;
+    bool verbose;
+    bool display;
+} Parameters;
+
+//Function get_args parses command line arguments
+void get_args(int argc, char **argv, Parameters *params);
+
+//Function init sets up the GPU for later use, gets its properties and reads the dataset and label
+//arrays from the corresponding files
+void init(double ***vectors, char **labels);
+
+//Function alloc_double allocates a contiguous block of rows*cols doubles
+double **alloc_double(int rows, int cols);
+
+//Function duplicate copies the values of the source array to the dest array
+void duplicate(double **source, int rows, int cols, double ***dest);
+
+//Function print_matrix prints a rows-by-cols array to the console
+void print_matrix(double **array, int rows, int cols);
+
+//Function save_matrix stores matrix in a csv file with path/filename "../output/output_iteration".
+//If the file already exists, new lines are appended
+void save_matrix(double **matrix, int iteration);
+
+#endif //SERIAL_UTILS_H
\ No newline at end of file
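For reference, the three kernels in this patch together implement one mean-shift iteration: a truncated Gaussian kernel matrix built from pairwise distances, a per-row denominator, and the kernel-weighted average of the original points divided by that denominator. A plain host-side sketch of the same iteration, which can serve as a sanity check against the GPU output, might look as follows (the function and parameter names are illustrative and not part of the patch):

/* illustrative host-side reference of one mean-shift iteration; requires <math.h> */
void reference_iteration(double **original, double **shifted, double **kernel, double **new_shift,
        int n, int d, double deviation){
    double deviation_square = deviation * deviation;
    for (int i = 0; i < n; ++i){
        // kernel row for point i, with the same truncation and diagonal bump as the GPU kernel
        for (int j = 0; j < n; ++j){
            double sum = 0;
            for (int k = 0; k < d; ++k){
                double dif = shifted[i][k] - original[j][k];
                sum += dif * dif;
            }
            double distance = sqrt(sum);
            kernel[i][j] = (distance < deviation_square)
                ? exp(-(distance * distance) / (2 * deviation_square)) : 0;
            if (i == j){
                kernel[i][j] += 1;
            }
        }
        // denominator and weighted average give the new position of point i
        double denominator = 0;
        for (int j = 0; j < n; ++j){
            denominator += kernel[i][j];
        }
        for (int k = 0; k < d; ++k){
            double numerator = 0;
            for (int j = 0; j < n; ++j){
                numerator += kernel[i][j] * original[j][k];
            }
            new_shift[i][k] = numerator / denominator;
        }
    }
}

Comparing the result of such a host pass with the rows written by save_matrix for the corresponding recursion is a simple way to validate the shared-memory kernel while it is still under development.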