Apostolos Fanakis
7 years ago
8 changed files with 889 additions and 0 deletions
@ -0,0 +1,43 @@ |
|||||
|
SHELL := /bin/bash

# ============================================
# COMMANDS

# nvcc is used both to compile and to link, with gcc as the host compiler
CC = nvcc
HOST_COMPILER = -ccbin gcc
CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets -lcublas
C_FLAGS = -lm -O3 -I.

# -x cu treats every input file as CUDA source, -dc produces relocatable device code
COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)

OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
# every project header is listed so that editing any of them rebuilds the objects
DEPS = meanshift_utils.h meanshift_gpu_utils.h meanshift_kernels.h

RM = rm -f

# ==========================================
# TARGETS

EXECUTABLES = meanshift

.PHONY: all clean

all: $(EXECUTABLES)

# ==========================================
# DEPENDENCIES (HEADERS)

%.o: %.cu $(DEPS)
	$(CC) $(COMPILE_FLAGS) $< -o $@

.PRECIOUS: $(EXECUTABLES) $(OBJ)

# ==========================================
# EXECUTABLE (MAIN)

$(EXECUTABLES): $(OBJ)
	$(CC) $(LINK_FLAGS) $(OBJ) -o $@

clean:
	$(RM) *.o *~ $(EXECUTABLES)
@ -0,0 +1,45 @@ |
|||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <sys/time.h> |
||||
|
|
||||
|
#include "meanshift_utils.h" |
||||
|
#include "meanshift_gpu_utils.h" |
||||
|
|
||||
|
/* Run configuration; the values below are the defaults used while get_args is disabled */

// deviation handed to meanshift()
int DEVIATION = 1;
// number of points in the dataset
int NUMBER_OF_POINTS = 600;
// number of coordinates of each point
int DIMENSIONS = 2;
// path of the binary file holding the points
const char *POINTS_FILENAME = "../data/X.bin";
// path of the binary file holding the labels
const char *LABELS_FILENAME = "../data/L.bin";
// run parameters (epsilon, verbose, display)
parameters params;

/* Wall clock measurement of the whole mean shift run */

// timestamp taken right before meanshift() is called
struct timeval startwtime;
// timestamp taken right after meanshift() returns
struct timeval endwtime;
// elapsed time between the two timestamps, in seconds
double seq_time;
||||
|
|
||||
|
/*
 * Program entry point: loads the dataset through init(), runs meanshift() while
 * measuring its wall clock time, reports the results and releases the host arrays.
 * The command line is currently ignored because the get_args call is disabled.
 */
int main(int argc, char **argv){
    int recursions = 0;
    double **vectors = NULL;
    // starts as NULL because meanshift() compares the pointed-to value against NULL
    double **shifted_points = NULL;
    char *labels = NULL;

    // argc/argv are only consumed by the disabled get_args call below
    (void)argc;
    (void)argv;

    params.epsilon = 0.0001;
    params.verbose = false;
    params.display = true;
    //get_args(argc, argv, &params); //commented out while in development
    init(&vectors, &labels);

    // tic
    gettimeofday (&startwtime, NULL);
    recursions = meanshift(vectors, &shifted_points, DEVIATION);

    // toc
    gettimeofday (&endwtime, NULL);
    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);


    printf("\nTotal number of recursions = %d\n", recursions);
    printf("%s wall clock time = %f\n","Mean Shift", seq_time);

    // each 2D array is released as its data block ([0]) followed by its row-pointer array
    free(vectors[0]);
    free(vectors);
    free(shifted_points[0]);
    free(shifted_points);
    // labels are allocated by init() and were never released before
    free(labels);

    return 0;
}
@ -0,0 +1,370 @@ |
|||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <math.h> |
||||
|
#include <float.h> |
||||
|
#include <string.h> |
||||
|
#include <sys/time.h> |
||||
|
|
||||
|
#include <cublas_v2.h> |
||||
|
|
||||
|
#include "meanshift_utils.h" |
||||
|
#include "meanshift_gpu_utils.h" |
||||
|
|
||||
|
// properties of the device selected by set_GPU()
cudaDeviceProp device_properties;

// timestamps shared by the wall clock measurements made in this file
struct timeval start_w_time;
struct timeval end_w_time;
// duration, in seconds, of the device memory allocation measured in meanshift()
double seq;
||||
|
|
||||
|
//Based on:
//          https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
//
// Selects the device with the most multiprocessors among those with compute capability >= 2.0,
// makes it the current device and stores its properties in the global device_properties.
// Terminates the program when no such device is available.
void set_GPU(){
    // max_device stays negative until a suitable device has been seen
    int devices_count = 0, max_multiprocessors = 0, max_device = -1;

    // gets devices count checking for errors like no devices or no drivers to check for
    // devices available
    gpuErrchk( cudaGetDeviceCount(&devices_count) );
    for(int device_index = 0; device_index < devices_count; ++device_index){
        // gets current index device's properties
        cudaDeviceProp this_device_properties;
        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );

        // stores best available device's index
        // only devices with compute capability >= 2.0 are able to run the code
        if (this_device_properties.major >= 2
                && max_multiprocessors < this_device_properties.multiProcessorCount){
            // stores devices properties for later use
            device_properties = this_device_properties;
            max_multiprocessors = this_device_properties.multiProcessorCount;
            max_device = device_index;
        }
    }

    // without this check device 0 would be selected with zeroed device_properties, which the
    // kernel wrappers later read to derive their block sizes
    if (max_device < 0){
        fprintf(stderr, "No CUDA device with compute capability >= 2.0 was found\n");
        exit(EXIT_FAILURE);
    }

    // sets the device
    gpuErrchk( cudaSetDevice(max_device) );
    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
            "Max threads per block supported are %d\n\n"
            , device_properties.name
            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
            , device_properties.maxThreadsPerBlock);
    }
}
||||
|
|
||||
|
/*
 * Performs one mean shift pass on the GPU and recurses until the norm of the mean shift
 * vector is no longer greater than params.epsilon.
 *
 * original_points: host array of NUMBER_OF_POINTS x DIMENSIONS values, never modified here
 * shifted_points:  address of the caller's array pointer; it is allocated on the first call
 *                  and replaced by the newly shifted array on every pass, so that on return
 *                  it refers to the points of the last pass. The caller releases it with
 *                  free((*shifted_points)[0]) followed by free(*shifted_points).
 * deviation:       deviation forwarded to the kernel matrix computation
 *
 * Returns the value of the static recursion counter.
 * NOTE(review): the counter and the other static variables are never reset, so the function
 * appears to be designed for a single top-level call per process.
 */
int meanshift(double **original_points, double ***shifted_points, int deviation){
    // host variables
    size_t size = 0;
    static int recursion = 0;
    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
    bool is_first_recursion = false;

    // device variables
    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
        d_mean_shift_vector;
    Matrix d_new_shift;

    // allocates memory and copies original points on first recursion
    if (recursion == 0 || (*shifted_points) == NULL){
        is_first_recursion = true;
        // allocates memory for shifted points array and copies original points into it
        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);

        // allocates memory for mean shift vector
        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        // initializes elements of mean_shift_vector to the largest finite double
        for (int i=0;i<NUMBER_OF_POINTS;i++){
            for (int j=0;j<DIMENSIONS;j++){
                mean_shift_vector[i][j] = DBL_MAX;
            }
        }

        // allocates memory for kernel_matrix
        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);

        // tic
        gettimeofday (&start_w_time, NULL);

        // allocates corresponding memory in device
        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
        // toc
        gettimeofday (&end_w_time, NULL);
        seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
            / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);

        if (params.verbose){
            printf("Device memory allocation wall clock time = %f\n\n", seq);
        }
    }

    // finds pairwise distance matrix (inside radius)
    // [I, D] = rangesearch(x,y,h);
    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
        &kernel_matrix, &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;

    // calculates denominator
    calculate_denominator(d_kernel_matrix, d_denominator);

    // creates new y vector
    // allocates memory in every recursion
    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    // allocates corresponding memory in device
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    // rows are stored back to back, so the row pitch equals the width
    d_new_shift.stride = DIMENSIONS;
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );

    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
        &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;

    // frees previously shifted points, they're now garbage; both the data block and the
    // row-pointer array are released
    free((*shifted_points)[0]);
    free(*shifted_points);
    gpuErrchk( cudaFree(d_shifted_points.elements) );
    // hands the new array to the caller; writing through the pointer (instead of rebinding the
    // parameter) is what keeps the caller's pointer valid after this function returns
    *shifted_points = new_shift;
    d_shifted_points.elements = d_new_shift.elements;

    if (params.display){
        save_matrix((*shifted_points), recursion);
    }

    // calculates norm of the new mean shift vector in GPU using "cuBlas" library function
    cublasHandle_t handle;
    cublasStatus_t cublas_status = cublasCreate(&handle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
        1, &current_norm);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    cublas_status = cublasDestroy(handle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }

    if (params.verbose){
        printf("Recursion n. %d, error\t%f \n", recursion, current_norm);
    }

    // recurses until convergence
    if (current_norm > params.epsilon) {
        ++recursion;
        meanshift(original_points, shifted_points, deviation);
    }

    // only the outermost call owns the static buffers, so only it releases them
    if (is_first_recursion){
        if (params.verbose){
            printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
        }

        // cleans up allocations
        free(mean_shift_vector[0]);
        free(mean_shift_vector);
        free(kernel_matrix[0]);
        free(kernel_matrix);

        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
        // free_device_memory does not receive the mean shift vector, it is released here
        gpuErrchk( cudaFree(d_mean_shift_vector.elements) );
    }

    return recursion;
}
||||
|
|
||||
|
/*
 * Describes and allocates every device matrix that lives for the whole run and uploads the
 * original and the shifted points.
 *
 * original_points, shifted_points: host arrays whose NUMBER_OF_POINTS x DIMENSIONS values are
 *     copied starting from element [0][0] as one contiguous block
 * d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, d_mean_shift_vector:
 *     descriptors filled in here; their elements point to freshly allocated device memory
 *
 * Every matrix is stored densely, therefore stride (the row pitch in elements) is set equal
 * to width. Any CUDA failure terminates the program through gpuErrchk.
 */
void init_device_memory(double **original_points, double **shifted_points,
        Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
        Matrix *d_denominator, Matrix *d_mean_shift_vector){
    // byte counts are computed in size_t so that large datasets do not overflow int
    size_t size;

    // allocates memory for original_points in GPU and copies the array
    d_original_points->width = DIMENSIONS;
    d_original_points->height = NUMBER_OF_POINTS;
    d_original_points->stride = DIMENSIONS;
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
        , size, cudaMemcpyHostToDevice) );

    // allocates memory for shifted_points in GPU and copies the array
    d_shifted_points->width = DIMENSIONS;
    d_shifted_points->height = NUMBER_OF_POINTS;
    d_shifted_points->stride = DIMENSIONS;
    size = (size_t)DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
        , size, cudaMemcpyHostToDevice) );

    // allocates memory for kernel_matrix in GPU
    d_kernel_matrix->width = NUMBER_OF_POINTS;
    d_kernel_matrix->height = NUMBER_OF_POINTS;
    d_kernel_matrix->stride = NUMBER_OF_POINTS;
    size = (size_t)NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );

    // allocates memory for denominator in GPU
    d_denominator->width = 1;
    d_denominator->height = NUMBER_OF_POINTS;
    d_denominator->stride = 1;
    size = (size_t)NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );

    // allocates memory for mean_shift_vector in GPU
    d_mean_shift_vector->width = DIMENSIONS;
    d_mean_shift_vector->height = NUMBER_OF_POINTS;
    d_mean_shift_vector->stride = DIMENSIONS;
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
}
||||
|
|
||||
|
/*
 * Launches calculate_kernel_matrix_kernel and copies the resulting kernel matrix to the host.
 *
 * d_shifted_points, d_original_points, d_kernel_matrix: device matrices handed to the kernel
 * deviation:      deviation handed to the kernel
 * kernel_matrix:  address of the host array that receives NUMBER_OF_POINTS x NUMBER_OF_POINTS
 *                 values, written as one contiguous block starting at [0][0]
 * w_memcpy_time:  receives the wall clock time, in seconds, of the device to host copy
 *
 * The block side starts at the square root of the device's maxThreadsPerBlock and is
 * decreased after every failed launch; the value that worked is kept for later calls.
 * The program terminates when no block size is left to try.
 */
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
        Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){
    size_t size;
    static bool first_iter = true;
    // gets max block size supported from the device
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)sqrt((double)max_block_size);
    bool block_size_too_big = true;

    dim3 dimBlock;
    dim3 dimGrid;
    do {
        // a launch that keeps failing must not be retried forever with an empty block
        if (requested_block_size < 1){
            fprintf(stderr, "calculate_kernel_matrix_kernel could not be launched with any block size\n");
            exit(EXIT_FAILURE);
        }

        dimBlock.x = requested_block_size;
        dimBlock.y = requested_block_size;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;

        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
            , deviation, d_kernel_matrix);
        if (cudaGetLastError() != cudaSuccess){
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);

    if (first_iter && params.verbose){
        printf("calculate_kernel_matrix_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    // computed in size_t so that large datasets do not overflow int
    size = (size_t)NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);

    // tic
    gettimeofday (&start_w_time, NULL);

    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
        , size, cudaMemcpyDeviceToHost) );

    // toc
    gettimeofday (&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
}
||||
|
|
||||
|
/*
 * Launches denominator_kernel, which stores the sum of every kernel matrix row in
 * d_denominator.
 *
 * The block size starts at the device's maxThreadsPerBlock and is decreased after every
 * failed launch; the value that worked is kept for later calls. The program terminates when
 * no block size is left to try.
 */
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
    static bool first_iter = true;
    // gets max block size supported from the device
    static int requested_block_size = device_properties.maxThreadsPerBlock;
    bool block_size_too_big = true;

    dim3 dimBlock;
    dim3 dimGrid;
    do {
        // a launch that keeps failing must not be retried forever with an empty block
        if (requested_block_size < 1){
            fprintf(stderr, "denominator_kernel could not be launched with any block size\n");
            exit(EXIT_FAILURE);
        }

        dimBlock.x = requested_block_size;
        dimBlock.y = 1;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;

        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
        if (cudaGetLastError() != cudaSuccess){
            --requested_block_size;
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);

    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
}
||||
|
|
||||
|
/*
 * Launches shift_points_kernel and copies the new positions and the mean shift vector to
 * the host.
 *
 * d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
 * d_mean_shift_vector: device matrices handed to the kernel
 * kernel_matrix, original_points: not used by this wrapper, kept for its callers
 * new_shift, mean_shift_vector:   addresses of the host arrays that receive
 *                                 NUMBER_OF_POINTS x DIMENSIONS values each, written as one
 *                                 contiguous block starting at [0][0]
 * w_memcpy_time: receives the wall clock time, in seconds, of the two device to host copies
 *
 * The kernel works on square tiles whose side is blockDim.y and the grid has a single block
 * column, so one tile has to span every column of d_new_shift: the block is therefore
 * d_new_shift.width x d_new_shift.width threads. Block dimensions are fixed by the data, so
 * a failed launch is reported instead of being retried.
 */
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
        Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
        double **original_points, double ***new_shift, double ***mean_shift_vector,
        double *w_memcpy_time){
    size_t size;
    static bool first_iter = true;

    dim3 dimBlock;
    dim3 dimGrid;

    dimBlock.x = d_new_shift.width;
    dimBlock.y = d_new_shift.width;
    dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
    dimGrid.y = 1;

    shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
        d_new_shift, d_denominator, d_mean_shift_vector);
    // reports an invalid launch configuration, then waits for the kernel to finish
    gpuErrchk( cudaGetLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    if (first_iter && params.verbose){
        printf("shift_points_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    // computed in size_t so that large datasets do not overflow int
    size = (size_t)NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);

    // tic
    gettimeofday (&start_w_time, NULL);

    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
        , size, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
        , size, cudaMemcpyDeviceToHost) );

    // toc
    gettimeofday (&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
}
||||
|
|
||||
|
/*
 * Releases the device buffers behind the four matrices it receives, in the order
 * original points, kernel matrix, denominator, shifted points. A failing cudaFree
 * terminates the program through gpuErrchk.
 */
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
        Matrix d_shifted_points){
    double *device_buffers[] = {
        d_original_points.elements,
        d_kernel_matrix.elements,
        d_denominator.elements,
        d_shifted_points.elements
    };
    const int buffers_count = (int)(sizeof(device_buffers) / sizeof(device_buffers[0]));

    for (int buffer = 0; buffer < buffers_count; ++buffer){
        gpuErrchk( cudaFree(device_buffers[buffer]) );
    }
}
@ -0,0 +1,58 @@ |
|||||
|
#ifndef SERIAL_GPU_UTILS_H /* Include guard */ |
||||
|
#define SERIAL_GPU_UTILS_H |
||||
|
|
||||
|
#include "meanshift_kernels.h" |
||||
|
|
||||
|
//GPU error check snippet taken from:
//          https://stackoverflow.com/a/14038590
//
// gpuErrchk wraps a CUDA runtime call and reports the file and line of the call on failure.
// The do/while(0) form makes the macro a single statement, so "gpuErrchk(x);" also works as
// the unbraced body of an if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the description of code together with the given file and line to stderr when code
// is not cudaSuccess and, unless abort is false, terminates the program with code as its
// exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
    if (code != cudaSuccess){
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
||||
|
|
||||
|
/* Global variables */
// deviation, dataset dimensions and input paths
extern int DEVIATION;
extern int NUMBER_OF_POINTS;
extern int DIMENSIONS;
extern const char* POINTS_FILENAME;
extern const char* LABELS_FILENAME;
// run parameters; the type is spelled "parameters" to agree with the definition of params
// and with the signature of get_args
// NOTE(review): the type itself is declared outside this header, confirm the spelling there
extern parameters params;
// properties of the device selected by set_GPU
extern cudaDeviceProp device_properties;
||||
|
|
||||
|
//Function set_GPU parses available GPU devices, selects the one with the most multi-processors for
//usage and stores its properties in global struct device_properties
void set_GPU();

//Function meanshift recursively shifts original points according to the mean-shift algorithm saving
//the result to shifted_points, deviation is the desirable deviation
int meanshift(double **original_points, double ***shifted_points, int deviation);

//Function init_device_memory allocates memory for necessary arrays in the device
void init_device_memory(double **original_points, double **shifted_points,
        Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
        Matrix *d_denominator, Matrix *d_mean_shift_vector);

//Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel
//"calculate_kernel_matrix_kernel" that calculates the kernel matrix
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
        Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time);

//Function calculate_denominator is a wrapper for the kernel call of the corresponding kernel
//"denominator_kernel" that calculates the denominator of shifted points fraction
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator);

//Function shift_points is a wrapper for the kernel call of the corresponding kernel
//"shift_points_kernel" that shifts the positions of all points
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
        Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
        double **original_points, double ***new_shift, double ***mean_shift_vector,
        double *w_memcpy_time);

//Function free_device_memory frees device's previously allocated memory
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
        Matrix d_shifted_points);
||||
|
|
||||
|
#endif //SERIAL_GPU_UTILS_H
|
@ -0,0 +1,144 @@ |
|||||
|
#include "meanshift_kernels.h" |
||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
|
||||
|
// Computes one element of kernel_matrix per thread: the element at (row, col) is derived from
// the distance between row "row" of shifted_points and row "col" of original_points.
// Pairs whose distance is not below deviation squared get 0, the remaining ones get
// exp(-distance^2 / (2 * deviation^2)); elements of the main diagonal are increased by 1.
// NOTE(review): the radius test compares the distance itself against deviation squared;
// confirm that this, and not a comparison against deviation, is the intended radius.
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
        double deviation, Matrix kernel_matrix){
    // each thread calculates one element of kernel_matrix
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;

    // performs calculations only if thread's indexes are within matrix bounds; row and col are
    // tested separately because a linearised test lets threads with col >= width through, and
    // those would read past original_points and overwrite elements of the following row
    if (row >= kernel_matrix.height || col >= kernel_matrix.width){
        return;
    }

    int dimensions = shifted_points.width;
    // calculate distance
    double sum = 0, dif;
    for (int i=0; i<dimensions; i++){
        dif = shifted_points.elements[row * dimensions + i]
            - original_points.elements[col * dimensions + i];
        sum += dif * dif;
    }
    double distance = sqrt(sum);

    double deviation_square = deviation*deviation;
    if (distance < deviation_square){
        // computes kernel matrix
        double exponent = ((-1)*(distance * distance))/(2*(deviation_square));
        kernel_matrix.elements[row * kernel_matrix.width + col] = exp(exponent);
    } else {
        kernel_matrix.elements[row * kernel_matrix.width + col] = 0;
    }
    if (row == col){
        kernel_matrix.elements[row * kernel_matrix.width + col] += 1;
    }
}
||||
|
|
||||
|
// Stores in denominator.elements[i] the sum of the elements of row i of kernel_matrix,
// one row per thread.
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
    // index of the kernel matrix row handled by this thread
    const int point = blockIdx.x * blockDim.x + threadIdx.x;

    // threads past the last row have nothing to do
    if (point < denominator.height){
        const double *row_begin = kernel_matrix.elements + point * kernel_matrix.width;
        double row_total = 0;

        // accumulates the row from its first to its last column
        for (int offset = 0; offset < kernel_matrix.width; ++offset){
            row_total += row_begin[offset];
        }
        denominator.elements[point] = row_total;
    }
}
||||
|
|
||||
|
// Computes new_shift = (kernel_matrix x original_points) / denominator, one element per thread,
// and mean_shift_vector = new_shift - shifted_points.
//
// The product is accumulated tile by tile: every block handles one square tile of new_shift
// whose side is blockDim.y, staging the matching tiles of kernel_matrix and original_points in
// shared memory. Blocks must be square and at most MAX_TILE_SIDE threads wide; other launch
// shapes leave the outputs untouched.
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
        Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
    // largest tile side the statically sized shared buffers can hold
    enum { MAX_TILE_SIDE = 32 };

    // shared memory used to store sub_kernel_matrix and sub_original_points respectively
    __shared__ double s_sub_kernel_matrix[MAX_TILE_SIDE * MAX_TILE_SIDE];
    __shared__ double s_sub_original_points[MAX_TILE_SIDE * MAX_TILE_SIDE];

    int BLOCK_SIZE = blockDim.y;
    int block_row = blockIdx.x;
    int block_col = blockIdx.y;

    // Thread row and column within sub_new_shift
    int row = threadIdx.x;
    int col = threadIdx.y;

    // both tests below give the same answer for every thread of a block, so either the whole
    // block leaves or the whole block reaches the barriers inside the loop
    if (blockDim.x != blockDim.y || BLOCK_SIZE > MAX_TILE_SIDE){
        return;
    }
    if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
        return;
    }

    // position of this thread's element within the whole matrices
    int global_row = block_row * BLOCK_SIZE + row;
    int global_col = block_col * BLOCK_SIZE + col;

    // the matrices are indexed as densely stored (like the other kernels do), so the row pitch
    // used for the sub-matrix views is the width; the assignments only change this kernel's
    // by-value copies of the descriptors
    kernel_matrix.stride = kernel_matrix.width;
    original_points.stride = original_points.width;

    // each thread computes one element of new_shift by accumulating results into cell_value
    double cell_value = 0;

    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
    // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results;
    // the count is rounded up so that a trailing partial tile is not dropped
    int sub_matrices_count = (kernel_matrix.width + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int sub_matrix_index = 0; sub_matrix_index < sub_matrices_count; ++sub_matrix_index) {

        // gets sub-matrix sub_kernel_matrix of kernel_matrix
        Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE);
        // gets sub-matrix sub_original_points of original_points
        Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE);

        // column of kernel_matrix and row of original_points this thread is about to load
        int kernel_col = sub_matrix_index * BLOCK_SIZE + col;
        int points_row = sub_matrix_index * BLOCK_SIZE + row;

        // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared
        // memory, each thread loads one element of each sub-matrix; positions that fall outside
        // the matrices are loaded as 0 so that they do not contribute to the product
        s_sub_kernel_matrix[row * BLOCK_SIZE + col] =
            (global_row < kernel_matrix.height && kernel_col < kernel_matrix.width)
            ? sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col] : 0.0;
        s_sub_original_points[row * BLOCK_SIZE + col] =
            (points_row < original_points.height && global_col < original_points.width)
            ? sub_original_points.elements[row * sub_original_points.stride + col] : 0.0;

        // synchronizes to make sure the sub-matrices are loaded before starting the computation
        __syncthreads();

        // multiplies sub_kernel_matrix and sub_original_points; the shared tiles are packed with
        // a pitch of BLOCK_SIZE
        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
            cell_value += s_sub_kernel_matrix[row * BLOCK_SIZE + element_index] *
                s_sub_original_points[element_index * BLOCK_SIZE + col];
        }

        // synchronizes to make sure that the preceding computation is done before loading two new
        // sub-matrices of kernel_matrix and original_points in the next iteration
        __syncthreads();
    }

    // threads of a partial tile that fall outside new_shift have nothing to store
    if (global_row >= new_shift.height || global_col >= new_shift.width){
        return;
    }

    // new_shift elements are calculated by dividing with the denominator
    double new_position = cell_value / denominator.elements[global_row];
    new_shift.elements[global_row * new_shift.width + global_col] = new_position;

    // calculates mean-shift vector
    mean_shift_vector.elements[global_row * mean_shift_vector.width + global_col] =
        new_position - shifted_points.elements[global_row * shifted_points.width + global_col];
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    // Asub is a view into A's storage, so its rows are as far apart as the rows of A
    Asub.stride = A.stride;
    Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
    return Asub;
}
@ -0,0 +1,29 @@ |
|||||
|
#ifndef SERIAL_KERNELS_H /* Include guard */ |
||||
|
#define SERIAL_KERNELS_H |
||||
|
|
||||
|
/* Structures */ |
||||
|
|
||||
|
//Matrix describes a matrix of doubles stored row after row in one buffer
typedef struct Matrix {
    // number of columns
    int width;
    // number of rows
    int height;
    // distance, in elements, between the beginnings of two consecutive rows
    int stride;
    // buffer holding the values
    double *elements;
} Matrix;
||||
|
|
||||
|
//Kernel calculate_kernel_matrix_kernel fills kernel_matrix using the shifted points, the
//original points and the deviation
__global__ void calculate_kernel_matrix_kernel(
        Matrix shifted_points,
        Matrix original_points,
        double deviation,
        Matrix kernel_matrix);

//Kernel denominator_kernel sums every row of kernel_matrix into denominator, the sums are the
//denominators of the fraction that gives the new (shifted) positions of the points
__global__ void denominator_kernel(
        Matrix denominator,
        Matrix kernel_matrix);

//Kernel shift_points_kernel computes the new positions of all points into new_shift and the
//new mean shift vector into mean_shift_vector
__global__ void shift_points_kernel(
        Matrix original_points,
        Matrix kernel_matrix,
        Matrix shifted_points,
        Matrix new_shift,
        Matrix denominator,
        Matrix mean_shift_vector);

//Device function GetSubMatrix returns the BLOCK_SIZE x BLOCK_SIZE sub-matrix of A found row
//sub-matrices down and col sub-matrices to the right of A's upper-left corner
__device__ Matrix GetSubMatrix(
        Matrix A,
        int row,
        int col,
        int BLOCK_SIZE);
||||
|
|
||||
|
#endif //SERIAL_KERNELS_H
|
@ -0,0 +1,165 @@ |
|||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <math.h> |
||||
|
#include <float.h> |
||||
|
#include <string.h> |
||||
|
|
||||
|
#include "meanshift_utils.h" |
||||
|
#include "meanshift_gpu_utils.h" |
||||
|
|
||||
|
#define OUTPUT_PREFIX "../output/output_" |
||||
|
|
||||
|
// Converts text to an int, terminating the program when text is not a whole base-10 number
// that fits in an int; name identifies the argument in the error message.
static int parse_int_arg(const char *text, const char *name){
    char *end = NULL;
    long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || (long)(int)value != value){
        printf("Couldn't parse %s as an integer: %s\n", name, text);
        exit(EXIT_FAILURE);
    }
    return (int)value;
}

// Converts text to a double, terminating the program when text is not entirely a number;
// name identifies the argument in the error message.
static double parse_double_arg(const char *text, const char *name){
    char *end = NULL;
    double value = strtod(text, &end);
    if (end == text || *end != '\0'){
        printf("Couldn't parse %s as a number: %s\n", name, text);
        exit(EXIT_FAILURE);
    }
    return value;
}

/*
 * Reads the run configuration from the command line.
 *
 * argv[1..6] set DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME
 * and LABELS_FILENAME in this order; every further argument must be one of the optional
 * flags --verbose | -v and --output | -o, which set params->verbose and params->display.
 *
 * The program terminates after printing the usage text when fewer than six values are
 * given, and after an error message when a number cannot be parsed, when the number of
 * points or of dimensions is not positive, or when a flag is unknown.
 */
void get_args(int argc, char **argv, parameters *params){
    if (argc < 7) {
        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
        "\th is the variance\n"
        "\te is the min distance, between two points, that is taken into account in computations\n"
        "\tN is the the number of points\n"
        "\tD is the number of dimensions of each point\n"
        "\tPd is the path of the dataset file\n"
        "\tPl is the path of the labels file\n"
        "\n\t--verbose | -v is an optional flag to enable execution information output"
        "\n\t--output | -o is an optional flag to enable points output in each iteration\n", argv[0]);
        exit(1);
    }

    DEVIATION = parse_int_arg(argv[1], "h");
    params->epsilon = parse_double_arg(argv[2], "e");
    NUMBER_OF_POINTS = parse_int_arg(argv[3], "N");
    DIMENSIONS = parse_int_arg(argv[4], "D");
    // both values are used as array dimensions, so they have to be positive
    if (NUMBER_OF_POINTS <= 0 || DIMENSIONS <= 0){
        printf("N and D must be positive, got N = %d and D = %d\n", NUMBER_OF_POINTS, DIMENSIONS);
        exit(EXIT_FAILURE);
    }
    POINTS_FILENAME = argv[5];
    LABELS_FILENAME = argv[6];
    params->verbose = false;
    params->display = false;

    // everything after the six mandatory values has to be a known flag
    for (int index=7; index<argc; ++index){
        if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
            params->verbose = true;
        } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
            params->display = true;
        } else {
            printf("Couldn't parse argument %d: %s\n", index, argv[index]);
            exit(EXIT_FAILURE);
        }
    }
}
||||
|
|
||||
|
void init(double ***vectors, char **labels){ |
||||
|
int bytes_read = 0; |
||||
|
|
||||
|
set_GPU(); |
||||
|
|
||||
|
if (params.verbose){ |
||||
|
printf("Reading dataset and labels...\n"); |
||||
|
} |
||||
|
|
||||
|
// initializes vectors |
||||
|
FILE *points_file; |
||||
|
points_file = fopen(POINTS_FILENAME, "rb"); |
||||
|
if (points_file != NULL){ |
||||
|
// allocates memory for the array |
||||
|
(*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
||||
|
// reads vectors dataset from file |
||||
|
for (int i=0; i<NUMBER_OF_POINTS; i++){ |
||||
|
bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file); |
||||
|
if ( bytes_read != DIMENSIONS ){ |
||||
|
if(feof(points_file)){ |
||||
|
printf("Premature end of file reached.\n"); |
||||
|
} else{ |
||||
|
printf("Error reading points file."); |
||||
|
} |
||||
|
fclose(points_file); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
} |
||||
|
} else { |
||||
|
printf("Error reading dataset file.\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
fclose(points_file); |
||||
|
|
||||
|
// initializes file that will contain the labels (train) |
||||
|
FILE *labels_file; |
||||
|
labels_file = fopen(LABELS_FILENAME, "rb"); |
||||
|
if (labels_file != NULL){ |
||||
|
// NOTE : Labels were classified as <class 'numpy.uint8'> |
||||
|
// variables of type uint8 are stored as 1-byte (8-bit) unsigned integers |
||||
|
// gets number of labels |
||||
|
fseek(labels_file, 0L, SEEK_END); |
||||
|
long int pos = ftell(labels_file); |
||||
|
rewind(labels_file); |
||||
|
int label_elements = pos/ sizeof(char); |
||||
|
|
||||
|
// allocates memory for the array |
||||
|
*labels = (char*)malloc(label_elements* sizeof(char)); |
||||
|
fseek(labels_file, 0L, SEEK_SET); |
||||
|
bytes_read = fread((*labels), sizeof(char), label_elements, labels_file); |
||||
|
if ( bytes_read != label_elements ){ |
||||
|
if(feof(points_file)){ |
||||
|
printf("Premature end of file reached.\n"); |
||||
|
} else{ |
||||
|
printf("Error reading points file."); |
||||
|
} |
||||
|
fclose(labels_file); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
} |
||||
|
fclose(labels_file); |
||||
|
|
||||
|
if (params.verbose){ |
||||
|
printf("Done.\n\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
//Function alloc_double allocates a rows x cols array of doubles.
//All elements live in one contiguous block, and the returned array of row pointers indexes into
//that block, so array[i][j] and array[0][i*cols + j] name the same element.
//  rows, cols: dimensions of the array
//  returns:    the row pointer array, or NULL when a dimension is negative, when the requested
//              size does not fit in size_t or when an allocation fails
//To release the array free array[0] (the data block) and then array itself.
double **alloc_double(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }

    // sizes are computed in size_t; the int product rows*cols could overflow
    size_t row_count = (size_t) rows;
    size_t col_count = (size_t) cols;
    size_t max_size = (size_t) -1;
    if (row_count > max_size / sizeof(double *)
            || (col_count != 0 && row_count > max_size / sizeof(double) / col_count)) {
        return NULL;
    }

    double *data = (double *) malloc(row_count * col_count * sizeof(double));
    double **array = (double **) malloc(row_count * sizeof(double *));
    if (data == NULL || array == NULL) {
        // free(NULL) is a no-op, so whichever allocation succeeded is released
        free(data);
        free(array);
        return NULL;
    }

    for (size_t i = 0; i < row_count; i++) {
        array[i] = data + col_count * i;
    }
    return array;
}
||||
|
|
||||
|
//Function duplicate copies every element of the rows x cols array source into the array that
//dest points to. The destination rows must already exist; nothing is allocated here.
void duplicate(double **source, int rows, int cols, double ***dest){
    int r = 0;
    while (r < rows){
        int c = 0;
        while (c < cols){
            const double value = source[r][c];
            double *target_row = (*dest)[r];
            target_row[c] = value;
            ++c;
        }
        ++r;
    }
}
||||
|
|
||||
|
//Function print_matrix writes the rows x cols array to the console in transposed layout:
//one output line per column, each holding that column's value from every row, with every value
//followed by a single space.
void print_matrix(double **array, int rows, int cols){
    for (int col = 0; col < cols; ++col){
        int row = 0;
        while (row < rows){
            printf("%f ", array[row][col]);
            ++row;
        }
        printf("\n");
    }
}
||||
|
|
||||
|
void save_matrix(double **matrix, int iteration){ |
||||
|
char filename[50]; |
||||
|
snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration); |
||||
|
FILE *file; |
||||
|
file = fopen(filename, "w"); |
||||
|
for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){ |
||||
|
for (int cols=0; cols<DIMENSIONS; ++cols){ |
||||
|
fprintf(file, "%f", matrix[rows][cols]); |
||||
|
if (cols != DIMENSIONS - 1){ |
||||
|
fprintf(file, ","); |
||||
|
} |
||||
|
} |
||||
|
fprintf(file, "\n"); |
||||
|
} |
||||
|
} |
@ -0,0 +1,35 @@ |
|||||
|
#ifndef SERIAL_UTILS_H /* Include guard */ |
||||
|
#define SERIAL_UTILS_H |
||||
|
|
||||
|
#include <stdbool.h> |
||||
|
|
||||
|
/* Structures */ |
||||
|
|
||||
|
//Parameters groups the session specific settings that get_args fills in
typedef struct parameters {
    //the min distance, between two points, that is taken into account in computations
    double epsilon;
    //true when --verbose | -v was given: enables execution information output
    bool verbose;
    //true when --output | -o was given: enables points output in each iteration
    bool display;
} Parameters;
||||
|
|
||||
|
//Function get_args parses command line arguments
|
||||
|
void get_args(int argc, char **argv, Parameters *params); |
||||
|
|
||||
|
//Function init sets up the GPU for later use, gets its properties and reads the dataset and label
|
||||
|
//arrays from the corresponding files
|
||||
|
void init(double ***vectors, char **labels); |
||||
|
|
||||
|
//Function alloc_double allocates a rows x cols array of doubles in one contiguous block of memory
|
||||
|
double **alloc_double(int rows, int cols); |
||||
|
|
||||
|
//Function duplicate copies the values of source array to dest array
|
||||
|
void duplicate(double **source, int rows, int cols, double ***dest); |
||||
|
|
||||
|
//Function print_matrix prints array of dimensions <rows X cols> to the console, transposed (one line per column)
|
||||
|
void print_matrix(double **array, int rows, int cols); |
||||
|
|
||||
|
//Function save_matrix stores matrix in a csv file with path/filename "../output/output_<iteration>".
//If the file already exists it is overwritten
|
||||
|
void save_matrix(double **matrix, int iteration); |
||||
|
|
||||
|
#endif //SERIAL_UTILS_H
|
Loading…
Reference in new issue