Apostolos Fanakis
7 years ago
8 changed files with 889 additions and 0 deletions
@ -0,0 +1,43 @@ |
|||
SHELL := /bin/bash |
|||
|
|||
# ============================================
|
|||
# COMMANDS
|
|||
|
|||
CC = nvcc |
|||
HOST_COMPILER = -ccbin gcc |
|||
CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets -lcublas |
|||
C_FLAGS = -lm -O3 -I. |
|||
|
|||
COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS) |
|||
LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS) |
|||
|
|||
OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o |
|||
DEPS = meanshift_utils.h meanshift_kernels.h |
|||
|
|||
RM = rm -f |
|||
|
|||
# ==========================================
|
|||
# TARGETS
|
|||
|
|||
EXECUTABLES = meanshift |
|||
|
|||
.PHONY: all clean |
|||
|
|||
all: $(EXECUTABLES) |
|||
|
|||
# ==========================================
|
|||
# DEPENDENCIES (HEADERS)
|
|||
|
|||
%.o: %.cu $(DEPS) |
|||
$(CC) $(COMPILE_FLAGS) $< -o $@ |
|||
|
|||
.PRECIOUS: $(EXECUTABLES) $(OBJ) |
|||
|
|||
# ==========================================
|
|||
# EXECUTABLE (MAIN)
|
|||
|
|||
$(EXECUTABLES): $(OBJ) |
|||
$(CC) $(LINK_FLAGS) $(OBJ) -o $@ |
|||
|
|||
clean: |
|||
$(RM) *.o *~ $(EXECUTABLES) |
@ -0,0 +1,45 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <sys/time.h> |
|||
|
|||
#include "meanshift_utils.h" |
|||
#include "meanshift_gpu_utils.h" |
|||
|
|||
int DEVIATION = 1; |
|||
int NUMBER_OF_POINTS = 600; |
|||
int DIMENSIONS = 2; |
|||
const char *POINTS_FILENAME = "../data/X.bin"; |
|||
const char *LABELS_FILENAME = "../data/L.bin"; |
|||
parameters params; |
|||
|
|||
struct timeval startwtime, endwtime; |
|||
double seq_time; |
|||
|
|||
int main(int argc, char **argv){ |
|||
int recursions = 0; |
|||
double **vectors, **shifted_points; |
|||
char *labels; |
|||
|
|||
params.epsilon = 0.0001; |
|||
params.verbose = false; |
|||
params.display = true; |
|||
//get_args(argc, argv, ¶ms); //commented out while in development |
|||
init(&vectors, &labels); |
|||
|
|||
// tic |
|||
gettimeofday (&startwtime, NULL); |
|||
recursions = meanshift(vectors, &shifted_points, DEVIATION); |
|||
|
|||
// toc |
|||
gettimeofday (&endwtime, NULL); |
|||
seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec); |
|||
|
|||
|
|||
printf("\nTotal number of recursions = %d\n", recursions); |
|||
printf("%s wall clock time = %f\n","Mean Shift", seq_time); |
|||
|
|||
free(vectors[0]); |
|||
free(vectors); |
|||
free(shifted_points[0]); |
|||
free(shifted_points); |
|||
} |
@ -0,0 +1,370 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <math.h> |
|||
#include <float.h> |
|||
#include <string.h> |
|||
#include <sys/time.h> |
|||
|
|||
#include <cublas_v2.h> |
|||
|
|||
#include "meanshift_utils.h" |
|||
#include "meanshift_gpu_utils.h" |
|||
|
|||
cudaDeviceProp device_properties; |
|||
|
|||
struct timeval start_w_time, end_w_time; |
|||
double seq; |
|||
|
|||
//Based on: |
|||
// https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU |
|||
void set_GPU(){ |
|||
int devices_count = 0, max_multiprocessors = 0, max_device = 0; |
|||
|
|||
// gets devices count checking for errors like no devices or no drivers to check for |
|||
// devices available |
|||
gpuErrchk( cudaGetDeviceCount(&devices_count) ); |
|||
for(int device_index = 0; device_index < devices_count; ++device_index){ |
|||
// gets current index device's properties |
|||
cudaDeviceProp this_device_properties; |
|||
gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) ); |
|||
|
|||
// stores best available device's index |
|||
// only devices with compute capability >= 2.0 are able to run the code |
|||
if (max_multiprocessors < this_device_properties.multiProcessorCount |
|||
&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){ |
|||
// stores devices properties for later use |
|||
device_properties = this_device_properties; |
|||
max_multiprocessors = this_device_properties.multiProcessorCount; |
|||
max_device = device_index; |
|||
} |
|||
} |
|||
// sets the device |
|||
gpuErrchk( cudaSetDevice(max_device) ); |
|||
if (params.verbose){ |
|||
printf("Device chosen is \"%s\"\n" |
|||
"Device has %d multi processors and compute capability %d.%d\n" |
|||
"Max threads per block supported are %d\n\n" |
|||
, device_properties.name |
|||
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor |
|||
, device_properties.maxThreadsPerBlock); |
|||
} |
|||
} |
|||
|
|||
int meanshift(double **original_points, double ***shifted_points, int deviation){ |
|||
// host variables |
|||
int size = 0; |
|||
static int recursion = 0; |
|||
static double **kernel_matrix, **mean_shift_vector, w_memcpy_time; |
|||
double **new_shift, current_norm = 0, tmp_w_memcpy_time; |
|||
bool is_first_recursion = false; |
|||
|
|||
// device variables |
|||
static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, |
|||
d_mean_shift_vector; |
|||
Matrix d_new_shift; |
|||
|
|||
// allocates memory and copies original points on first recursion |
|||
if (recursion == 0 || (*shifted_points) == NULL){ |
|||
is_first_recursion = true; |
|||
// allocates memory for shifted points array and copies original points into it |
|||
(*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); |
|||
|
|||
// allocates memory for mean shift vector |
|||
mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
// initializes elements of mean_shift_vector to inf |
|||
for (int i=0;i<NUMBER_OF_POINTS;i++){ |
|||
for (int j=0;j<DIMENSIONS;j++){ |
|||
mean_shift_vector[i][j] = DBL_MAX; |
|||
} |
|||
} |
|||
|
|||
// allocates memory for kernel_matrix |
|||
kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS); |
|||
|
|||
// tic |
|||
gettimeofday (&start_w_time, NULL); |
|||
|
|||
// allocates corresponding memory in device |
|||
init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, |
|||
&d_kernel_matrix, &d_denominator, &d_mean_shift_vector); |
|||
// toc |
|||
gettimeofday (&end_w_time, NULL); |
|||
seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec) |
|||
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); |
|||
|
|||
if (params.verbose){ |
|||
printf("Device memory allocation wall clock time = %f\n\n", seq); |
|||
} |
|||
} |
|||
|
|||
// finds pairwise distance matrix (inside radius) |
|||
// [I, D] = rangesearch(x,y,h); |
|||
calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation, |
|||
&kernel_matrix, &tmp_w_memcpy_time); |
|||
w_memcpy_time += tmp_w_memcpy_time; |
|||
|
|||
// calculates denominator |
|||
calculate_denominator(d_kernel_matrix, d_denominator); |
|||
|
|||
// creates new y vector |
|||
// allocates memory in every recursion |
|||
new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
// allocates corresponding memory in device |
|||
d_new_shift.width = DIMENSIONS; |
|||
d_new_shift.height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); |
|||
|
|||
shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator, |
|||
d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector, |
|||
&tmp_w_memcpy_time); |
|||
w_memcpy_time += tmp_w_memcpy_time; |
|||
|
|||
for (int row=0; row<2; ++row){ |
|||
for (int col=0; col<2; ++col){ |
|||
printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]); |
|||
printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]); |
|||
printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]); |
|||
} |
|||
} |
|||
|
|||
if(is_first_recursion){ |
|||
exit(0); |
|||
} |
|||
|
|||
// frees previously shifted points, they're now garbage |
|||
free((*shifted_points)[0]); |
|||
gpuErrchk( cudaFree(d_shifted_points.elements) ); |
|||
// updates shifted points pointer to the new array address |
|||
shifted_points = &new_shift; |
|||
d_shifted_points.elements = d_new_shift.elements; |
|||
|
|||
if (params.display){ |
|||
save_matrix((*shifted_points), recursion); |
|||
} |
|||
|
|||
// calculates norm of the new mean shift vector in GPU using "cuBlas" library function |
|||
cublasHandle_t handle; |
|||
cublasStatus_t cublas_status = cublasCreate(&handle); |
|||
if (cublas_status != CUBLAS_STATUS_SUCCESS){ |
|||
exit(cublas_status); |
|||
} |
|||
cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements, |
|||
1, ¤t_norm); |
|||
if (cublas_status != CUBLAS_STATUS_SUCCESS){ |
|||
exit(cublas_status); |
|||
} |
|||
cublas_status = cublasDestroy(handle); |
|||
if (cublas_status != CUBLAS_STATUS_SUCCESS){ |
|||
exit(cublas_status); |
|||
} |
|||
|
|||
if (params.verbose){ |
|||
printf("Recursion n. %d, error\t%f \n", recursion, current_norm); |
|||
} |
|||
|
|||
// recurses until convergence |
|||
if (current_norm > params.epsilon) { |
|||
++recursion; |
|||
meanshift(original_points, shifted_points, deviation); |
|||
} |
|||
|
|||
if (is_first_recursion){ |
|||
if (params.verbose){ |
|||
printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time); |
|||
} |
|||
|
|||
// cleans up allocations |
|||
free(mean_shift_vector[0]); |
|||
free(mean_shift_vector); |
|||
free(kernel_matrix[0]); |
|||
free(kernel_matrix); |
|||
|
|||
free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points); |
|||
} |
|||
|
|||
return recursion; |
|||
} |
|||
|
|||
void init_device_memory(double **original_points, double **shifted_points, |
|||
Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix, |
|||
Matrix *d_denominator, Matrix *d_mean_shift_vector){ |
|||
int size; |
|||
|
|||
// allocates memory for original_points in GPU and copies the array |
|||
d_original_points->width = DIMENSIONS; |
|||
d_original_points->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_original_points->elements), size) ); |
|||
gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0]) |
|||
, size, cudaMemcpyHostToDevice) ); |
|||
|
|||
// allocates memory for shifted_points in GPU and copies the array |
|||
d_shifted_points->width = DIMENSIONS; |
|||
d_shifted_points->height = NUMBER_OF_POINTS; |
|||
size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) ); |
|||
gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0]) |
|||
, size, cudaMemcpyHostToDevice) ); |
|||
|
|||
// allocates memory for kernel_matrix in GPU |
|||
d_kernel_matrix->width = NUMBER_OF_POINTS; |
|||
d_kernel_matrix->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) ); |
|||
|
|||
// allocates memory for denominator in GPU |
|||
d_denominator->width = 1; |
|||
d_denominator->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_denominator->elements), size) ); |
|||
|
|||
// allocates memory for mean_shift_vector in GPU |
|||
d_mean_shift_vector->width = DIMENSIONS; |
|||
d_mean_shift_vector->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) ); |
|||
} |
|||
|
|||
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|||
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){ |
|||
int size; |
|||
static bool first_iter = true; |
|||
// gets max block size supported from the device |
|||
static int max_block_size = device_properties.maxThreadsPerBlock; |
|||
static int requested_block_size = (int)sqrt(max_block_size); |
|||
bool block_size_too_big = true; |
|||
|
|||
dim3 dimBlock; |
|||
dim3 dimGrid; |
|||
do { |
|||
dimBlock.x = requested_block_size; |
|||
dimBlock.y = requested_block_size; |
|||
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; |
|||
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y; |
|||
|
|||
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points |
|||
, deviation, d_kernel_matrix); |
|||
if (cudaGetLastError() != cudaSuccess){ |
|||
--requested_block_size; |
|||
} else { |
|||
block_size_too_big = false; |
|||
gpuErrchk( cudaDeviceSynchronize() ); |
|||
} |
|||
} while(block_size_too_big); |
|||
|
|||
if (first_iter && params.verbose){ |
|||
printf("calculate_kernel_matrix_kernel called with:\n"); |
|||
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); |
|||
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); |
|||
first_iter = false; |
|||
} |
|||
|
|||
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); |
|||
|
|||
// tic |
|||
gettimeofday (&start_w_time, NULL); |
|||
|
|||
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
|
|||
// toc |
|||
gettimeofday (&end_w_time, NULL); |
|||
*w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) |
|||
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); |
|||
} |
|||
|
|||
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){ |
|||
static bool first_iter = true; |
|||
// gets max block size supported from the device |
|||
static int requested_block_size = device_properties.maxThreadsPerBlock; |
|||
bool block_size_too_big = true; |
|||
|
|||
dim3 dimBlock; |
|||
dim3 dimGrid; |
|||
do { |
|||
dimBlock.x = requested_block_size; |
|||
dimBlock.y = 1; |
|||
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; |
|||
dimGrid.y = 1; |
|||
|
|||
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix); |
|||
if (cudaGetLastError() != cudaSuccess){ |
|||
--requested_block_size; |
|||
} else { |
|||
block_size_too_big = false; |
|||
gpuErrchk( cudaDeviceSynchronize() ); |
|||
} |
|||
} while(block_size_too_big); |
|||
|
|||
if (first_iter && params.verbose){ |
|||
printf("calculate_denominator called with:\n"); |
|||
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); |
|||
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); |
|||
first_iter = false; |
|||
} |
|||
} |
|||
|
|||
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, |
|||
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, |
|||
double **original_points, double ***new_shift, double ***mean_shift_vector, |
|||
double *w_memcpy_time){ |
|||
int size; |
|||
static bool first_iter = true; |
|||
// gets max block size supported from the device |
|||
static int max_block_size = device_properties.maxThreadsPerBlock; |
|||
static int requested_block_size = (int)(max_block_size / d_new_shift.width); |
|||
bool block_size_too_big = true; |
|||
|
|||
dim3 dimBlock; |
|||
dim3 dimGrid; |
|||
do { |
|||
/*dimBlock.x = requested_block_size; |
|||
dimBlock.y = d_new_shift.width;*/ |
|||
dimBlock.x = 2; |
|||
dimBlock.y = 2; |
|||
dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x; |
|||
dimGrid.y = 1; |
|||
|
|||
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points, |
|||
d_new_shift, d_denominator, d_mean_shift_vector); |
|||
if (cudaGetLastError() != cudaSuccess){ |
|||
--requested_block_size; |
|||
} else { |
|||
block_size_too_big = false; |
|||
gpuErrchk( cudaDeviceSynchronize() ); |
|||
} |
|||
} while(block_size_too_big); |
|||
|
|||
if (first_iter && params.verbose){ |
|||
printf("shift_points_kernel called with:\n"); |
|||
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); |
|||
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); |
|||
first_iter = false; |
|||
} |
|||
|
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
|
|||
// tic |
|||
gettimeofday (&start_w_time, NULL); |
|||
|
|||
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
|
|||
// toc |
|||
gettimeofday (&end_w_time, NULL); |
|||
*w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) |
|||
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); |
|||
} |
|||
|
|||
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, |
|||
Matrix d_shifted_points){ |
|||
// frees all memory previously allocated in device |
|||
gpuErrchk( cudaFree(d_original_points.elements) ); |
|||
gpuErrchk( cudaFree(d_kernel_matrix.elements) ); |
|||
gpuErrchk( cudaFree(d_denominator.elements) ); |
|||
gpuErrchk( cudaFree(d_shifted_points.elements) ); |
|||
} |
@ -0,0 +1,58 @@ |
|||
#ifndef SERIAL_GPU_UTILS_H /* Include guard */ |
|||
#define SERIAL_GPU_UTILS_H |
|||
|
|||
#include "meanshift_kernels.h" |
|||
|
|||
//GPU error check snippet taken from:
|
|||
// https://stackoverflow.com/a/14038590
|
|||
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } |
|||
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ |
|||
if (code != cudaSuccess){ |
|||
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); |
|||
if (abort) exit(code); |
|||
} |
|||
} |
|||
|
|||
/* Global variables */ |
|||
extern int DEVIATION; |
|||
extern int NUMBER_OF_POINTS; |
|||
extern int DIMENSIONS; |
|||
extern const char* POINTS_FILENAME; |
|||
extern const char* LABELS_FILENAME; |
|||
extern Parameters params; |
|||
extern cudaDeviceProp device_properties; |
|||
|
|||
//Function set_GPU parses available GPU devices, selects the one with the most multi-processors for
|
|||
//usage and stores its properties in global struct device_properties
|
|||
void set_GPU(); |
|||
|
|||
//Function meanshift recursively shifts original points according to the mean-shift algorithm saving
|
|||
//the result to shiftedPoints, h is the desirable deviation
|
|||
int meanshift(double **original_points, double ***shifted_points, int h); |
|||
|
|||
//Function init_device_memory allocates memory for necessary arrays in the device
|
|||
void init_device_memory(double **original_points, double **shifted_points, |
|||
Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix, |
|||
Matrix *d_denominator, Matrix *d_new_shift); |
|||
|
|||
//Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel
|
|||
//"calculate_kernel_matrix_kernel" that calculates the kernel matrix
|
|||
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|||
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time); |
|||
|
|||
//Function calculate_denominator is a wrapper for the kernel call of the corresponding kernel
|
|||
//"calculate_denominator_kernel" that calculates the denominator of shifted points fraction
|
|||
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator); |
|||
|
|||
//Function shift_points is a wrapper for the kernel call of the corresponding kernel
|
|||
//"shift_points_kernel" that shifts the positions of all points
|
|||
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, |
|||
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, |
|||
double **original_points, double ***new_shift, double ***mean_shift_vector, |
|||
double *w_memcpy_time); |
|||
|
|||
//Function free_device_memory frees device's previously allocated memory
|
|||
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, |
|||
Matrix d_shifted_points); |
|||
|
|||
#endif //SERIAL_GPU_UTILS_H
|
@ -0,0 +1,144 @@ |
|||
#include "meanshift_kernels.h" |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
|
|||
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points, |
|||
double deviation, Matrix kernel_matrix){ |
|||
// each thread calculates one element of kernel_matrix |
|||
int row = blockIdx.x * blockDim.x + threadIdx.x; |
|||
int col = blockIdx.y * blockDim.y + threadIdx.y; |
|||
|
|||
// performs calculations only if thread's indexes are within matrix bounds |
|||
if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){ |
|||
return; |
|||
} |
|||
|
|||
int dimensions = shifted_points.width; |
|||
// calculate distance |
|||
double sum = 0, dif; |
|||
for (int i=0; i<dimensions; i++){ |
|||
dif = shifted_points.elements[row * dimensions + i] |
|||
- original_points.elements[col * dimensions + i]; |
|||
sum += dif * dif; |
|||
} |
|||
double distance = sqrt(sum); |
|||
|
|||
double deviation_square = deviation*deviation; |
|||
if (distance < deviation_square){ |
|||
// computes kernel matrix |
|||
double pow = ((-1)*(distance * distance))/(2*(deviation_square)); |
|||
kernel_matrix.elements[row * kernel_matrix.width + col] = exp(pow); |
|||
} else { |
|||
kernel_matrix.elements[row * kernel_matrix.width + col] = 0; |
|||
} |
|||
if (row == col){ |
|||
kernel_matrix.elements[row * kernel_matrix.width + col] += 1; |
|||
} |
|||
} |
|||
|
|||
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ |
|||
// each thread computes one element of denominator_kernel |
|||
// by accumulating results into cell_value |
|||
double cell_value = 0; |
|||
int row = blockIdx.x * blockDim.x + threadIdx.x; |
|||
|
|||
// performs calculations only if thread's indexes are within matrix bounds |
|||
if (row >= denominator.height){ |
|||
return; |
|||
} |
|||
|
|||
for (int column = 0; column < kernel_matrix.width; ++column){ |
|||
cell_value += kernel_matrix.elements[row * kernel_matrix.width + column]; |
|||
} |
|||
denominator.elements[row] = cell_value; |
|||
} |
|||
|
|||
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, |
|||
Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){ |
|||
int BLOCK_SIZE = blockDim.y; |
|||
int block_row = blockIdx.x; |
|||
int block_col = blockIdx.y; |
|||
|
|||
// each thread computes one element of new_shift by accumulating results into cell_value |
|||
double cell_value = 0; |
|||
|
|||
// Thread row and column within sub_new_shift |
|||
int row = threadIdx.x; |
|||
int col = threadIdx.y; |
|||
|
|||
// performs calculations only if thread's indexes are within matrix bounds |
|||
//if (row * new_shift.width + col >= new_shift.width * new_shift.height){ |
|||
/*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >= |
|||
new_shift.width * new_shift.height){*/ |
|||
if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){ |
|||
return; |
|||
} |
|||
|
|||
// Each thread block computes one sub-matrix sub_new_shift of C |
|||
Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE); |
|||
|
|||
// shared memory used to store sub_kernel_matrix and sub_original_points respectively |
|||
__shared__ double *s_sub_kernel_matrix; |
|||
s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); |
|||
__shared__ double *s_sub_original_points; |
|||
s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double)); |
|||
|
|||
// loops over all the sub-matrices of kernel_matrix and original_points that are required to |
|||
//compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results |
|||
for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) { |
|||
|
|||
// gets sub-matrix sub_kernel_matrix of kernel_matrix |
|||
Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE); |
|||
// gets sub-matrix sub_original_points of original_points |
|||
Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE); |
|||
|
|||
// loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared |
|||
//memory, each thread loads one element of each sub-matrix |
|||
s_sub_kernel_matrix[row * BLOCK_SIZE + col] = |
|||
sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col]; |
|||
s_sub_original_points[row * BLOCK_SIZE + col] = |
|||
sub_original_points.elements[row * sub_original_points.stride + col]; |
|||
|
|||
// synchronizes to make sure the sub-matrices are loaded before starting the computation |
|||
__syncthreads(); |
|||
|
|||
// multiplies sub_kernel_matrix and sub_original_points |
|||
for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){ |
|||
cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] * |
|||
s_sub_original_points[element_index * sub_original_points.stride + col]; |
|||
} |
|||
|
|||
// synchronizes to make sure that the preceding computation is done before loading two new |
|||
// sub-matrices of kernel_matrix and original_points in the next iteration |
|||
__syncthreads(); |
|||
} |
|||
|
|||
// new_shift elements are calculated by dividing with the denominator |
|||
int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride; |
|||
int cell_col = block_col * BLOCK_SIZE + col; |
|||
//sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row]; |
|||
sub_new_shift.elements[row * sub_new_shift.stride + col] = |
|||
cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; |
|||
|
|||
// calculates mean-shift vector |
|||
/*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride |
|||
+ (block_col * BLOCK_SIZE + col)] = |
|||
sub_new_shift.elements[row * sub_new_shift.stride + col] - |
|||
shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride |
|||
+ (block_col * BLOCK_SIZE + col)];*/ |
|||
|
|||
/*free(s_sub_kernel_matrix); |
|||
free(s_sub_original_points);*/ |
|||
} |
|||
|
|||
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is |
|||
// located col sub-matrices to the right and row sub-matrices down |
|||
// from the upper-left corner of A |
|||
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){ |
|||
Matrix Asub; |
|||
Asub.width = BLOCK_SIZE; |
|||
Asub.height = BLOCK_SIZE; |
|||
Asub.stride = BLOCK_SIZE; |
|||
Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]); |
|||
return Asub; |
|||
} |
@ -0,0 +1,29 @@ |
|||
#ifndef SERIAL_KERNELS_H /* Include guard */ |
|||
#define SERIAL_KERNELS_H |
|||
|
|||
/* Structures */ |
|||
|
|||
//Matrix is used to describe matrices
|
|||
typedef struct { |
|||
int width; |
|||
int height; |
|||
int stride; |
|||
double *elements; |
|||
} Matrix; |
|||
|
|||
//Kernel calculate_kernel_matrix_kernel calculates the current kernel matrix
|
|||
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points, |
|||
double deviation, Matrix kernel_matrix); |
|||
|
|||
//Kernel denominator_kernel calculates the sum in the denominator of the fraction used to find new
|
|||
//(shifted) positions of the points
|
|||
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix); |
|||
|
|||
//Kernel shift_points_kernel shifts the positions of all points and calculates the new mean shift
|
|||
//vector according to the new point array
|
|||
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, |
|||
Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector); |
|||
|
|||
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE); |
|||
|
|||
#endif //SERIAL_KERNELS_H
|
@ -0,0 +1,165 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <math.h> |
|||
#include <float.h> |
|||
#include <string.h> |
|||
|
|||
#include "meanshift_utils.h" |
|||
#include "meanshift_gpu_utils.h" |
|||
|
|||
#define OUTPUT_PREFIX "../output/output_" |
|||
|
|||
void get_args(int argc, char **argv, parameters *params){ |
|||
if (argc < 7) { |
|||
printf("Usage: %s h e N D Pd Pl\nwhere:\n" |
|||
"\th is the variance\n" |
|||
"\te is the min distance, between two points, that is taken into account in computations\n" |
|||
"\tN is the the number of points\n" |
|||
"\tD is the number of dimensions of each point\n" |
|||
"\tPd is the path of the dataset file\n" |
|||
"\tPl is the path of the labels file\n" |
|||
"\n\t--verbose | -v is an optional flag to enable execution information output" |
|||
"\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]); |
|||
exit(1); |
|||
} |
|||
|
|||
DEVIATION = atoi(argv[1]); |
|||
params->epsilon = atof(argv[2]); |
|||
NUMBER_OF_POINTS = atoi(argv[3]); |
|||
DIMENSIONS = atoi(argv[4]); |
|||
POINTS_FILENAME = argv[5]; |
|||
LABELS_FILENAME = argv[6]; |
|||
params->verbose = false; |
|||
params->display = false; |
|||
|
|||
if (argc > 7){ |
|||
for (int index=7; index<argc; ++index){ |
|||
if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){ |
|||
params->verbose = true; |
|||
} else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){ |
|||
params->display = true; |
|||
} else { |
|||
printf("Couldn't parse argument %d: %s\n", index, argv[index]); |
|||
exit(EXIT_FAILURE); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/*printf("DEVIATION = %d\n" |
|||
"epsilon = %f\n" |
|||
"NUMBER_OF_POINTS = %d\n" |
|||
"DIMENSIONS = %d\n" |
|||
"POINTS_FILENAME = %s\n" |
|||
"LABELS_FILENAME = %s\n" |
|||
"verbose = %d\n" |
|||
"display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME |
|||
, LABELS_FILENAME, params->verbose, params->display);*/ |
|||
} |
|||
|
|||
void init(double ***vectors, char **labels){ |
|||
int bytes_read = 0; |
|||
|
|||
set_GPU(); |
|||
|
|||
if (params.verbose){ |
|||
printf("Reading dataset and labels...\n"); |
|||
} |
|||
|
|||
// initializes vectors |
|||
FILE *points_file; |
|||
points_file = fopen(POINTS_FILENAME, "rb"); |
|||
if (points_file != NULL){ |
|||
// allocates memory for the array |
|||
(*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
// reads vectors dataset from file |
|||
for (int i=0; i<NUMBER_OF_POINTS; i++){ |
|||
bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file); |
|||
if ( bytes_read != DIMENSIONS ){ |
|||
if(feof(points_file)){ |
|||
printf("Premature end of file reached.\n"); |
|||
} else{ |
|||
printf("Error reading points file."); |
|||
} |
|||
fclose(points_file); |
|||
exit(EXIT_FAILURE); |
|||
} |
|||
} |
|||
} else { |
|||
printf("Error reading dataset file.\n"); |
|||
exit(EXIT_FAILURE); |
|||
} |
|||
fclose(points_file); |
|||
|
|||
// initializes file that will contain the labels (train) |
|||
FILE *labels_file; |
|||
labels_file = fopen(LABELS_FILENAME, "rb"); |
|||
if (labels_file != NULL){ |
|||
// NOTE : Labels were classified as <class 'numpy.uint8'> |
|||
// variables of type uint8 are stored as 1-byte (8-bit) unsigned integers |
|||
// gets number of labels |
|||
fseek(labels_file, 0L, SEEK_END); |
|||
long int pos = ftell(labels_file); |
|||
rewind(labels_file); |
|||
int label_elements = pos/ sizeof(char); |
|||
|
|||
// allocates memory for the array |
|||
*labels = (char*)malloc(label_elements* sizeof(char)); |
|||
fseek(labels_file, 0L, SEEK_SET); |
|||
bytes_read = fread((*labels), sizeof(char), label_elements, labels_file); |
|||
if ( bytes_read != label_elements ){ |
|||
if(feof(points_file)){ |
|||
printf("Premature end of file reached.\n"); |
|||
} else{ |
|||
printf("Error reading points file."); |
|||
} |
|||
fclose(labels_file); |
|||
exit(EXIT_FAILURE); |
|||
} |
|||
} |
|||
fclose(labels_file); |
|||
|
|||
if (params.verbose){ |
|||
printf("Done.\n\n"); |
|||
} |
|||
} |
|||
|
|||
double **alloc_double(int rows, int cols) { |
|||
double *data = (double *) malloc(rows*cols*sizeof(double)); |
|||
double **array = (double **) malloc(rows*sizeof(double*)); |
|||
for (int i=0; i<rows; i++) |
|||
array[i] = &(data[cols*i]); |
|||
return array; |
|||
} |
|||
|
|||
void duplicate(double **source, int rows, int cols, double ***dest){ |
|||
for (int i=0; i<rows; i++){ |
|||
for (int j=0; j<cols; j++){ |
|||
(*dest)[i][j] = source[i][j]; |
|||
} |
|||
} |
|||
} |
|||
|
|||
void print_matrix(double **array, int rows, int cols){ |
|||
for (int i=0; i<cols; i++){ |
|||
for (int j=0; j<rows; j++){ |
|||
printf("%f ", array[j][i]); |
|||
} |
|||
printf("\n"); |
|||
} |
|||
} |
|||
|
|||
void save_matrix(double **matrix, int iteration){ |
|||
char filename[50]; |
|||
snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration); |
|||
FILE *file; |
|||
file = fopen(filename, "w"); |
|||
for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){ |
|||
for (int cols=0; cols<DIMENSIONS; ++cols){ |
|||
fprintf(file, "%f", matrix[rows][cols]); |
|||
if (cols != DIMENSIONS - 1){ |
|||
fprintf(file, ","); |
|||
} |
|||
} |
|||
fprintf(file, "\n"); |
|||
} |
|||
} |
@ -0,0 +1,35 @@ |
|||
#ifndef SERIAL_UTILS_H /* Include guard */ |
|||
#define SERIAL_UTILS_H |
|||
|
|||
#include <stdbool.h> |
|||
|
|||
/* Structures */ |
|||
|
|||
//Parameters is used to store session specific variables in an orderly way
|
|||
typedef struct parameters { |
|||
double epsilon; |
|||
bool verbose; |
|||
bool display; |
|||
} Parameters; |
|||
|
|||
//Function get_args parses command line arguments
|
|||
void get_args(int argc, char **argv, Parameters *params); |
|||
|
|||
//Function init sets up the GPU for later use, gets its properties and reads the dataset and label
|
|||
//arrays from the corresponding files
|
|||
void init(double ***vectors, char **labels); |
|||
|
|||
//Function alloc_double allocates rows*cols bytes of continuous memory
|
|||
double **alloc_double(int rows, int cols); |
|||
|
|||
//Function duplicate copies the values of source array to dest array
|
|||
void duplicate(double **source, int rows, int cols, double ***dest); |
|||
|
|||
//Function print_matrix prints array of dimensions <rows X cols> to the console
|
|||
void print_matrix(double **array, int rows, int cols); |
|||
|
|||
//Function save_matrix stores matrix in a csv file with path/filename "../output/output_iteration".
|
|||
//If a file already exists new lines are concatenated
|
|||
void save_matrix(double **matrix, int iteration); |
|||
|
|||
#endif //SERIAL_UTILS_H
|
Loading…
Reference in new issue