Apostolos Fanakis
7 years ago
9 changed files with 419 additions and 390 deletions
Binary file not shown.
@ -0,0 +1,308 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <math.h> |
|||
#include <float.h> |
|||
#include <string.h> |
|||
|
|||
#include "meanshift_utils.h" |
|||
#include "meanshift_gpu_utils.h" |
|||
|
|||
cudaDeviceProp device_properties; |
|||
|
|||
//Based on https://stackoverflow.com/a/28113186 |
|||
//Pio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU |
|||
void set_GPU(){ |
|||
int devices_count = 0, max_multiprocessors = 0, max_device = 0; |
|||
|
|||
// gets devices count checking for errors like no devices or no drivers to check for |
|||
// devices available |
|||
gpuErrchk( cudaGetDeviceCount(&devices_count) ); |
|||
for(int device_index = 0; device_index < devices_count; ++device_index){ |
|||
// gets current index device's properties |
|||
cudaDeviceProp this_device_properties; |
|||
gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) ); |
|||
|
|||
// stores best available device's index |
|||
// only devices with compute capability >= 2.0 are able to run the code |
|||
if (max_multiprocessors < this_device_properties.multiProcessorCount |
|||
&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){ |
|||
// stores devices properties for later use |
|||
device_properties = this_device_properties; |
|||
max_multiprocessors = this_device_properties.multiProcessorCount; |
|||
max_device = device_index; |
|||
} |
|||
} |
|||
// sets the device |
|||
gpuErrchk( cudaSetDevice(max_device) ); |
|||
if (params.verbose){ |
|||
printf("Device chosen is \"%s\"\n" |
|||
"Device has %d multi processors and compute capability %d.%d\n" |
|||
"Max threads per block supported are %d\n\n" |
|||
, device_properties.name |
|||
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor |
|||
, device_properties.maxThreadsPerBlock); |
|||
} |
|||
} |
|||
|
|||
int meanshift(double **original_points, double ***shifted_points, int deviation |
|||
, parameters *opt){ |
|||
// host variables |
|||
int size = 0; |
|||
static int iteration = 0; |
|||
static double **kernel_matrix, *denominator, **mean_shift_vector; |
|||
double **new_shift; |
|||
|
|||
// device variables |
|||
static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, |
|||
d_mean_shift_vector; |
|||
Matrix d_new_shift; |
|||
|
|||
// allocates memory and copies original points on first iteration |
|||
if (iteration == 0 || (*shifted_points) == NULL){ |
|||
// allocates memory for shifted points array and copies original points into it |
|||
(*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); |
|||
|
|||
// allocates memory for mean shift vector |
|||
mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
// initializes elements of mean_shift_vector to inf |
|||
for (int i=0;i<NUMBER_OF_POINTS;i++){ |
|||
for (int j=0;j<DIMENSIONS;j++){ |
|||
mean_shift_vector[i][j] = DBL_MAX; |
|||
} |
|||
} |
|||
|
|||
// allocates memory for other arrays needed |
|||
kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS); |
|||
denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double)); |
|||
|
|||
// allocates corresponding memory in device |
|||
init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, |
|||
&d_kernel_matrix, &d_denominator, &d_mean_shift_vector); |
|||
} |
|||
|
|||
// finds pairwise distance matrix (inside radius) |
|||
// [I, D] = rangesearch(x,y,h); |
|||
calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation, |
|||
&kernel_matrix); |
|||
|
|||
// calculates denominator |
|||
calculate_denominator(d_kernel_matrix, d_denominator, &denominator); |
|||
|
|||
size = NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0]) |
|||
, size, cudaMemcpyHostToDevice) ); |
|||
|
|||
// creates new y vector |
|||
// allocates memory in every recursion |
|||
new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|||
// allocates corresponding memory in device |
|||
d_new_shift.width = DIMENSIONS; |
|||
d_new_shift.height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); |
|||
|
|||
shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator, |
|||
d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector); |
|||
|
|||
// frees previously shifted points, they're now garbage |
|||
free((*shifted_points)[0]); |
|||
// updates shifted points pointer to the new array address |
|||
shifted_points = &new_shift; |
|||
d_shifted_points.elements = d_new_shift.elements; |
|||
|
|||
if (params.display){ |
|||
save_matrix((*shifted_points), iteration); |
|||
} |
|||
|
|||
// calculates norm of the new mean shift vector |
|||
double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS); |
|||
if (params.verbose){ |
|||
printf("Iteration n. %d, error\t%f \n", iteration, current_norm); |
|||
} |
|||
|
|||
/** iterates until convergence **/ |
|||
if (current_norm > opt->epsilon) { |
|||
++iteration; |
|||
meanshift(original_points, shifted_points, deviation, opt); |
|||
} |
|||
|
|||
if (iteration == 0){ |
|||
// cleans up allocations |
|||
free(mean_shift_vector[0]); |
|||
free(mean_shift_vector); |
|||
free(kernel_matrix[0]); |
|||
free(kernel_matrix); |
|||
free(denominator); |
|||
|
|||
free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_new_shift); |
|||
} |
|||
|
|||
return iteration; |
|||
} |
|||
|
|||
void init_device_memory(double **original_points, double **shifted_points, |
|||
Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix, |
|||
Matrix *d_denominator, Matrix *d_mean_shift_vector){ |
|||
int size; |
|||
|
|||
// allocates memory for original_points in GPU and copies the array |
|||
d_original_points->width = DIMENSIONS; |
|||
d_original_points->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_original_points->elements), size) ); |
|||
gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0]) |
|||
, size, cudaMemcpyHostToDevice) ); |
|||
|
|||
// allocates memory for shifted_points in GPU and copies the array |
|||
d_shifted_points->width = DIMENSIONS; |
|||
d_shifted_points->height = NUMBER_OF_POINTS; |
|||
size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) ); |
|||
gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0]) |
|||
, size, cudaMemcpyHostToDevice) ); |
|||
|
|||
// allocates memory for kernel_matrix in GPU |
|||
d_kernel_matrix->width = NUMBER_OF_POINTS; |
|||
d_kernel_matrix->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) ); |
|||
|
|||
// allocates memory for denominator in GPU |
|||
d_denominator->width = 1; |
|||
d_denominator->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_denominator->elements), size) ); |
|||
|
|||
// allocates memory for mean_shift_vector in GPU |
|||
d_mean_shift_vector->width = DIMENSIONS; |
|||
d_mean_shift_vector->height = NUMBER_OF_POINTS; |
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) ); |
|||
} |
|||
|
|||
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|||
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){ |
|||
int size; |
|||
static bool first_iter = true; |
|||
// gets max block size supported from the device |
|||
static int max_block_size = device_properties.maxThreadsPerBlock; |
|||
static int requested_block_size = (int)sqrt(max_block_size); |
|||
bool block_size_too_big = true; |
|||
|
|||
dim3 dimBlock; |
|||
dim3 dimGrid; |
|||
do { |
|||
dimBlock.x = requested_block_size; |
|||
dimBlock.y = requested_block_size; |
|||
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; |
|||
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y; |
|||
|
|||
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points |
|||
, deviation, d_kernel_matrix); |
|||
if (cudaGetLastError() != cudaSuccess){ |
|||
--requested_block_size; |
|||
} else { |
|||
block_size_too_big = false; |
|||
gpuErrchk( cudaDeviceSynchronize() ); |
|||
} |
|||
} while(block_size_too_big); |
|||
|
|||
if (first_iter && params.verbose){ |
|||
printf("calculate_kernel_matrix_kernel called with:\n"); |
|||
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); |
|||
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); |
|||
first_iter = false; |
|||
} |
|||
|
|||
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
} |
|||
|
|||
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){ |
|||
int size; |
|||
static bool first_iter = true; |
|||
// gets max block size supported from the device |
|||
static int requested_block_size = device_properties.maxThreadsPerBlock; |
|||
bool block_size_too_big = true; |
|||
|
|||
dim3 dimBlock; |
|||
dim3 dimGrid; |
|||
do { |
|||
dimBlock.x = requested_block_size; |
|||
dimBlock.y = 1; |
|||
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x; |
|||
dimGrid.y = 1; |
|||
|
|||
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix); |
|||
if (cudaGetLastError() != cudaSuccess){ |
|||
--requested_block_size; |
|||
} else { |
|||
block_size_too_big = false; |
|||
gpuErrchk( cudaDeviceSynchronize() ); |
|||
} |
|||
} while(block_size_too_big); |
|||
|
|||
if (first_iter && params.verbose){ |
|||
printf("calculate_denominator called with:\n"); |
|||
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); |
|||
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); |
|||
first_iter = false; |
|||
} |
|||
|
|||
size = NUMBER_OF_POINTS * sizeof(double); |
|||
gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
} |
|||
|
|||
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, |
|||
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, |
|||
double **original_points, double ***new_shift, double ***mean_shift_vector){ |
|||
int size; |
|||
static bool first_iter = true; |
|||
// gets max block size supported from the device |
|||
static int max_block_size = device_properties.maxThreadsPerBlock; |
|||
static int requested_block_size = (int)(max_block_size / d_new_shift.width); |
|||
bool block_size_too_big = true; |
|||
|
|||
dim3 dimBlock; |
|||
dim3 dimGrid; |
|||
do { |
|||
dimBlock.x = requested_block_size; |
|||
dimBlock.y = d_new_shift.width; |
|||
dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x; |
|||
dimGrid.y = 1; |
|||
|
|||
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points, |
|||
d_new_shift, d_denominator, d_mean_shift_vector); |
|||
if (cudaGetLastError() != cudaSuccess){ |
|||
--requested_block_size; |
|||
} else { |
|||
block_size_too_big = false; |
|||
gpuErrchk( cudaDeviceSynchronize() ); |
|||
} |
|||
} while(block_size_too_big); |
|||
|
|||
if (first_iter && params.verbose){ |
|||
printf("shift_points_kernel called with:\n"); |
|||
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); |
|||
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); |
|||
first_iter = false; |
|||
} |
|||
|
|||
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|||
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements |
|||
, size, cudaMemcpyDeviceToHost) ); |
|||
} |
|||
|
|||
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, |
|||
Matrix d_new_shift){ |
|||
// frees all memory previously allocated in device |
|||
gpuErrchk( cudaFree(d_original_points.elements) ); |
|||
gpuErrchk( cudaFree(d_kernel_matrix.elements) ); |
|||
gpuErrchk( cudaFree(d_denominator.elements) ); |
|||
gpuErrchk( cudaFree(d_new_shift.elements) ); |
|||
} |
@ -0,0 +1,53 @@ |
|||
#ifndef SERIAL_GPU_UTILS_H /* Include guard */ |
|||
#define SERIAL_GPU_UTILS_H |
|||
|
|||
#include "meanshift_kernels.h" |
|||
|
|||
//GPU error check snippet taken from:
|
|||
//https://stackoverflow.com/a/14038590
|
|||
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } |
|||
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ |
|||
if (code != cudaSuccess){ |
|||
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); |
|||
if (abort) exit(code); |
|||
} |
|||
} |
|||
|
|||
/* Global variables */ |
|||
extern int DEVIATION; |
|||
extern int NUMBER_OF_POINTS; |
|||
extern int DIMENSIONS; |
|||
extern char* POINTS_FILENAME; |
|||
extern char* LABELS_FILENAME; |
|||
extern parameters params; |
|||
extern cudaDeviceProp device_properties; |
|||
|
|||
void set_GPU(); |
|||
|
|||
//Function meanshift recursively shifts original points according to th
|
|||
//mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
|
|||
//options, h is the desirable deviation.
|
|||
int meanshift(double **original_points, double ***shifted_points, int h |
|||
, parameters *opt); |
|||
|
|||
void init_device_memory(double **original_points, double **shifted_points, |
|||
Matrix *d_original_points, Matrix *d_shifted_points, |
|||
Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift); |
|||
|
|||
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|||
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix); |
|||
|
|||
//Function multiply allocates memory in GPU, sends the data and calls the
|
|||
//multiply kernel function.
|
|||
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, |
|||
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, |
|||
double **original_points, double ***new_shift, double ***mean_shift_vector); |
|||
|
|||
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, |
|||
Matrix d_new_shift); |
|||
|
|||
//Function calculate_denominator allocates memory in GPU, sends the data and calls the
|
|||
//denominator kernel function.
|
|||
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator); |
|||
|
|||
#endif //SERIAL_GPU_UTILS_H
|
Loading…
Reference in new issue