Exercise 3 for the course "Parallel and distributed systems" of THMMY in AUTH university.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

370 lines
14 KiB

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <sys/time.h>
#include <cublas_v2.h>
#include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"
cudaDeviceProp device_properties;
struct timeval start_w_time, end_w_time;
double seq;
//Based on:
// https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
void set_GPU(){
int devices_count = 0, max_multiprocessors = 0, max_device = 0;
// gets devices count checking for errors like no devices or no drivers to check for
// devices available
gpuErrchk( cudaGetDeviceCount(&devices_count) );
for(int device_index = 0; device_index < devices_count; ++device_index){
// gets current index device's properties
cudaDeviceProp this_device_properties;
gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
// stores best available device's index
// only devices with compute capability >= 2.0 are able to run the code
if (max_multiprocessors < this_device_properties.multiProcessorCount
&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){
// stores devices properties for later use
device_properties = this_device_properties;
max_multiprocessors = this_device_properties.multiProcessorCount;
max_device = device_index;
}
}
// sets the device
gpuErrchk( cudaSetDevice(max_device) );
if (params.verbose){
printf("Device chosen is \"%s\"\n"
"Device has %d multi processors and compute capability %d.%d\n"
"Max threads per block supported are %d\n\n"
, device_properties.name
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor
, device_properties.maxThreadsPerBlock);
}
}
int meanshift(double **original_points, double ***shifted_points, int deviation){
// host variables
int size = 0;
static int recursion = 0;
static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
double **new_shift, current_norm = 0, tmp_w_memcpy_time;
bool is_first_recursion = false;
// device variables
static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
d_mean_shift_vector;
Matrix d_new_shift;
// allocates memory and copies original points on first recursion
if (recursion == 0 || (*shifted_points) == NULL){
is_first_recursion = true;
// allocates memory for shifted points array and copies original points into it
(*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
// allocates memory for mean shift vector
mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
// initializes elements of mean_shift_vector to inf
for (int i=0;i<NUMBER_OF_POINTS;i++){
for (int j=0;j<DIMENSIONS;j++){
mean_shift_vector[i][j] = DBL_MAX;
}
}
// allocates memory for kernel_matrix
kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
// tic
gettimeofday (&start_w_time, NULL);
// allocates corresponding memory in device
init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
&d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
// toc
gettimeofday (&end_w_time, NULL);
seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
if (params.verbose){
printf("Device memory allocation wall clock time = %f\n\n", seq);
}
}
// finds pairwise distance matrix (inside radius)
// [I, D] = rangesearch(x,y,h);
calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
&kernel_matrix, &tmp_w_memcpy_time);
w_memcpy_time += tmp_w_memcpy_time;
// calculates denominator
calculate_denominator(d_kernel_matrix, d_denominator);
// creates new y vector
// allocates memory in every recursion
new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
// allocates corresponding memory in device
d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
&tmp_w_memcpy_time);
w_memcpy_time += tmp_w_memcpy_time;
for (int row=0; row<2; ++row){
for (int col=0; col<2; ++col){
printf("new_shift[%d][%d] = %f\n", row, col, new_shift[row][col]);
printf("new_shift[%d][%d] = %f\n", 300+row, 216+col, new_shift[300+row][216+col]);
printf("new_shift[%d][%d] = %f\n", 562+row, 487+col, new_shift[562+row][487+col]);
}
}
if(is_first_recursion){
exit(0);
}
// frees previously shifted points, they're now garbage
free((*shifted_points)[0]);
gpuErrchk( cudaFree(d_shifted_points.elements) );
// updates shifted points pointer to the new array address
shifted_points = &new_shift;
d_shifted_points.elements = d_new_shift.elements;
if (params.display){
save_matrix((*shifted_points), recursion);
}
// calculates norm of the new mean shift vector in GPU using "cuBlas" library function
cublasHandle_t handle;
cublasStatus_t cublas_status = cublasCreate(&handle);
if (cublas_status != CUBLAS_STATUS_SUCCESS){
exit(cublas_status);
}
cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
1, &current_norm);
if (cublas_status != CUBLAS_STATUS_SUCCESS){
exit(cublas_status);
}
cublas_status = cublasDestroy(handle);
if (cublas_status != CUBLAS_STATUS_SUCCESS){
exit(cublas_status);
}
if (params.verbose){
printf("Recursion n. %d, error\t%f \n", recursion, current_norm);
}
// recurses until convergence
if (current_norm > params.epsilon) {
++recursion;
meanshift(original_points, shifted_points, deviation);
}
if (is_first_recursion){
if (params.verbose){
printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
}
// cleans up allocations
free(mean_shift_vector[0]);
free(mean_shift_vector);
free(kernel_matrix[0]);
free(kernel_matrix);
free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
}
return recursion;
}
void init_device_memory(double **original_points, double **shifted_points,
Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
Matrix *d_denominator, Matrix *d_mean_shift_vector){
int size;
// allocates memory for original_points in GPU and copies the array
d_original_points->width = DIMENSIONS;
d_original_points->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for shifted_points in GPU and copies the array
d_shifted_points->width = DIMENSIONS;
d_shifted_points->height = NUMBER_OF_POINTS;
size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for kernel_matrix in GPU
d_kernel_matrix->width = NUMBER_OF_POINTS;
d_kernel_matrix->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
// allocates memory for denominator in GPU
d_denominator->width = 1;
d_denominator->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
// allocates memory for mean_shift_vector in GPU
d_mean_shift_vector->width = DIMENSIONS;
d_mean_shift_vector->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
}
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){
int size;
static bool first_iter = true;
// gets max block size supported from the device
static int max_block_size = device_properties.maxThreadsPerBlock;
static int requested_block_size = (int)sqrt(max_block_size);
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = requested_block_size;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
, deviation, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_kernel_matrix_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
// tic
gettimeofday (&start_w_time, NULL);
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
, size, cudaMemcpyDeviceToHost) );
// toc
gettimeofday (&end_w_time, NULL);
*w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
}
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
static bool first_iter = true;
// gets max block size supported from the device
static int requested_block_size = device_properties.maxThreadsPerBlock;
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = 1;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1;
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_denominator called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
}
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
double **original_points, double ***new_shift, double ***mean_shift_vector,
double *w_memcpy_time){
int size;
static bool first_iter = true;
// gets max block size supported from the device
static int max_block_size = device_properties.maxThreadsPerBlock;
static int requested_block_size = (int)(max_block_size / d_new_shift.width);
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
/*dimBlock.x = requested_block_size;
dimBlock.y = d_new_shift.width;*/
dimBlock.x = 2;
dimBlock.y = 2;
dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1;
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
d_new_shift, d_denominator, d_mean_shift_vector);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("shift_points_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
// tic
gettimeofday (&start_w_time, NULL);
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
, size, cudaMemcpyDeviceToHost) );
// toc
gettimeofday (&end_w_time, NULL);
*w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
}
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
Matrix d_shifted_points){
// frees all memory previously allocated in device
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_denominator.elements) );
gpuErrchk( cudaFree(d_shifted_points.elements) );
}