Exercise 3 for the course "Parallel and distributed systems" of THMMY in AUTH university.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

486 lines
18 KiB

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include "meanshift_utils.h"
#include "meanshift_kernels.h"
#define OUTPUT_PREFIX "../output/output_"
7 years ago
cudaDeviceProp device_properties;
void get_args(int argc, char **argv, parameters *params){
if (argc < 7) {
printf("Usage: %s h e N D Pd Pl\nwhere:\n"
"\th is the variance\n"
"\te is the min distance, between two points, that is taken into account in computations\n"
"\tN is the the number of points\n"
"\tD is the number of dimensions of each point\n"
"\tPd is the path of the dataset file\n"
"\tPl is the path of the labels file\n"
"\n\t--verbose | -v is an optional flag to enable execution information output"
"\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]);
exit(1);
}
DEVIATION = atoi(argv[1]);
7 years ago
params->epsilon = atof(argv[2]);
NUMBER_OF_POINTS = atoi(argv[3]);
DIMENSIONS = atoi(argv[4]);
POINTS_FILENAME = argv[5];
LABELS_FILENAME = argv[6];
params->verbose = false;
params->display = false;
if (argc > 7){
for (int index=7; index<argc; ++index){
if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
params->verbose = true;
} else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
params->display = true;
} else {
printf("Couldn't parse argument %d: %s\n", index, argv[index]);
exit(EXIT_FAILURE);
}
}
}
/*printf("DEVIATION = %d\n"
"epsilon = %f\n"
"NUMBER_OF_POINTS = %d\n"
"DIMENSIONS = %d\n"
"POINTS_FILENAME = %s\n"
"LABELS_FILENAME = %s\n"
"verbose = %d\n"
"display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME
, LABELS_FILENAME, params->verbose, params->display);*/
}
7 years ago
void init(double ***vectors, char **labels){
int bytes_read = 0;
7 years ago
set_Gpu();
if (params.verbose){
printf("Reading dataset and labels...\n");
}
// initializes vectors
FILE *points_file;
points_file = fopen(POINTS_FILENAME, "rb");
if (points_file != NULL){
// allocates memory for the array
(*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
// reads vectors dataset from file
for (int i=0; i<NUMBER_OF_POINTS; i++){
bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
if ( bytes_read != DIMENSIONS ){
if(feof(points_file)){
printf("Premature end of file reached.\n");
} else{
printf("Error reading points file.");
}
fclose(points_file);
exit(EXIT_FAILURE);
}
}
} else {
printf("Error reading dataset file.\n");
exit(EXIT_FAILURE);
}
fclose(points_file);
// initializes file that will contain the labels (train)
FILE *labels_file;
labels_file = fopen(LABELS_FILENAME, "rb");
if (labels_file != NULL){
// NOTE : Labels were classified as <class 'numpy.uint8'>
// variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
// gets number of labels
fseek(labels_file, 0L, SEEK_END);
long int pos = ftell(labels_file);
rewind(labels_file);
int label_elements = pos/ sizeof(char);
// allocates memory for the array
*labels = (char*)malloc(label_elements* sizeof(char));
fseek(labels_file, 0L, SEEK_SET);
bytes_read = fread((*labels), sizeof(char), label_elements, labels_file);
if ( bytes_read != label_elements ){
if(feof(points_file)){
printf("Premature end of file reached.\n");
} else{
printf("Error reading points file.");
}
fclose(labels_file);
exit(EXIT_FAILURE);
}
}
fclose(labels_file);
7 years ago
if (params.verbose){
printf("Done.\n\n");
}
}
//Based on https://stackoverflow.com/a/28113186
//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
void set_Gpu(){
int devices_count = 0, max_multiprocessors = 0, max_device = 0;
// gets devices count checking for errors like no devices or no drivers to check for
// devices available
gpuErrchk( cudaGetDeviceCount(&devices_count) );
for(int device_index = 0; device_index < devices_count; ++device_index){
// gets current index device's properties
cudaDeviceProp this_device_properties;
gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
// stores best available device's index
// only devices with compute capability >= 2.0 are able to run the code
if (max_multiprocessors < this_device_properties.multiProcessorCount
&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){
// stores devices properties for later use
device_properties = this_device_properties;
max_multiprocessors = this_device_properties.multiProcessorCount;
max_device = device_index;
}
}
// sets the device
gpuErrchk( cudaSetDevice(max_device) );
if (params.verbose){
printf("Device chosen is \"%s\"\n"
"Device has %d multi processors and compute capability %d.%d\n"
"Max threads per block supported are %d\n\n"
7 years ago
, device_properties.name
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor
, device_properties.maxThreadsPerBlock);
7 years ago
}
}
int meanshift(double **original_points, double ***shifted_points, int deviation
, parameters *opt){
static int iteration = 0;
static double **mean_shift_vector, **kernel_matrix, *denominator;
// allocates memory and copies original points on first iteration
if (iteration == 0 || (*shifted_points) == NULL){
(*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
// allocates memory for mean shift vector
mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
// initializes elements of mean_shift_vector to inf
for (int i=0;i<NUMBER_OF_POINTS;i++){
for (int j=0;j<DIMENSIONS;j++){
mean_shift_vector[i][j] = DBL_MAX;
}
}
// allocates memory for other arrays needed
kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
}
// TODO move arrays to device and create global kernel for the iteration
// finds pairwise distance matrix (inside radius)
// [I, D] = rangesearch(x,y,h);
calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
// // calculate denominator
// for (int i=0; i<NUMBER_OF_POINTS; i++){
// double sum = 0;
// for (int j=0; j<NUMBER_OF_POINTS; j++){
// sum = sum + kernel_matrix[i][j];
// }
// denominator[i] = sum;
// }
denominator = calculate_denominator(kernel_matrix);
// creates new y vector
double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
// builds nominator
multiply(kernel_matrix, original_points, &new_shift);
// divides element-wise
for (int i=0; i<NUMBER_OF_POINTS; i++){
for (int j=0; j<DIMENSIONS; j++){
new_shift[i][j] = new_shift[i][j] / denominator[i];
// calculates mean-shift vector at the same time
mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
}
}
// frees previously shifted points, they're now garbage
free((*shifted_points)[0]);
// updates shifted points pointer to the new array address
shifted_points = &new_shift;
7 years ago
if (params.display){
save_matrix((*shifted_points), iteration);
}
// calculates norm of the new mean shift vector
double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
7 years ago
if (params.verbose){
printf("Iteration n. %d, error %f \n", iteration, current_norm);
}
/** iterates until convergence **/
if (current_norm > opt->epsilon) {
++iteration;
meanshift(original_points, shifted_points, deviation, opt);
}
if (iteration == 0){
7 years ago
// cleans up allocations
free(mean_shift_vector[0]);
free(mean_shift_vector);
free(kernel_matrix[0]);
free(kernel_matrix);
free(denominator);
}
return iteration;
}
double norm(double **matrix, int rows, int cols){
double sum=0, temp_mul=0;
for (int i=0; i<rows; i++) {
for (int j=0; j<cols; j++) {
temp_mul = matrix[i][j] * matrix[i][j];
sum = sum + temp_mul;
}
}
double norm = sqrt(sum);
return norm;
}
void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
, double ***kernel_matrix){
static bool first_iter = true;
// allocates memory for shifted_points in GPU and copies the array
Matrix d_shifted_points;
d_shifted_points.width = DIMENSIONS;
d_shifted_points.height = NUMBER_OF_POINTS;
int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for original_points in GPU and copies the array
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for kernel_matrix in GPU
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
// get max sizes supported from the device
int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
int requested_block_size = max_block_size;
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = requested_block_size;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
, deviation, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_kernel_matrix_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_shifted_points.elements) );
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
}
void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
static bool first_iter = true;
7 years ago
// allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for original_points in GPU and copies the array
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for new_shift in GPU
Matrix d_new_shift;
d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
// get max sizes supported from the device
int max_block_size = device_properties.maxThreadsPerBlock;
dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
, (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
7 years ago
dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
, (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
if (first_iter && params.verbose){
printf("multiply_kernel called with:\n");
7 years ago
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
7 years ago
}
multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_new_shift.elements) );
}
double calculateDistance(double *y, double *x){
double sum = 0, dif;
for (int i=0; i<DIMENSIONS; i++){
dif = y[i]-x[i];
sum += dif * dif;
}
double distance = sqrt(sum);
return distance;
}
double **alloc_2d_double(int rows, int cols) {
double *data = (double *) malloc(rows*cols*sizeof(double));
double **array = (double **) malloc(rows*sizeof(double*));
for (int i=0; i<rows; i++)
array[i] = &(data[cols*i]);
return array;
}
void duplicate(double **source, int rows, int cols, double ***dest){
for (int i=0; i<rows; i++){
for (int j=0; j<cols; j++){
(*dest)[i][j] = source[i][j];
}
}
}
void print_matrix(double **array, int rows, int cols){
for (int i=0; i<cols; i++){
for (int j=0; j<rows; j++){
printf("%f ", array[j][i]);
}
printf("\n");
}
}
void save_matrix(double **matrix, int iteration){
char filename[50];
snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);
FILE *file;
file = fopen(filename, "w");
for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
for (int cols=0; cols<DIMENSIONS; ++cols){
fprintf(file, "%f", matrix[rows][cols]);
if (cols != DIMENSIONS - 1){
fprintf(file, ",");
}
}
fprintf(file, "\n");
}
}
double * calculate_denominator(double **kernel_matrix){
static bool first_iter = true;
// allocates memory for denominator_matrix in GPU
Matrix d_denominator_matrix;
d_denominator_matrix.width = NUMBER_OF_POINTS;
d_denominator_matrix.height = 1;
int size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
// allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
, size, cudaMemcpyHostToDevice) );
// get max sizes supported from the device
int max_block_size = device_properties.maxThreadsPerBlock;
dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
, (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
, (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
if (first_iter && params.verbose){
printf("calculate_denominator called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
size = NUMBER_OF_POINTS * sizeof(double);
double ** denominator = (double**)malloc(size);
gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
,size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_denominator_matrix.elements) );
return (*denominator);
}