Browse Source

Add meanshift_gpu_utils header and implementation, minor fixes

master
Apostolos Fanakis 7 years ago
parent
commit
11c170c185
  1. 2
      mean_shift_cuda/Makefile
  2. BIN
      mean_shift_cuda/meanshift
  3. 1
      mean_shift_cuda/meanshift.cu
  4. 308
      mean_shift_cuda/meanshift_gpu_utils.cu
  5. 53
      mean_shift_cuda/meanshift_gpu_utils.h
  6. 67
      mean_shift_cuda/meanshift_kernels.cu
  7. 9
      mean_shift_cuda/meanshift_kernels.h
  8. 318
      mean_shift_cuda/meanshift_utils.cu
  9. 49
      mean_shift_cuda/meanshift_utils.h

2
mean_shift_cuda/Makefile

@ -11,7 +11,7 @@ C_FLAGS = -lm -O3 -I.
COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS) COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS) LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
DEPS = meanshift_utils.h meanshift_kernels.h DEPS = meanshift_utils.h meanshift_kernels.h
RM = rm -f RM = rm -f

BIN
mean_shift_cuda/meanshift

Binary file not shown.

1
mean_shift_cuda/meanshift.cu

@ -3,6 +3,7 @@
#include <sys/time.h> #include <sys/time.h>
#include "meanshift_utils.h" #include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"
int DEVIATION = 1; int DEVIATION = 1;
int NUMBER_OF_POINTS = 600; int NUMBER_OF_POINTS = 600;

308
mean_shift_cuda/meanshift_gpu_utils.cu

@ -0,0 +1,308 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"
cudaDeviceProp device_properties;
//Based on https://stackoverflow.com/a/28113186
//Pio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
void set_GPU(){
int devices_count = 0, max_multiprocessors = 0, max_device = 0;
// gets devices count checking for errors like no devices or no drivers to check for
// devices available
gpuErrchk( cudaGetDeviceCount(&devices_count) );
for(int device_index = 0; device_index < devices_count; ++device_index){
// gets current index device's properties
cudaDeviceProp this_device_properties;
gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
// stores best available device's index
// only devices with compute capability >= 2.0 are able to run the code
if (max_multiprocessors < this_device_properties.multiProcessorCount
&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){
// stores devices properties for later use
device_properties = this_device_properties;
max_multiprocessors = this_device_properties.multiProcessorCount;
max_device = device_index;
}
}
// sets the device
gpuErrchk( cudaSetDevice(max_device) );
if (params.verbose){
printf("Device chosen is \"%s\"\n"
"Device has %d multi processors and compute capability %d.%d\n"
"Max threads per block supported are %d\n\n"
, device_properties.name
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor
, device_properties.maxThreadsPerBlock);
}
}
int meanshift(double **original_points, double ***shifted_points, int deviation
, parameters *opt){
// host variables
int size = 0;
static int iteration = 0;
static double **kernel_matrix, *denominator, **mean_shift_vector;
double **new_shift;
// device variables
static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
d_mean_shift_vector;
Matrix d_new_shift;
// allocates memory and copies original points on first iteration
if (iteration == 0 || (*shifted_points) == NULL){
// allocates memory for shifted points array and copies original points into it
(*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
// allocates memory for mean shift vector
mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
// initializes elements of mean_shift_vector to inf
for (int i=0;i<NUMBER_OF_POINTS;i++){
for (int j=0;j<DIMENSIONS;j++){
mean_shift_vector[i][j] = DBL_MAX;
}
}
// allocates memory for other arrays needed
kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
// allocates corresponding memory in device
init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
&d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
}
// finds pairwise distance matrix (inside radius)
// [I, D] = rangesearch(x,y,h);
calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
&kernel_matrix);
// calculates denominator
calculate_denominator(d_kernel_matrix, d_denominator, &denominator);
size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0])
, size, cudaMemcpyHostToDevice) );
// creates new y vector
// allocates memory in every recursion
new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
// allocates corresponding memory in device
d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector);
// frees previously shifted points, they're now garbage
free((*shifted_points)[0]);
// updates shifted points pointer to the new array address
shifted_points = &new_shift;
d_shifted_points.elements = d_new_shift.elements;
if (params.display){
save_matrix((*shifted_points), iteration);
}
// calculates norm of the new mean shift vector
double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
if (params.verbose){
printf("Iteration n. %d, error\t%f \n", iteration, current_norm);
}
/** iterates until convergence **/
if (current_norm > opt->epsilon) {
++iteration;
meanshift(original_points, shifted_points, deviation, opt);
}
if (iteration == 0){
// cleans up allocations
free(mean_shift_vector[0]);
free(mean_shift_vector);
free(kernel_matrix[0]);
free(kernel_matrix);
free(denominator);
free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_new_shift);
}
return iteration;
}
void init_device_memory(double **original_points, double **shifted_points,
Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
Matrix *d_denominator, Matrix *d_mean_shift_vector){
int size;
// allocates memory for original_points in GPU and copies the array
d_original_points->width = DIMENSIONS;
d_original_points->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for shifted_points in GPU and copies the array
d_shifted_points->width = DIMENSIONS;
d_shifted_points->height = NUMBER_OF_POINTS;
size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for kernel_matrix in GPU
d_kernel_matrix->width = NUMBER_OF_POINTS;
d_kernel_matrix->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
// allocates memory for denominator in GPU
d_denominator->width = 1;
d_denominator->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
// allocates memory for mean_shift_vector in GPU
d_mean_shift_vector->width = DIMENSIONS;
d_mean_shift_vector->height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
}
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){
int size;
static bool first_iter = true;
// gets max block size supported from the device
static int max_block_size = device_properties.maxThreadsPerBlock;
static int requested_block_size = (int)sqrt(max_block_size);
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = requested_block_size;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
, deviation, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_kernel_matrix_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
, size, cudaMemcpyDeviceToHost) );
}
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator){
int size;
static bool first_iter = true;
// gets max block size supported from the device
static int requested_block_size = device_properties.maxThreadsPerBlock;
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = 1;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1;
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_denominator called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator.elements
, size, cudaMemcpyDeviceToHost) );
}
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
double **original_points, double ***new_shift, double ***mean_shift_vector){
int size;
static bool first_iter = true;
// gets max block size supported from the device
static int max_block_size = device_properties.maxThreadsPerBlock;
static int requested_block_size = (int)(max_block_size / d_new_shift.width);
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = d_new_shift.width;
dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = 1;
shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
d_new_shift, d_denominator, d_mean_shift_vector);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("shift_points_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
, size, cudaMemcpyDeviceToHost) );
}
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
Matrix d_new_shift){
// frees all memory previously allocated in device
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_denominator.elements) );
gpuErrchk( cudaFree(d_new_shift.elements) );
}

53
mean_shift_cuda/meanshift_gpu_utils.h

@ -0,0 +1,53 @@
#ifndef SERIAL_GPU_UTILS_H /* Include guard */
#define SERIAL_GPU_UTILS_H
#include "meanshift_kernels.h"
//GPU error check snippet taken from:
//https://stackoverflow.com/a/14038590
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess){
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/* Global variables */
extern int DEVIATION;
extern int NUMBER_OF_POINTS;
extern int DIMENSIONS;
extern char* POINTS_FILENAME;
extern char* LABELS_FILENAME;
extern parameters params;
extern cudaDeviceProp device_properties;
void set_GPU();
//Function meanshift recursively shifts original points according to th
//mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
//options, h is the desirable deviation.
int meanshift(double **original_points, double ***shifted_points, int h
, parameters *opt);
void init_device_memory(double **original_points, double **shifted_points,
Matrix *d_original_points, Matrix *d_shifted_points,
Matrix *d_kernel_matrix, Matrix *d_denominator, Matrix *d_new_shift);
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix);
//Function multiply allocates memory in GPU, sends the data and calls the
//multiply kernel function.
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
double **original_points, double ***new_shift, double ***mean_shift_vector);
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
Matrix d_new_shift);
//Function calculate_denominator allocates memory in GPU, sends the data and calls the
//denominator kernel function.
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator, double **denominator);
#endif //SERIAL_GPU_UTILS_H

67
mean_shift_cuda/meanshift_kernels.cu

@ -1,28 +1,14 @@
#include "meanshift_kernels.h" #include "meanshift_kernels.h"
#include <stdio.h> #include <stdio.h>
__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
// Each thread computes one element of output double deviation, Matrix kernel_matrix){
// by accumulating results into cell_value // each thread calculates one element of kernel_matrix
double cell_value = 0;
int row = blockIdx.x * blockDim.x + threadIdx.x; int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row + col < output.height * output.width){ // performs calculations only if thread's indexes are within matrix bounds
for (int element_index = 0; element_index < matrix1.width; ++element_index){ if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){
cell_value += matrix1.elements[row * matrix1.width + element_index]
* matrix2.elements[element_index * matrix2.width + col];
}
output.elements[row * output.width + col] = cell_value;
}
}
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
, double deviation, Matrix kernel_matrix){
// Each thread calculates one element of kernel_matrix
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row * kernel_matrix.width + col > kernel_matrix.width * kernel_matrix.height){
return; return;
} }
@ -48,17 +34,48 @@ __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix ori
} }
} }
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
// each thread computes one element of new_shift
// by accumulating results into cell_value
double cell_value = 0;
int row = blockIdx.x * blockDim.x + threadIdx.x; int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.y * blockDim.y + threadIdx.y;
// performs calculations only if thread's indexes are within matrix bounds
if (row * denominator.width + col > denominator.width * denominator.height){ if (row * new_shift.width + col >= new_shift.width * new_shift.height){
return; return;
} }
denominator.elements[col]=0; // calculates new_shift
denominator.elements[row] += kernel_matrix.elements[row*denominator.width + col]; // builds nominator by multiplying kernel_matrix and original_points
for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){
cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index]
* original_points.elements[element_index * original_points.width + col];
}
// new_shift elements are calculated by dividing with the denominator
new_shift.elements[row * new_shift.width + col] =
cell_value / denominator.elements[row];
// calculates mean-shift vector
mean_shift_vector.elements[row * new_shift.width + col] =
new_shift.elements[row * new_shift.width + col] -
shifted_points.elements[row * new_shift.width + col];
}
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
// each thread computes one element of denominator_kernel
// by accumulating results into cell_value
double cell_value = 0;
int row = blockIdx.x * blockDim.x + threadIdx.x;
// performs calculations only if thread's indexes are within matrix bounds
if (row >= denominator.height){
return;
}
for (int column = 0; column < kernel_matrix.width; ++column){
cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];
}
denominator.elements[row] = cell_value;
} }

9
mean_shift_cuda/meanshift_kernels.h

@ -7,11 +7,12 @@ typedef struct{
double *elements; double *elements;
} Matrix; } Matrix;
//Function multiply_kernel calculates the product of matrices 1 and 2 into output. __global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output); double deviation, Matrix kernel_matrix);
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points //Function multiply_kernel calculates the product of matrices 1 and 2 into output.
, double deviation, Matrix kernel_matrix); __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, Matrix shifted_points,
Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix); __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);

318
mean_shift_cuda/meanshift_utils.cu

@ -5,12 +5,10 @@
#include <string.h> #include <string.h>
#include "meanshift_utils.h" #include "meanshift_utils.h"
#include "meanshift_kernels.h" #include "meanshift_gpu_utils.h"
#define OUTPUT_PREFIX "../output/output_" #define OUTPUT_PREFIX "../output/output_"
cudaDeviceProp device_properties;
void get_args(int argc, char **argv, parameters *params){ void get_args(int argc, char **argv, parameters *params){
if (argc < 7) { if (argc < 7) {
printf("Usage: %s h e N D Pd Pl\nwhere:\n" printf("Usage: %s h e N D Pd Pl\nwhere:\n"
@ -61,7 +59,7 @@ void get_args(int argc, char **argv, parameters *params){
void init(double ***vectors, char **labels){ void init(double ***vectors, char **labels){
int bytes_read = 0; int bytes_read = 0;
set_Gpu(); set_GPU();
if (params.verbose){ if (params.verbose){
printf("Reading dataset and labels...\n"); printf("Reading dataset and labels...\n");
@ -72,7 +70,7 @@ void init(double ***vectors, char **labels){
points_file = fopen(POINTS_FILENAME, "rb"); points_file = fopen(POINTS_FILENAME, "rb");
if (points_file != NULL){ if (points_file != NULL){
// allocates memory for the array // allocates memory for the array
(*vectors) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS); (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
// reads vectors dataset from file // reads vectors dataset from file
for (int i=0; i<NUMBER_OF_POINTS; i++){ for (int i=0; i<NUMBER_OF_POINTS; i++){
bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file); bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
@ -125,129 +123,7 @@ void init(double ***vectors, char **labels){
} }
} }
//Based on https://stackoverflow.com/a/28113186 // TODO check why there's is a difference in the norm calculate in matlab
//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
void set_Gpu(){
int devices_count = 0, max_multiprocessors = 0, max_device = 0;
// gets devices count checking for errors like no devices or no drivers to check for
// devices available
gpuErrchk( cudaGetDeviceCount(&devices_count) );
for(int device_index = 0; device_index < devices_count; ++device_index){
// gets current index device's properties
cudaDeviceProp this_device_properties;
gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
// stores best available device's index
// only devices with compute capability >= 2.0 are able to run the code
if (max_multiprocessors < this_device_properties.multiProcessorCount
&& this_device_properties.major >= 2 && this_device_properties.minor >= 0){
// stores devices properties for later use
device_properties = this_device_properties;
max_multiprocessors = this_device_properties.multiProcessorCount;
max_device = device_index;
}
}
// sets the device
gpuErrchk( cudaSetDevice(max_device) );
if (params.verbose){
printf("Device chosen is \"%s\"\n"
"Device has %d multi processors and compute capability %d.%d\n"
"Max threads per block supported are %d\n\n"
, device_properties.name
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor
, device_properties.maxThreadsPerBlock);
}
}
int meanshift(double **original_points, double ***shifted_points, int deviation
, parameters *opt){
static int iteration = 0;
static double **mean_shift_vector, **kernel_matrix, *denominator;
// allocates memory and copies original points on first iteration
if (iteration == 0 || (*shifted_points) == NULL){
(*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
// allocates memory for mean shift vector
mean_shift_vector = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
// initializes elements of mean_shift_vector to inf
for (int i=0;i<NUMBER_OF_POINTS;i++){
for (int j=0;j<DIMENSIONS;j++){
mean_shift_vector[i][j] = DBL_MAX;
}
}
// allocates memory for other arrays needed
kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
}
// TODO move arrays to device and create global kernel for the iteration
// finds pairwise distance matrix (inside radius)
// [I, D] = rangesearch(x,y,h);
calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
// // calculate denominator
// for (int i=0; i<NUMBER_OF_POINTS; i++){
// double sum = 0;
// for (int j=0; j<NUMBER_OF_POINTS; j++){
// sum = sum + kernel_matrix[i][j];
// }
// denominator[i] = sum;
// }
denominator = calculate_denominator(kernel_matrix);
// creates new y vector
double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
// builds nominator
multiply(kernel_matrix, original_points, &new_shift);
// divides element-wise
for (int i=0; i<NUMBER_OF_POINTS; i++){
for (int j=0; j<DIMENSIONS; j++){
new_shift[i][j] = new_shift[i][j] / denominator[i];
// calculates mean-shift vector at the same time
mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
}
}
// frees previously shifted points, they're now garbage
free((*shifted_points)[0]);
// updates shifted points pointer to the new array address
shifted_points = &new_shift;
if (params.display){
save_matrix((*shifted_points), iteration);
}
// calculates norm of the new mean shift vector
double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
if (params.verbose){
printf("Iteration n. %d, error %f \n", iteration, current_norm);
}
/** iterates until convergence **/
if (current_norm > opt->epsilon) {
++iteration;
meanshift(original_points, shifted_points, deviation, opt);
}
if (iteration == 0){
// cleans up allocations
free(mean_shift_vector[0]);
free(mean_shift_vector);
free(kernel_matrix[0]);
free(kernel_matrix);
free(denominator);
}
return iteration;
}
double norm(double **matrix, int rows, int cols){ double norm(double **matrix, int rows, int cols){
double sum=0, temp_mul=0; double sum=0, temp_mul=0;
for (int i=0; i<rows; i++) { for (int i=0; i<rows; i++) {
@ -260,142 +136,7 @@ double norm(double **matrix, int rows, int cols){
return norm; return norm;
} }
void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation double **alloc_double(int rows, int cols) {
, double ***kernel_matrix){
static bool first_iter = true;
// allocates memory for shifted_points in GPU and copies the array
Matrix d_shifted_points;
d_shifted_points.width = DIMENSIONS;
d_shifted_points.height = NUMBER_OF_POINTS;
int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for original_points in GPU and copies the array
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for kernel_matrix in GPU
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
// get max sizes supported from the device
int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
int requested_block_size = max_block_size;
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = requested_block_size;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
, deviation, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_kernel_matrix_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_shifted_points.elements) );
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
}
void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
static bool first_iter = true;
// allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for original_points in GPU and copies the array
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for new_shift in GPU
Matrix d_new_shift;
d_new_shift.width = DIMENSIONS;
d_new_shift.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
// get max sizes supported from the device
int max_block_size = device_properties.maxThreadsPerBlock;
dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
, (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
, (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
if (first_iter && params.verbose){
printf("multiply_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_new_shift.elements) );
}
double calculateDistance(double *y, double *x){
double sum = 0, dif;
for (int i=0; i<DIMENSIONS; i++){
dif = y[i]-x[i];
sum += dif * dif;
}
double distance = sqrt(sum);
return distance;
}
double **alloc_2d_double(int rows, int cols) {
double *data = (double *) malloc(rows*cols*sizeof(double)); double *data = (double *) malloc(rows*cols*sizeof(double));
double **array = (double **) malloc(rows*sizeof(double*)); double **array = (double **) malloc(rows*sizeof(double*));
for (int i=0; i<rows; i++) for (int i=0; i<rows; i++)
@ -435,52 +176,3 @@ void save_matrix(double **matrix, int iteration){
fprintf(file, "\n"); fprintf(file, "\n");
} }
} }
double * calculate_denominator(double **kernel_matrix){
static bool first_iter = true;
// allocates memory for denominator_matrix in GPU
Matrix d_denominator_matrix;
d_denominator_matrix.width = NUMBER_OF_POINTS;
d_denominator_matrix.height = 1;
int size = NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
// allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
, size, cudaMemcpyHostToDevice) );
// get max sizes supported from the device
int max_block_size = device_properties.maxThreadsPerBlock;
dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
, (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
, (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
if (first_iter && params.verbose){
printf("calculate_denominator called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
size = NUMBER_OF_POINTS * sizeof(double);
double ** denominator = (double**)malloc(size);
gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
,size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
gpuErrchk( cudaFree(d_denominator_matrix.elements) );
return (*denominator);
}

49
mean_shift_cuda/meanshift_utils.h

@ -3,16 +3,6 @@
#include <stdbool.h> #include <stdbool.h>
//GPU error check snippet taken from:
//https://stackoverflow.com/a/14038590
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess){
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/* Structs */ /* Structs */
typedef struct parameters { typedef struct parameters {
double epsilon; double epsilon;
@ -20,45 +10,17 @@ typedef struct parameters {
bool display; bool display;
} parameters; } parameters;
/* Global variables */
extern int DEVIATION;
extern int NUMBER_OF_POINTS;
extern int DIMENSIONS;
extern char* POINTS_FILENAME;
extern char* LABELS_FILENAME;
extern parameters params;
extern cudaDeviceProp device_properties;
//Function get_args parses command line arguments. //Function get_args parses command line arguments.
void get_args(int argc, char **argv, parameters *params); void get_args(int argc, char **argv, parameters *params);
//Function init reads the dataset and label arrays from the corresponding files. //Function init reads the dataset and label arrays from the corresponding files.
void init(double ***vectors, char **labels); void init(double ***vectors, char **labels);
void set_Gpu();
//Function meanshift recursively shifts original points according to th
//mean-shift algorithm saving the result to shiftedPoints. Struct opt has user
//options, h is the desirable deviation.
int meanshift(double **original_points, double ***shifted_points, int h
, parameters *opt);
//Function norm returns the second norm of matrix of dimensions rowsXcols. //Function norm returns the second norm of matrix of dimensions rowsXcols.
double norm(double **matrix, int rows, int cols); double norm(double **matrix, int rows, int cols);
void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation //Function alloc_double allocates rows*cols bytes of continuous memory.
, double ***kernel_matrix); double **alloc_double(int rows, int cols);
//Function multiply allocates memory in GPU, sends the data and calls the
//multiply kernel function.
void multiply(double **kernel_matrix, double **original_points
, double ***new_shift);
//Function calculateDistance returns the distance between x and y vectors.
double calculateDistance(double *y, double *x);
//Function alloc_2d_double allocates rows*cols bytes of continuous memory.
double **alloc_2d_double(int rows, int cols);
//Function duplicate copies the values of source array to dest array. //Function duplicate copies the values of source array to dest array.
void duplicate(double **source, int rows, int cols, double ***dest); void duplicate(double **source, int rows, int cols, double ***dest);
@ -68,11 +30,6 @@ void print_matrix(double **array, int rows, int cols);
//Function save_matrix prints matrix in a csv file with path/filename //Function save_matrix prints matrix in a csv file with path/filename
//"output/output_iteration". If a file already exists new lines are concatenated. //"output/output_iteration". If a file already exists new lines are concatenated.
void save_matrix(double **matrix void save_matrix(double **matrix, int iteration);
, int iteration);
//Function calculate_denominator allocates memory in GPU, sends the data and calls the
//denominator kernel function.
double * calculate_denominator(double **kernel_matrix);
#endif //SERIAL_UTILS_H #endif //SERIAL_UTILS_H
Loading…
Cancel
Save