@@ -5,7 +5,6 @@
 
 #include <string.h>
 
 #include "meanshift_utils.h"
-#include "meanshift_kernels.h"
 
 #define OUTPUT_PREFIX "../output/output_"
@@ -61,7 +60,7 @@ void get_args(int argc, char **argv, parameters *params){
 
 void init(double ***vectors, char **labels){
 	int bytes_read = 0;
 
-	set_Gpu();
+	set_GPU();
 
 	if (params.verbose){
 		printf("Reading dataset and labels...\n");
@@ -127,7 +126,7 @@ void init(double ***vectors, char **labels){
 
 //Based on https://stackoverflow.com/a/28113186
 //A more in-depth link: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
-void set_Gpu(){
+void set_GPU(){
 	int devices_count = 0, max_multiprocessors = 0, max_device = 0;
 
 	// gets devices count checking for errors like no devices or no drivers to check for
@@ -162,11 +161,20 @@ void set_Gpu(){
 
 int meanshift(double **original_points, double ***shifted_points, int deviation
 		, parameters *opt){
+	// host variables
+	int size = 0;
 	static int iteration = 0;
-	static double **mean_shift_vector, **kernel_matrix, *denominator;
+	static double **kernel_matrix, *denominator, **mean_shift_vector;
+	double **new_shift;
+
+	// device variables
+	static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
+		d_mean_shift_vector;
+	Matrix d_new_shift;
 
 	// allocates memory and copies original points on first iteration
 	if (iteration == 0 || (*shifted_points) == NULL){
 		// allocates memory for shifted points array and copies original points into it
 		(*shifted_points) = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
 		duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);
@@ -182,42 +190,49 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
 		// allocates memory for other arrays needed
 		kernel_matrix = alloc_2d_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);
 		denominator = (double *)malloc(NUMBER_OF_POINTS * sizeof(double));
+
+		// allocates corresponding memory in device
+		init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
+			&d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
 	}
-	// TODO move arrays to device and create global kernel for the iteration
 
 	// finds pairwise distance matrix (inside radius)
 	// [I, D] = rangesearch(x,y,h);
-	calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
+	calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
+		&kernel_matrix);
 
-	// // calculate denominator
-	// for (int i=0; i<NUMBER_OF_POINTS; i++){
-	// 	double sum = 0;
-	// 	for (int j=0; j<NUMBER_OF_POINTS; j++){
-	// 		sum = sum + kernel_matrix[i][j];
-	// 	}
-	// 	denominator[i] = sum;
-	// }
-	denominator = calculate_denominator(kernel_matrix);
-
-	// creates new y vector
-	double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-
-	// builds nominator
-	multiply(kernel_matrix, original_points, &new_shift);
-
-	// divides element-wise
-	for (int i=0; i<NUMBER_OF_POINTS; i++){
-		for (int j=0; j<DIMENSIONS; j++){
-			new_shift[i][j] = new_shift[i][j] / denominator[i];
-			// calculates mean-shift vector at the same time
-			mean_shift_vector[i][j] = new_shift[i][j] - (*shifted_points)[i][j];
-		}
-	}
+	// calculates denominator
+	for (int i=0; i<NUMBER_OF_POINTS; i++){
+		double sum = 0;
+		for (int j=0; j<NUMBER_OF_POINTS; j++){
+			sum = sum + kernel_matrix[i][j];
+		}
+		denominator[i] = sum;
+	}
+	//calculate_denominator(kernel_matrix);
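+
+	// copies the host-side denominator into its device counterpart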
+	size = NUMBER_OF_POINTS * sizeof(double);
+	gpuErrchk( cudaMemcpy(d_denominator.elements, &(denominator[0])
+		, size, cudaMemcpyHostToDevice) );
+
+	// creates new y vector
+	// allocates memory in every recursion
+	new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
+	// allocates corresponding memory in device
+	d_new_shift.width = DIMENSIONS;
+	d_new_shift.height = NUMBER_OF_POINTS;
+	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+	gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );
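+
+	// computes the new y vector and the mean-shift vector on the device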
+	shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
+		d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector);
 
 	// frees previously shifted points, they're now garbage
 	free((*shifted_points)[0]);
 	// updates shifted points pointer to the new array address
 	shifted_points = &new_shift;
+	d_shifted_points.elements = d_new_shift.elements;
 
 	if (params.display){
 		save_matrix((*shifted_points), iteration);
@@ -242,12 +257,14 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
 		free(kernel_matrix[0]);
 		free(kernel_matrix);
 		free(denominator);
+
+		free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_new_shift);
 	}
 
 	return iteration;
 }
 
 // TODO check why there is a difference in the norm calculated in matlab
 double norm(double **matrix, int rows, int cols){
 	double sum=0, temp_mul=0;
 	for (int i=0; i<rows; i++) {
@@ -260,43 +277,57 @@ double norm(double **matrix, int rows, int cols){
 	return norm;
 }
 
-void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
-		, double ***kernel_matrix){
-	static bool first_iter = true;
-
-	// allocates memory for shifted_points in GPU and copies the array
-	Matrix d_shifted_points;
-	d_shifted_points.width = DIMENSIONS;
-	d_shifted_points.height = NUMBER_OF_POINTS;
-	int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
-	gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
-		, size, cudaMemcpyHostToDevice) );
-
-	// allocates memory for original_points in GPU and copies the array
-	Matrix d_original_points;
-	d_original_points.width = DIMENSIONS;
-	d_original_points.height = NUMBER_OF_POINTS;
-	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
-	gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
-		, size, cudaMemcpyHostToDevice) );
-
-	// allocates memory for kernel_matrix in GPU
-	Matrix d_kernel_matrix;
-	d_kernel_matrix.width = NUMBER_OF_POINTS;
-	d_kernel_matrix.height = NUMBER_OF_POINTS;
-	size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
-
-	// get max sizes supported from the device
-	int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
-	int requested_block_size = max_block_size;
+void init_device_memory(double **original_points, double **shifted_points,
+		Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
+		Matrix *d_denominator, Matrix *d_mean_shift_vector){
+	int size;
+
+	// allocates memory for original_points in GPU and copies the array
+	d_original_points->width = DIMENSIONS;
+	d_original_points->height = NUMBER_OF_POINTS;
+	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+	gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
+	gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
+		, size, cudaMemcpyHostToDevice) );
+
+	// allocates memory for shifted_points in GPU and copies the array
+	d_shifted_points->width = DIMENSIONS;
+	d_shifted_points->height = NUMBER_OF_POINTS;
+	size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
+	gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
+	gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
+		, size, cudaMemcpyHostToDevice) );
+
+	// allocates memory for kernel_matrix in GPU
+	d_kernel_matrix->width = NUMBER_OF_POINTS;
+	d_kernel_matrix->height = NUMBER_OF_POINTS;
+	size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+	gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );
+
+	// allocates memory for denominator in GPU
+	d_denominator->width = 1;
+	d_denominator->height = NUMBER_OF_POINTS;
+	size = NUMBER_OF_POINTS * sizeof(double);
+	gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );
+
+	// allocates memory for mean_shift_vector in GPU
+	d_mean_shift_vector->width = DIMENSIONS;
+	d_mean_shift_vector->height = NUMBER_OF_POINTS;
+	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+	gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
+}
+
+void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
+		Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){
+	int size;
+	static bool first_iter = true;
+
+	// gets max block size supported from the device
+	int max_block_size = device_properties.maxThreadsPerBlock;
+	int requested_block_size = (int)sqrt(max_block_size);
 	bool block_size_too_big = true;
 
 	dim3 dimBlock;
 	dim3 dimGrid;
 	do {
 		dimBlock.x = requested_block_size;
 		dimBlock.y = requested_block_size;
@@ -323,65 +354,106 @@ void calculate_kernel_matrix(double **shifted_points, double **original_points,
 	size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
 	gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
 		, size, cudaMemcpyDeviceToHost) );
 
-	gpuErrchk( cudaFree(d_shifted_points.elements) );
-	gpuErrchk( cudaFree(d_original_points.elements) );
-	gpuErrchk( cudaFree(d_kernel_matrix.elements) );
 }
 
-void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
-	static bool first_iter = true;
-
-	// allocates memory for kernel_matrix in GPU and copies the array
-	Matrix d_kernel_matrix;
-	d_kernel_matrix.width = NUMBER_OF_POINTS;
-	d_kernel_matrix.height = NUMBER_OF_POINTS;
-	int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
-	gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
-		, size, cudaMemcpyHostToDevice) );
-
-	// allocates memory for original_points in GPU and copies the array
-	Matrix d_original_points;
-	d_original_points.width = DIMENSIONS;
-	d_original_points.height = NUMBER_OF_POINTS;
-	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
-	gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
-		, size, cudaMemcpyHostToDevice) );
-
-	// allocates memory for new_shift in GPU
-	Matrix d_new_shift;
-	d_new_shift.width = DIMENSIONS;
-	d_new_shift.height = NUMBER_OF_POINTS;
-	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
-
-	// get max sizes supported from the device
-	int max_block_size = device_properties.maxThreadsPerBlock;
-	dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
-		, (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
-	dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
-		, (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
-
-	if (first_iter && params.verbose){
-		printf("multiply_kernel called with:\n");
-		printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
-		printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-		first_iter = false;
-	}
-
-	multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
-	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
-
-	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-	gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
-		, size, cudaMemcpyDeviceToHost) );
-
-	gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-	gpuErrchk( cudaFree(d_original_points.elements) );
-	gpuErrchk( cudaFree(d_new_shift.elements) );
-}
+double * calculate_denominator(double **kernel_matrix){
+	static bool first_iter = true;
+
+	// allocates memory for denominator_matrix in GPU
+	Matrix d_denominator_matrix;
+	d_denominator_matrix.width = NUMBER_OF_POINTS;
+	d_denominator_matrix.height = 1;
+	int size = NUMBER_OF_POINTS * sizeof(double);
+	gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
+
+	// allocates memory for kernel_matrix in GPU and copies the array
+	Matrix d_kernel_matrix;
+	d_kernel_matrix.width = NUMBER_OF_POINTS;
+	d_kernel_matrix.height = NUMBER_OF_POINTS;
+	size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+	gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
+	gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
+		, size, cudaMemcpyHostToDevice) );
+
+	// get max sizes supported from the device
+	int max_block_size = device_properties.maxThreadsPerBlock;
+	dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
+		, (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
+	dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
+		, (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
+
+	if (first_iter && params.verbose){
+		printf("calculate_denominator called with:\n");
+		printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+		printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+		first_iter = false;
+	}
+
+	denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
+	gpuErrchk( cudaPeekAtLastError() );
+	gpuErrchk( cudaDeviceSynchronize() );
+
+	size = NUMBER_OF_POINTS * sizeof(double);
+	double *denominator = (double *)malloc(size);
+	gpuErrchk( cudaMemcpy(&(denominator[0]), d_denominator_matrix.elements
+		, size, cudaMemcpyDeviceToHost) );
+
+	gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+	gpuErrchk( cudaFree(d_denominator_matrix.elements) );
+
+	return denominator;
+}
+
+void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
+		Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+		double **original_points, double ***new_shift, double ***mean_shift_vector){
+	int size;
+	static bool first_iter = true;
+
+	// gets max block size supported from the device
+	int max_block_size = device_properties.maxThreadsPerBlock;
+	int requested_block_size = (int)sqrt(max_block_size);
+	bool block_size_too_big = true;
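+
+	// retries the kernel launch with a smaller block size until it succeeds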
+	dim3 dimBlock;
+	dim3 dimGrid;
+	do {
+		dimBlock.x = requested_block_size;
+		dimBlock.y = 2;
+		dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
+		dimGrid.y = 1;
+
+		shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
+			d_new_shift, d_denominator, d_mean_shift_vector);
+		if (cudaGetLastError() != cudaSuccess){
+			--requested_block_size;
+		} else {
+			block_size_too_big = false;
+			gpuErrchk( cudaDeviceSynchronize() );
+		}
+	} while(block_size_too_big);
+
+	if (first_iter && params.verbose){
+		printf("shift_points_kernel called with:\n");
+		printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+		printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+		first_iter = false;
+	}
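+
+	// copies the new shifted points and the mean-shift vector back to the host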
+	size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+	gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+		, size, cudaMemcpyDeviceToHost) );
+	gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
+		, size, cudaMemcpyDeviceToHost) );
+}
+
+void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
+		Matrix d_new_shift){
+	// frees all memory previously allocated in device
+	gpuErrchk( cudaFree(d_original_points.elements) );
+	gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+	//gpuErrchk( cudaFree(d_shifted_points.elements) );
+	gpuErrchk( cudaFree(d_denominator.elements) );
+	gpuErrchk( cudaFree(d_new_shift.elements) );
+}
@@ -434,53 +506,4 @@ void save_matrix(double **matrix, int iteration){
 		}
 		fprintf(file, "\n");
 	}
 }
-
-double * calculate_denominator(double **kernel_matrix){
-	static bool first_iter = true;
-
-	// allocates memory for denominator_matrix in GPU
-	Matrix d_denominator_matrix;
-	d_denominator_matrix.width = NUMBER_OF_POINTS;
-	d_denominator_matrix.height = 1;
-	int size = NUMBER_OF_POINTS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_denominator_matrix.elements, size) );
-
-	// allocates memory for kernel_matrix in GPU and copies the array
-	Matrix d_kernel_matrix;
-	d_kernel_matrix.width = NUMBER_OF_POINTS;
-	d_kernel_matrix.height = NUMBER_OF_POINTS;
-	size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-	gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
-	gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
-		, size, cudaMemcpyHostToDevice) );
-
-	// get max sizes supported from the device
-	int max_block_size = device_properties.maxThreadsPerBlock;
-	dim3 dimBlock((d_denominator_matrix.height < sqrt(max_block_size)) ? d_denominator_matrix.height : sqrt(max_block_size)
-		, (d_denominator_matrix.width < sqrt(max_block_size)) ? d_denominator_matrix.width : sqrt(max_block_size));
-	dim3 dimGrid((d_denominator_matrix.height + dimBlock.x - 1) / dimBlock.x
-		, (d_denominator_matrix.width + dimBlock.y - 1) / dimBlock.y);
-
-	if (first_iter && params.verbose){
-		printf("calculate_denominator called with:\n");
-		printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
-		printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
-		first_iter = false;
-	}
-
-	denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator_matrix, d_kernel_matrix);
-	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
-
-	size = NUMBER_OF_POINTS * sizeof(double);
-	double ** denominator = (double**)malloc(size);
-	gpuErrchk( cudaMemcpy(&((*denominator)[0]), d_denominator_matrix.elements
-		,size, cudaMemcpyDeviceToHost) );
-
-	gpuErrchk( cudaFree(d_kernel_matrix.elements) );
-	gpuErrchk( cudaFree(d_denominator_matrix.elements) );
-
-	return (*denominator);
-}