|
@ -12,7 +12,7 @@ |
|
|
|
|
|
|
|
|
cudaDeviceProp device_properties; |
|
|
cudaDeviceProp device_properties; |
|
|
|
|
|
|
|
|
struct timeval start, end; |
|
|
struct timeval start_w_time, end_w_time; |
|
|
double seq; |
|
|
double seq; |
|
|
|
|
|
|
|
|
//Based on: |
|
|
//Based on: |
|
@ -50,13 +50,13 @@ void set_GPU(){ |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int meanshift(double **original_points, double ***shifted_points, int deviation |
|
|
int meanshift(double **original_points, double ***shifted_points, int deviation){ |
|
|
, parameters *opt){ |
|
|
|
|
|
// host variables |
|
|
// host variables |
|
|
int size = 0; |
|
|
int size = 0; |
|
|
static int iteration = 0; |
|
|
static int iteration = 0; |
|
|
static double **kernel_matrix, **mean_shift_vector; |
|
|
static double **kernel_matrix, **mean_shift_vector, w_memcpy_time; |
|
|
double **new_shift, current_norm = 0; |
|
|
double **new_shift, current_norm = 0, tmp_w_memcpy_time; |
|
|
|
|
|
bool is_first_iteration = false; |
|
|
|
|
|
|
|
|
// device variables |
|
|
// device variables |
|
|
static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, |
|
|
static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, |
|
@ -65,6 +65,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation |
|
|
|
|
|
|
|
|
// allocates memory and copies original points on first iteration |
|
|
// allocates memory and copies original points on first iteration |
|
|
if (iteration == 0 || (*shifted_points) == NULL){ |
|
|
if (iteration == 0 || (*shifted_points) == NULL){ |
|
|
|
|
|
is_first_iteration = true; |
|
|
// allocates memory for shifted points array and copies original points into it |
|
|
// allocates memory for shifted points array and copies original points into it |
|
|
(*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|
|
(*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); |
|
|
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); |
|
|
duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); |
|
@ -82,24 +83,26 @@ int meanshift(double **original_points, double ***shifted_points, int deviation |
|
|
kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS); |
|
|
kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS); |
|
|
|
|
|
|
|
|
// tic |
|
|
// tic |
|
|
gettimeofday (&start, NULL); |
|
|
gettimeofday (&start_w_time, NULL); |
|
|
|
|
|
|
|
|
// allocates corresponding memory in device |
|
|
// allocates corresponding memory in device |
|
|
init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, |
|
|
init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, |
|
|
&d_kernel_matrix, &d_denominator, &d_mean_shift_vector); |
|
|
&d_kernel_matrix, &d_denominator, &d_mean_shift_vector); |
|
|
// toc |
|
|
// toc |
|
|
gettimeofday (&end, NULL); |
|
|
gettimeofday (&end_w_time, NULL); |
|
|
seq = (double)((end.tv_usec - start.tv_usec)/1.0e6 + end.tv_sec - start.tv_sec); |
|
|
seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec) |
|
|
|
|
|
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); |
|
|
|
|
|
|
|
|
// printf("%s wall clock time = %f\n","Device memory allocation", seq); |
|
|
if (params.verbose){ |
|
|
// to create output data file |
|
|
printf("Device memory allocation wall clock time = %f\n\n", seq); |
|
|
printf("%f ", seq); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// finds pairwise distance matrix (inside radius) |
|
|
// finds pairwise distance matrix (inside radius) |
|
|
// [I, D] = rangesearch(x,y,h); |
|
|
// [I, D] = rangesearch(x,y,h); |
|
|
calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation, |
|
|
calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation, |
|
|
&kernel_matrix); |
|
|
&kernel_matrix, &tmp_w_memcpy_time); |
|
|
|
|
|
w_memcpy_time += tmp_w_memcpy_time; |
|
|
|
|
|
|
|
|
// calculates denominator |
|
|
// calculates denominator |
|
|
calculate_denominator(d_kernel_matrix, d_denominator); |
|
|
calculate_denominator(d_kernel_matrix, d_denominator); |
|
@ -114,7 +117,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation |
|
|
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); |
|
|
gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); |
|
|
|
|
|
|
|
|
shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator, |
|
|
shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator, |
|
|
d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector); |
|
|
d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector, |
|
|
|
|
|
&tmp_w_memcpy_time); |
|
|
|
|
|
w_memcpy_time += tmp_w_memcpy_time; |
|
|
|
|
|
|
|
|
// frees previously shifted points, they're now garbage |
|
|
// frees previously shifted points, they're now garbage |
|
|
free((*shifted_points)[0]); |
|
|
free((*shifted_points)[0]); |
|
@ -147,12 +152,16 @@ int meanshift(double **original_points, double ***shifted_points, int deviation |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// iterates until convergence |
|
|
// iterates until convergence |
|
|
if (current_norm > opt->epsilon) { |
|
|
if (current_norm > params.epsilon) { |
|
|
++iteration; |
|
|
++iteration; |
|
|
meanshift(original_points, shifted_points, deviation, opt); |
|
|
meanshift(original_points, shifted_points, deviation); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (is_first_iteration){ |
|
|
|
|
|
if (params.verbose){ |
|
|
|
|
|
printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
if (iteration == 0){ |
|
|
|
|
|
// cleans up allocations |
|
|
// cleans up allocations |
|
|
free(mean_shift_vector[0]); |
|
|
free(mean_shift_vector[0]); |
|
|
free(mean_shift_vector); |
|
|
free(mean_shift_vector); |
|
@ -206,7 +215,7 @@ void init_device_memory(double **original_points, double **shifted_points, |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|
|
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|
|
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){ |
|
|
Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){ |
|
|
int size; |
|
|
int size; |
|
|
static bool first_iter = true; |
|
|
static bool first_iter = true; |
|
|
// gets max block size supported from the device |
|
|
// gets max block size supported from the device |
|
@ -242,17 +251,15 @@ void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, |
|
|
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); |
|
|
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); |
|
|
|
|
|
|
|
|
// tic |
|
|
// tic |
|
|
gettimeofday (&start, NULL); |
|
|
gettimeofday (&start_w_time, NULL); |
|
|
|
|
|
|
|
|
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements |
|
|
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements |
|
|
, size, cudaMemcpyDeviceToHost) ); |
|
|
, size, cudaMemcpyDeviceToHost) ); |
|
|
|
|
|
|
|
|
// toc |
|
|
// toc |
|
|
gettimeofday (&end, NULL); |
|
|
gettimeofday (&end_w_time, NULL); |
|
|
seq = (double)((end.tv_usec - start.tv_usec)/1.0e6 + end.tv_sec - start.tv_sec); |
|
|
*w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) |
|
|
|
|
|
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); |
|
|
// printf("%s wall clock time = %f\n","Copying from device to host", seq); |
|
|
|
|
|
// to create output data file |
|
|
|
|
|
printf("%f ", seq); |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){ |
|
|
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){ |
|
@ -289,7 +296,8 @@ void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){ |
|
|
|
|
|
|
|
|
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, |
|
|
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, |
|
|
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, |
|
|
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, |
|
|
double **original_points, double ***new_shift, double ***mean_shift_vector){ |
|
|
double **original_points, double ***new_shift, double ***mean_shift_vector, |
|
|
|
|
|
double *w_memcpy_time){ |
|
|
int size; |
|
|
int size; |
|
|
static bool first_iter = true; |
|
|
static bool first_iter = true; |
|
|
// gets max block size supported from the device |
|
|
// gets max block size supported from the device |
|
@ -325,7 +333,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi |
|
|
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|
|
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); |
|
|
|
|
|
|
|
|
// tic |
|
|
// tic |
|
|
gettimeofday (&start, NULL); |
|
|
gettimeofday (&start_w_time, NULL); |
|
|
|
|
|
|
|
|
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements |
|
|
gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements |
|
|
, size, cudaMemcpyDeviceToHost) ); |
|
|
, size, cudaMemcpyDeviceToHost) ); |
|
@ -333,13 +341,9 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi |
|
|
, size, cudaMemcpyDeviceToHost) ); |
|
|
, size, cudaMemcpyDeviceToHost) ); |
|
|
|
|
|
|
|
|
// toc |
|
|
// toc |
|
|
gettimeofday (&end, NULL); |
|
|
gettimeofday (&end_w_time, NULL); |
|
|
seq = (double)((end.tv_usec - start.tv_usec)/1.0e6 + end.tv_sec - start.tv_sec); |
|
|
*w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) |
|
|
|
|
|
/ 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); |
|
|
// printf("%s wall clock time = %f\n","Copying from device to host", seq); |
|
|
|
|
|
// to create output data file |
|
|
|
|
|
printf("%f ", seq); |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, |
|
|
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, |
|
|