diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu index 649e10e..7b2d568 100644 --- a/mean_shift_cuda/meanshift.cu +++ b/mean_shift_cuda/meanshift.cu @@ -30,7 +30,7 @@ int main(int argc, char **argv){ // tic gettimeofday (&startwtime, NULL); - iterations = meanshift(vectors, &shifted_points, DEVIATION, ¶ms); + iterations = meanshift(vectors, &shifted_points, DEVIATION); // toc gettimeofday (&endwtime, NULL); diff --git a/mean_shift_cuda/meanshift_gpu_utils.cu b/mean_shift_cuda/meanshift_gpu_utils.cu index c92e78f..87e11d4 100644 --- a/mean_shift_cuda/meanshift_gpu_utils.cu +++ b/mean_shift_cuda/meanshift_gpu_utils.cu @@ -12,7 +12,7 @@ cudaDeviceProp device_properties; -struct timeval start, end; +struct timeval start_w_time, end_w_time; double seq; //Based on: @@ -50,13 +50,13 @@ void set_GPU(){ } } -int meanshift(double **original_points, double ***shifted_points, int deviation - , parameters *opt){ +int meanshift(double **original_points, double ***shifted_points, int deviation){ // host variables int size = 0; static int iteration = 0; - static double **kernel_matrix, **mean_shift_vector; - double **new_shift, current_norm = 0; + static double **kernel_matrix, **mean_shift_vector, w_memcpy_time; + double **new_shift, current_norm = 0, tmp_w_memcpy_time; + bool is_first_iteration = false; // device variables static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, @@ -65,6 +65,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation // allocates memory and copies original points on first iteration if (iteration == 0 || (*shifted_points) == NULL){ + is_first_iteration = true; // allocates memory for shifted points array and copies original points into it (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); @@ -82,24 +83,26 @@ int meanshift(double **original_points, double ***shifted_points, int deviation kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS); // tic - gettimeofday (&start, NULL); + gettimeofday (&start_w_time, NULL); // allocates corresponding memory in device init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, &d_kernel_matrix, &d_denominator, &d_mean_shift_vector); // toc - gettimeofday (&end, NULL); - seq = (double)((end.tv_usec - start.tv_usec)/1.0e6 + end.tv_sec - start.tv_sec); + gettimeofday (&end_w_time, NULL); + seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec) + / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); -// printf("%s wall clock time = %f\n","Device memory allocation", seq); - // to create output data file - printf("%f ", seq); + if (params.verbose){ + printf("Device memory allocation wall clock time = %f\n\n", seq); + } } // finds pairwise distance matrix (inside radius) // [I, D] = rangesearch(x,y,h); calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation, - &kernel_matrix); + &kernel_matrix, &tmp_w_memcpy_time); + w_memcpy_time += tmp_w_memcpy_time; // calculates denominator calculate_denominator(d_kernel_matrix, d_denominator); @@ -114,7 +117,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator, - d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector); + d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector, + &tmp_w_memcpy_time); + w_memcpy_time += tmp_w_memcpy_time; // frees previously shifted points, they're now garbage free((*shifted_points)[0]); @@ -147,12 +152,16 @@ int meanshift(double **original_points, double ***shifted_points, int deviation } // iterates until convergence - if (current_norm > opt->epsilon) { + if (current_norm > params.epsilon) { ++iteration; - meanshift(original_points, shifted_points, deviation, opt); + meanshift(original_points, shifted_points, deviation); } - if (iteration == 0){ + if (is_first_iteration){ + if (params.verbose){ + printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time); + } + // cleans up allocations free(mean_shift_vector[0]); free(mean_shift_vector); @@ -206,7 +215,7 @@ void init_device_memory(double **original_points, double **shifted_points, } void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, - Matrix d_kernel_matrix, double deviation, double ***kernel_matrix){ + Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){ int size; static bool first_iter = true; // gets max block size supported from the device @@ -242,17 +251,15 @@ void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double); // tic - gettimeofday (&start, NULL); + gettimeofday (&start_w_time, NULL); + gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements , size, cudaMemcpyDeviceToHost) ); // toc - gettimeofday (&end, NULL); - seq = (double)((end.tv_usec - start.tv_usec)/1.0e6 + end.tv_sec - start.tv_sec); - -// printf("%s wall clock time = %f\n","Copying from device to host", seq); - // to create output data file - printf("%f ", seq); + gettimeofday (&end_w_time, NULL); + *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) + / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); } void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){ @@ -289,7 +296,8 @@ void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, - double **original_points, double ***new_shift, double ***mean_shift_vector){ + double **original_points, double ***new_shift, double ***mean_shift_vector, + double *w_memcpy_time){ int size; static bool first_iter = true; // gets max block size supported from the device @@ -325,7 +333,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); // tic - gettimeofday (&start, NULL); + gettimeofday (&start_w_time, NULL); gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements , size, cudaMemcpyDeviceToHost) ); @@ -333,13 +341,9 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi , size, cudaMemcpyDeviceToHost) ); // toc - gettimeofday (&end, NULL); - seq = (double)((end.tv_usec - start.tv_usec)/1.0e6 + end.tv_sec - start.tv_sec); - -// printf("%s wall clock time = %f\n","Copying from device to host", seq); - // to create output data file - printf("%f ", seq); - + gettimeofday (&end_w_time, NULL); + *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec) + / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec); } void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, diff --git a/mean_shift_cuda/meanshift_gpu_utils.h b/mean_shift_cuda/meanshift_gpu_utils.h index 1c2cdf5..10c12da 100644 --- a/mean_shift_cuda/meanshift_gpu_utils.h +++ b/mean_shift_cuda/meanshift_gpu_utils.h @@ -28,8 +28,7 @@ void set_GPU(); //Function meanshift recursively shifts original points according to the mean-shift algorithm saving //the result to shiftedPoints. Struct opt has user options, h is the desirable deviation -int meanshift(double **original_points, double ***shifted_points, int h - , Parameters *opt); +int meanshift(double **original_points, double ***shifted_points, int h); //Function init_device_memory allocates memory for necessary arrays in the device void init_device_memory(double **original_points, double **shifted_points, @@ -39,7 +38,7 @@ void init_device_memory(double **original_points, double **shifted_points, //Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel //"calculate_kernel_matrix_kernel" that calculates the kernel matrix void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points, - Matrix d_kernel_matrix, double deviation, double ***kernel_matrix); + Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time); //Function calculate_denominator is a wrapper for the kernel call of the corresponding kernel //"calculate_denominator_kernel" that calculates the denominator of shifted points fraction @@ -49,7 +48,8 @@ void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator); //"shift_points_kernel" that shifts the positions of all points void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points, Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, - double **original_points, double ***new_shift, double ***mean_shift_vector); + double **original_points, double ***new_shift, double ***mean_shift_vector, + double *w_memcpy_time); //Function free_device_memory frees device's previously allocated memory void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,