diff --git a/mean_shift_cuda/meanshift_gpu_utils.cu b/mean_shift_cuda/meanshift_gpu_utils.cu
index 6cde797..b49899d 100644
--- a/mean_shift_cuda/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda/meanshift_gpu_utils.cu
@@ -133,20 +133,23 @@ int meanshift(double **original_points, double ***shifted_points, int deviation)
         }
 
         // calculates norm of the new mean shift vector in GPU using "cuBlas" library function
-        cublasHandle_t handle;
-        cublasStatus_t cublas_status = cublasCreate(&handle);
-        if (cublas_status != CUBLAS_STATUS_SUCCESS){
-            exit(cublas_status);
-        }
-        cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
-            1, &current_norm);
-        if (cublas_status != CUBLAS_STATUS_SUCCESS){
-            exit(cublas_status);
-        }
-        cublas_status = cublasDestroy(handle);
-        if (cublas_status != CUBLAS_STATUS_SUCCESS){
-            exit(cublas_status);
-        }
+        // TODO REPLACE WITH KERNEL NORM
+//        cublasHandle_t handle;
+//        cublasStatus_t cublas_status = cublasCreate(&handle);
+//        if (cublas_status != CUBLAS_STATUS_SUCCESS){
+//            exit(cublas_status);
+//        }
+//        cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
+//            1, &current_norm);
+//        if (cublas_status != CUBLAS_STATUS_SUCCESS){
+//            exit(cublas_status);
+//        }
+//        cublas_status = cublasDestroy(handle);
+//        if (cublas_status != CUBLAS_STATUS_SUCCESS){
+//            exit(cublas_status);
+//        }
+        calculate_norm(d_mean_shift_vector, &current_norm);
+
 
         if (params.verbose){
             printf("Recursion n. %d, error\t%f \n", recursion, current_norm);
@@ -295,9 +298,9 @@ void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
 }
 
 void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
-        Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
-        double **original_points, double ***new_shift, double ***mean_shift_vector,
-        double *w_memcpy_time){
+    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
+    double **original_points, double ***new_shift,
+    double ***mean_shift_vector, double *w_memcpy_time){
     int size;
     static bool first_iter = true;
     // gets max block size supported from the device
@@ -346,6 +349,62 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
         / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
 }
 
+//Function calculate_norm is a host-side wrapper that launches the "norm" kernel on
+//d_mean_shift_vector and copies the value the kernel produces into *current_norm.
+//d_mean_shift_vector: matrix passed to the kernel, its width and height size the launch
+//current_norm: host pointer that receives the result
+//NOTE(review): assumes the second parameter of "norm" is a double* to device memory that
+//holds the finished norm once the kernel completes -- confirm against the kernel definition
+void calculate_norm(Matrix d_mean_shift_vector, double *current_norm){
+    static bool first_iter = true;
+    // gets max block size supported from the device
+    static int max_block_size = device_properties.maxThreadsPerBlock;
+    static int requested_block_size = (int)(max_block_size / d_mean_shift_vector.width);
+    // one-double device buffer the kernel writes to, allocated on first use and then reused
+    static double *d_current_norm = NULL;
+    bool block_size_too_big = true;
+
+    if (d_current_norm == NULL){
+        gpuErrchk( cudaMalloc((void **)&d_current_norm, sizeof(double)) );
+    }
+
+    dim3 dimBlock;
+    dim3 dimGrid;
+    do {
+        // stops retrying once no block size is left, dimBlock.x = 0 would also be a zero
+        // divisor in the dimGrid.x computation below
+        if (requested_block_size < 1){
+            fprintf(stderr, "calculate_norm: no valid block size found for norm kernel\n");
+            exit(EXIT_FAILURE);
+        }
+        dimBlock.x = requested_block_size;
+        dimBlock.y = d_mean_shift_vector.width;
+        dimGrid.x = (d_mean_shift_vector.height + dimBlock.x - 1) / dimBlock.x;
+        dimGrid.y = 1;
+
+        // all-zero bytes are 0.0, so the kernel starts every attempt from a cleared result
+        gpuErrchk( cudaMemset(d_current_norm, 0, sizeof(double)) );
+        norm<<<dimGrid, dimBlock>>>(d_mean_shift_vector, d_current_norm);
+        if (cudaGetLastError() != cudaSuccess){
+            --requested_block_size;
+        } else {
+            block_size_too_big = false;
+            gpuErrchk( cudaDeviceSynchronize() );
+        }
+    } while(block_size_too_big);
+
+    if (first_iter && params.verbose){
+        printf("norm_kernel called with:\n");
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        first_iter = false;
+    }
+
+    // copies the result from device to host
+    gpuErrchk( cudaMemcpy(current_norm, d_current_norm, sizeof(double),
+        cudaMemcpyDeviceToHost) );
+}
+
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
     Matrix d_shifted_points){
     // frees all memory previously allocated in device
diff --git a/mean_shift_cuda/meanshift_gpu_utils.h b/mean_shift_cuda/meanshift_gpu_utils.h
index ced2ce5..a968d58 100644
--- a/mean_shift_cuda/meanshift_gpu_utils.h
+++ b/mean_shift_cuda/meanshift_gpu_utils.h
@@ -51,6 +51,10 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
     double **original_points, double ***new_shift, double ***mean_shift_vector,
     double *w_memcpy_time);
 
+//Function calculate_norm is a wrapper for the kernel call of the corresponding kernel
+//"norm" that calculates the norm of the mean_shift_vector matrix
+void calculate_norm(Matrix d_mean_shift_vector, double *current_norm);
+
 //Function free_device_memory frees device's previously allocated memory
 void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
     Matrix d_shifted_points);