diff --git a/mean_shift_cuda_shared_mem/meanshift.cu b/mean_shift_cuda_shared_mem/meanshift.cu index 171c7b3..3ab88f7 100644 --- a/mean_shift_cuda_shared_mem/meanshift.cu +++ b/mean_shift_cuda_shared_mem/meanshift.cu @@ -5,10 +5,10 @@ #include "meanshift_utils.h" #include "meanshift_gpu_utils.h" -int DEVIATION = 20; -int NUMBER_OF_POINTS = 1024; -int DIMENSIONS = 32; -const char *POINTS_FILENAME = "../data/32"; +int DEVIATION = 1; +int NUMBER_OF_POINTS = 600; +int DIMENSIONS = 2; +const char *POINTS_FILENAME = "../data/X.bin"; const char *LABELS_FILENAME = "../data/L.bin"; parameters params; @@ -33,7 +33,6 @@ int main(int argc, char **argv){ // toc gettimeofday (&endwtime, NULL); seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec); - printf("\nTotal number of recursions = %d\n", recursions); printf("%s wall clock time = %f\n","Mean Shift", seq_time); diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu index 6e6437d..6cd2a85 100644 --- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu +++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu @@ -55,24 +55,26 @@ void set_GPU(){ int meanshift(double **original_points, double ***shifted_points, int deviation){ // host variables - int size = 0; static int recursion = 0; - static double **kernel_matrix, **mean_shift_vector, w_memcpy_time; - double **new_shift, current_norm = 0, tmp_w_memcpy_time; + static double **kernel_matrix, **new_shift, **mean_shift_vector, w_memcpy_time; + double current_norm = 0, tmp_w_memcpy_time; bool is_first_recursion = false; // device variables static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator, - d_mean_shift_vector; - Matrix d_new_shift; + d_new_shift, d_mean_shift_vector; // allocates memory and copies original points on first recursion if (recursion == 0 || (*shifted_points) == NULL){ is_first_recursion = true; + // allocates memory for shifted points array and copies original points into it (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points); + // allocates memory for new shift array + new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); + // allocates memory for mean shift vector mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); // initializes elements of mean_shift_vector to inf @@ -90,7 +92,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) // allocates corresponding memory in device init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points, - &d_kernel_matrix, &d_denominator, &d_mean_shift_vector); + &d_kernel_matrix, &d_denominator, &d_new_shift, &d_mean_shift_vector); // toc gettimeofday (&end_w_time, NULL); seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec) @@ -110,27 +112,19 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) // calculates denominator calculate_denominator(d_kernel_matrix, d_denominator); - // creates new y vector - // allocates memory in every recursion - new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS); - // allocates corresponding memory in device - d_new_shift.width = DIMENSIONS; - d_new_shift.height = NUMBER_OF_POINTS; - d_new_shift.stride = d_new_shift.width; - size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); - gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) ); - shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector, &tmp_w_memcpy_time); w_memcpy_time += tmp_w_memcpy_time; - // frees previously shifted points, they're now garbage - free((*shifted_points)[0]); - gpuErrchk( cudaFree(d_shifted_points.elements) ); // updates shifted points pointer to the new array address + double ***temp = shifted_points; shifted_points = &new_shift; + new_shift = *temp; + + double *d_temp = d_shifted_points.elements; d_shifted_points.elements = d_new_shift.elements; + d_new_shift.elements = d_temp; if (params.display){ save_matrix((*shifted_points), recursion); @@ -172,8 +166,11 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) free(mean_shift_vector); free(kernel_matrix[0]); free(kernel_matrix); + //free(new_shift[0]); + //free(new_shift); - free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points); + free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points, + d_new_shift); } return recursion; @@ -181,7 +178,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation) void init_device_memory(double **original_points, double **shifted_points, Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix, - Matrix *d_denominator, Matrix *d_mean_shift_vector){ + Matrix *d_denominator, Matrix *d_new_shift, Matrix *d_mean_shift_vector){ int size; // allocates memory for original_points in GPU and copies the array @@ -216,6 +213,13 @@ void init_device_memory(double **original_points, double **shifted_points, size = NUMBER_OF_POINTS * sizeof(double); gpuErrchk( cudaMalloc(&(d_denominator->elements), size) ); + // allocates memory for new_shift in GPU + d_new_shift->width = DIMENSIONS; + d_new_shift->height = NUMBER_OF_POINTS; + d_new_shift->stride = d_new_shift->width; + size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); + gpuErrchk( cudaMalloc(&(d_new_shift->elements), size) ); + // allocates memory for mean_shift_vector in GPU d_mean_shift_vector->width = DIMENSIONS; d_mean_shift_vector->height = NUMBER_OF_POINTS; @@ -367,10 +371,11 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi } void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, - Matrix d_shifted_points){ + Matrix d_shifted_points, Matrix d_new_shift){ // frees all memory previously allocated in device gpuErrchk( cudaFree(d_original_points.elements) ); gpuErrchk( cudaFree(d_kernel_matrix.elements) ); gpuErrchk( cudaFree(d_denominator.elements) ); gpuErrchk( cudaFree(d_shifted_points.elements) ); + gpuErrchk( cudaFree(d_new_shift.elements) ); } \ No newline at end of file diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h index 83f9784..5669ae6 100644 --- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h +++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.h @@ -1,5 +1,5 @@ -#ifndef SERIAL_GPU_UTILS_H /* Include guard */ -#define SERIAL_GPU_UTILS_H +#ifndef MEANSHIFT_GPU_UTILS_H /* Include guard */ +#define MEANSHIFT_GPU_UTILS_H #include "meanshift_kernels.h" @@ -33,7 +33,7 @@ int meanshift(double **original_points, double ***shifted_points, int h); //Function init_device_memory allocates memory for necessary arrays in the device void init_device_memory(double **original_points, double **shifted_points, Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix, - Matrix *d_denominator, Matrix *d_new_shift); + Matrix *d_denominator, Matrix *d_new_shift, Matrix *d_mean_shift_vector); //Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel //"calculate_kernel_matrix_kernel" that calculates the kernel matrix @@ -53,6 +53,6 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi //Function free_device_memory frees device's previously allocated memory void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator, - Matrix d_shifted_points); + Matrix d_shifted_points, Matrix d_new_shift); -#endif //SERIAL_GPU_UTILS_H \ No newline at end of file +#endif //MEANSHIFT_GPU_UTILS_H \ No newline at end of file diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu index 2181327..3b5cda5 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu @@ -68,8 +68,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix int col = threadIdx.y; // performs calculations only if thread's indexes are within matrix bounds - if ((ROW_BLOCK_SIZE * block_row + row) > new_shift.height || - (COLUMN_BLOCK_SIZE * block_col + col) > new_shift.width){ + if ((ROW_BLOCK_SIZE * block_row + row) >= new_shift.height || + (COLUMN_BLOCK_SIZE * block_col + col) >= new_shift.width){ return; } diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.h b/mean_shift_cuda_shared_mem/meanshift_kernels.h index db93809..65998d0 100644 --- a/mean_shift_cuda_shared_mem/meanshift_kernels.h +++ b/mean_shift_cuda_shared_mem/meanshift_kernels.h @@ -1,5 +1,5 @@ -#ifndef SERIAL_KERNELS_H /* Include guard */ -#define SERIAL_KERNELS_H +#ifndef MEANSHIFT_KERNELS_H /* Include guard */ +#define MEANSHIFT_KERNELS_H /* Structures */ @@ -27,4 +27,4 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix __device__ Matrix get_sub_matrix(Matrix A, int row, int col, int ROW_BLOCK_SIZE, int COLUMN_BLOCK_SIZE); -#endif //SERIAL_KERNELS_H \ No newline at end of file +#endif //MEANSHIFT_KERNELS_H \ No newline at end of file diff --git a/mean_shift_cuda_shared_mem/meanshift_utils.h b/mean_shift_cuda_shared_mem/meanshift_utils.h index 3be505a..d751695 100644 --- a/mean_shift_cuda_shared_mem/meanshift_utils.h +++ b/mean_shift_cuda_shared_mem/meanshift_utils.h @@ -1,5 +1,5 @@ -#ifndef SERIAL_UTILS_H /* Include guard */ -#define SERIAL_UTILS_H +#ifndef MEANSHIFT_UTILS_H /* Include guard */ +#define MEANSHIFT_UTILS_H #include <stdbool.h> @@ -32,4 +32,4 @@ void print_matrix(double **array, int rows, int cols); //If a file already exists new lines are concatenated void save_matrix(double **matrix, int iteration); -#endif //SERIAL_UTILS_H \ No newline at end of file +#endif //MEANSHIFT_UTILS_H \ No newline at end of file