diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu
index a82d1a3..e76c2bb 100644
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@@ -5,10 +5,10 @@
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 
-int DEVIATION = 31000;
-int NUMBER_OF_POINTS = 5000;
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 587;
 int DIMENSIONS = 2;
-char* POINTS_FILENAME = "../data/s4";
+const char *POINTS_FILENAME = "../data/X.bin";
 const char *LABELS_FILENAME = "../data/L.bin";
 
 parameters params;
diff --git a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
index 2a72557..6566df4 100644
--- a/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
@@ -40,6 +40,9 @@ void set_GPU(){
     }
     // sets the device
     gpuErrchk( cudaSetDevice(max_device) );
+    // lastly sets shared memory bank size to 8 bytes since data are represented by doubles
+    gpuErrchk( cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) );
+
     if (params.verbose){
         printf("Device chosen is \"%s\"\n"
             "Device has %d multi processors and compute capability %d.%d\n"
@@ -324,8 +327,8 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
 
     // size for kernel's dynamically allocated array
     // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double
-    // the arrays nedded in kernel are two
-    shared_memory_size = dimBlock.x * dimBlock.x * sizeof(double) * 2;
+    // the arrays needed in kernel are two
+    shared_memory_size = (int)(dimBlock.x * dimBlock.x * sizeof(double) * 2);
     shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
         d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
 
diff --git a/mean_shift_cuda_shared_mem/meanshift_kernels.cu b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
index d109d6b..9c2d948 100644
--- a/mean_shift_cuda_shared_mem/meanshift_kernels.cu
+++ b/mean_shift_cuda_shared_mem/meanshift_kernels.cu
@@ -55,7 +55,7 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
 
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
     Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
-    int BLOCK_SIZE = blockDim.y;
+    int BLOCK_SIZE = blockDim.x;
     int block_row = blockIdx.x;
     int block_col = blockIdx.y;
 
@@ -67,8 +67,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int col = threadIdx.y;
 
     // performs calculations only if thread's indexes are within matrix bounds
-    if ((BLOCK_SIZE * block_row + row) >= new_shift.height ||
-        (BLOCK_SIZE * block_col + col) >= new_shift.width){
+    if ((BLOCK_SIZE * block_row + row) > new_shift.height ||
+        (BLOCK_SIZE * block_col + col) > new_shift.width){
         return;
     }
 
@@ -78,17 +78,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
     // respectively
    extern __shared__ double joined_shared_memory[];
+    // makes sure enough memory has been allocated
+    joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0;
 
     // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
     // for s_sub_original_points
-    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
-    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
-
-    // cancel execution if allocation failed
-    if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){
-        __threadfence();
-        asm("trap;");
-    }
+    double *s_sub_kernel_matrix = (double *)joined_shared_memory;
+    double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE];
 
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
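Note (not part of the patch): the shared-memory changes above follow the standard CUDA pattern of a single dynamically sized extern __shared__ buffer, whose byte count is passed as the third launch parameter and which the kernel then partitions into sub-arrays by pointer offset. The standalone sketch below is a minimal illustration of that pattern plus the 8-byte bank configuration used in set_GPU(); names such as split_shared_demo and the toy sizes are hypothetical, not code from this repository.

#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// One dynamically allocated shared buffer split into two halves by pointer
// offset, mirroring how joined_shared_memory is carved into
// s_sub_kernel_matrix and s_sub_original_points in the patch.
__global__ void split_shared_demo(const double *in, double *out, int n){
    extern __shared__ double joined[];
    double *first  = joined;                            // first BLOCK_SIZE*BLOCK_SIZE doubles
    double *second = &joined[BLOCK_SIZE * BLOCK_SIZE];  // second BLOCK_SIZE*BLOCK_SIZE doubles

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;

    first[threadIdx.x]  = in[tid];
    second[threadIdx.x] = 2.0 * in[tid];
    __syncthreads();

    out[tid] = first[threadIdx.x] + second[threadIdx.x];
}

int main(){
    const int n = 256;

    // 8-byte banks suit double-typed shared memory, as in the set_GPU() change
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    double *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(double));
    cudaMalloc(&d_out, n * sizeof(double));

    double h_in[n], h_out[n];
    for (int i = 0; i < n; ++i) h_in[i] = (double)i;
    cudaMemcpy(d_in, h_in, n * sizeof(double), cudaMemcpyHostToDevice);

    // same sizing rule as shared_memory_size: two arrays of BLOCK_SIZE*BLOCK_SIZE doubles
    size_t shared_bytes = BLOCK_SIZE * BLOCK_SIZE * sizeof(double) * 2;
    int threads = BLOCK_SIZE * BLOCK_SIZE;
    int blocks  = (n + threads - 1) / threads;
    split_shared_demo<<<blocks, threads, shared_bytes>>>(d_in, d_out, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
    printf("out[10] = %f (expected %f)\n", h_out[10], 3.0 * h_in[10]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}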