
Fix shared memory version for odd number of points

Author: Apostolos Fanakis, 7 years ago
Branch: master
Commit: 736b71fded
3 changed files:
  1. mean_shift_cuda/meanshift.cu (6 lines changed)
  2. mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu (7 lines changed)
  3. mean_shift_cuda_shared_mem/meanshift_kernels.cu (18 lines changed)

mean_shift_cuda/meanshift.cu (6 lines changed)

@@ -5,10 +5,10 @@
 #include "meanshift_utils.h"
 #include "meanshift_gpu_utils.h"
 
-int DEVIATION = 31000;
-int NUMBER_OF_POINTS = 5000;
+int DEVIATION = 1;
+int NUMBER_OF_POINTS = 587;
 int DIMENSIONS = 2;
-char* POINTS_FILENAME = "../data/s4";
+const char *POINTS_FILENAME = "../data/X.bin";
 const char *LABELS_FILENAME = "../data/L.bin";
 
 parameters params;
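The new constants describe the replacement dataset: 587 two-dimensional points read from a binary file, with a labels file alongside. For orientation, a hedged sketch of a loader for such a file, assuming X.bin is a flat row-major array of NUMBER_OF_POINTS x DIMENSIONS doubles; the project's real loader is not part of this diff, and load_points is a hypothetical name:

    #include <stdio.h>
    #include <stdlib.h>

    // hypothetical loader, NOT the project's; assumes X.bin is a flat, row-major
    // dump of points * dims doubles (e.g. 587 * 2 for this dataset)
    static double *load_points(const char *filename, int points, int dims){
        FILE *f = fopen(filename, "rb");
        if (f == NULL){ perror(filename); return NULL; }
        size_t count = (size_t)points * (size_t)dims;
        double *data = (double *)malloc(count * sizeof(double));
        size_t got = fread(data, sizeof(double), count, f);
        fclose(f);
        if (got != count){ free(data); return NULL; }   // short read: wrong sizes or file
        return data;
    }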

mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu (7 lines changed)

@@ -40,6 +40,9 @@ void set_GPU(){
     }
     // sets the device
     gpuErrchk( cudaSetDevice(max_device) );
+    // lastly sets shared memory bank size to 8 bytes since data are represented by doubles
+    gpuErrchk( cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) );
+
     if (params.verbose){
         printf("Device chosen is \"%s\"\n"
             "Device has %d multi processors and compute capability %d.%d\n"
@@ -324,8 +327,8 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
     // size for kernel's dynamically allocated array
     // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double
-    // the arrays nedded in kernel are two
-    shared_memory_size = dimBlock.x * dimBlock.x * sizeof(double) * 2;
+    // the arrays needed in kernel are two
+    shared_memory_size = (int)(dimBlock.x * dimBlock.x * sizeof(double) * 2);
 
     shift_points_kernel<<<dimGrid, dimBlock, shared_memory_size>>>(d_original_points,
         d_kernel_matrix, d_shifted_points, d_new_shift, d_denominator, d_mean_shift_vector);
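The third launch-configuration argument is the number of dynamic shared memory bytes per block backing the kernel's extern __shared__ array, which is why the host computes room for both BLOCK_SIZE x BLOCK_SIZE double tiles here (size_t would arguably be a more natural type than the int cast). A self-contained sketch of the pattern, using a hypothetical scale_kernel rather than the project's kernel:

    #include <cstdio>
    #include <cuda_runtime.h>

    // the unsized extern declaration is backed by the bytes passed at launch
    __global__ void scale_kernel(const double *in, double *out, int n){
        extern __shared__ double tile[];
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        tile[threadIdx.x] = (i < n) ? in[i] : 0.0;   // every thread reaches the barrier
        __syncthreads();
        if (i < n) out[i] = 2.0 * tile[threadIdx.x]; // guard the partial last block
    }

    int main(void){
        const int n = 587, block = 256;              // deliberately not a multiple
        const int grid = (n + block - 1) / block;    // ceiling division covers the tail
        double *d_in, *d_out;
        cudaMalloc(&d_in, n * sizeof(double));
        cudaMalloc(&d_out, n * sizeof(double));
        cudaMemset(d_in, 0, n * sizeof(double));
        size_t shared_bytes = block * sizeof(double);  // one double per thread
        scale_kernel<<<grid, block, shared_bytes>>>(d_in, d_out, n);
        cudaDeviceSynchronize();
        printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(d_in); cudaFree(d_out);
        return 0;
    }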

mean_shift_cuda_shared_mem/meanshift_kernels.cu (18 lines changed)

@@ -55,7 +55,7 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
 
 __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
     Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
-    int BLOCK_SIZE = blockDim.y;
+    int BLOCK_SIZE = blockDim.x;
     int block_row = blockIdx.x;
     int block_col = blockIdx.y;
@@ -67,8 +67,8 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     int col = threadIdx.y;
 
     // performs calculations only if thread's indexes are within matrix bounds
-    if ((BLOCK_SIZE * block_row + row) >= new_shift.height ||
-        (BLOCK_SIZE * block_col + col) >= new_shift.width){
+    if ((BLOCK_SIZE * block_row + row) > new_shift.height ||
+        (BLOCK_SIZE * block_col + col) > new_shift.width){
         return;
     }
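For context on the commit title: the grid is sized by ceiling division, so when the point count is not a multiple of the block edge, the last block's threads run past the end of the matrices, which is what this guard filters out before any shared memory traffic. A small host-side sketch of the arithmetic, using the new 587-point dataset and an assumed block edge of 16:

    #include <stdio.h>

    int main(void){
        const int points = 587, block_edge = 16;   // 587 = 36 * 16 + 11: a partial last block
        // ceiling division: enough blocks to cover every point
        const int blocks = (points + block_edge - 1) / block_edge;   // 37
        printf("blocks = %d, threads cover rows 0..%d, valid rows are 0..%d\n",
               blocks, blocks * block_edge - 1, points - 1);
        return 0;
    }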
@@ -78,17 +78,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
     // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
     // respectively
     extern __shared__ double joined_shared_memory[];
-    // makes sure enough memory has been allocated
-    joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] = 0;
     // first part of the allocated memory is used for s_sub_kernel_matrix and second part is used
     // for s_sub_original_points
-    double *s_sub_kernel_matrix = &(joined_shared_memory[0]);
-    double *s_sub_original_points = &(joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE]);
-    // cancel execution if allocation failed
-    if (sizeof(s_sub_kernel_matrix) != BLOCK_SIZE * BLOCK_SIZE * 2){
-        __threadfence();
-        asm("trap;");
-    }
+    double *s_sub_kernel_matrix = (double *)joined_shared_memory;
+    double *s_sub_original_points = (double *)&joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE];
 
     // loops over all the sub-matrices of kernel_matrix and original_points that are required to
     // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
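Since a kernel can declare only one extern __shared__ array, splitting it by offset, as the rewritten declarations do, is the standard idiom. The deleted lines could not validate the allocation anyway: sizeof on a pointer yields the pointer's size, never the allocation's, and writing joined_shared_memory[BLOCK_SIZE * BLOCK_SIZE * 2] stores one element past the requested extent. A minimal sketch of the offset-split idiom, with a hypothetical two_buffers_kernel:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void two_buffers_kernel(double *out, int tile){
        // one dynamic allocation, carved into two equally sized regions by offset
        extern __shared__ double shared[];
        double *a = shared;                 // shared[0 .. tile*tile - 1]
        double *b = &shared[tile * tile];   // shared[tile*tile .. 2*tile*tile - 1]
        int t = threadIdx.x;
        if (t >= tile * tile) return;
        a[t] = 1.0;
        b[t] = 2.0;
        out[t] = a[t] + b[t];               // 3.0 in every covered slot
    }

    int main(void){
        const int tile = 16, n = tile * tile;
        double *d_out, h_out[tile * tile];
        cudaMalloc(&d_out, n * sizeof(double));
        // the launch must supply bytes for BOTH regions, as shift_points() does above
        two_buffers_kernel<<<1, n, 2 * n * sizeof(double)>>>(d_out, tile);
        cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
        printf("out[0] = %.1f (expect 3.0)\n", h_out[0]);
        cudaFree(d_out);
        return 0;
    }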
