Browse Source

Fixes for shift_points_kernel

master
Apostolos Fanakis 7 years ago
parent
commit
fea441f712
  1. 2
      mean_shift_cuda_shared_mem/meanshift.cu
  2. 7
      mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
  3. 21
      mean_shift_cuda_shared_mem/meanshift_kernels.cu
  4. 2
      mean_shift_cuda_shared_mem/meanshift_kernels.h

2
mean_shift_cuda_shared_mem/meanshift.cu

@ -6,7 +6,7 @@
#include "meanshift_gpu_utils.h" #include "meanshift_gpu_utils.h"
int DEVIATION = 1; int DEVIATION = 1;
int NUMBER_OF_POINTS = 600; int NUMBER_OF_POINTS = 587;
int DIMENSIONS = 2; int DIMENSIONS = 2;
const char *POINTS_FILENAME = "../data/X.bin"; const char *POINTS_FILENAME = "../data/X.bin";
const char *LABELS_FILENAME = "../data/L.bin"; const char *LABELS_FILENAME = "../data/L.bin";

7
mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu

@ -304,14 +304,13 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix, Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
double **original_points, double ***new_shift, double ***mean_shift_vector, double **original_points, double ***new_shift, double ***mean_shift_vector,
double *w_memcpy_time){ double *w_memcpy_time){
static int min_dimension = min(d_new_shift.width, d_new_shift.height);
int shared_memory_size, size; int shared_memory_size, size;
static bool first_iter = true; static bool first_iter = true;
// gets max block size supported from the device // gets max block size supported from the device
static int max_block_size = device_properties.maxThreadsPerBlock; static int max_block_size = device_properties.maxThreadsPerBlock;
// requests a block size equal to // requests a block size equal to
static int requested_block_size = (min_dimension <= sqrt(max_block_size)) static int requested_block_size = (d_new_shift.width <= sqrt(max_block_size))
? min_dimension // the min dimension if it's lower than the maximum block size supported ? d_new_shift.width // the min dimension if it's lower than the maximum block size supported
: max_block_size; // the maximum block size otherwise : max_block_size; // the maximum block size otherwise
bool block_size_too_big = true; bool block_size_too_big = true;
@ -321,7 +320,7 @@ void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shi
dimBlock.x = requested_block_size; dimBlock.x = requested_block_size;
dimBlock.y = requested_block_size; dimBlock.y = requested_block_size;
dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x; dimGrid.x = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (d_new_shift.height + dimBlock.x - 1) / dimBlock.x; dimGrid.y = (d_new_shift.width + dimBlock.y - 1) / dimBlock.y;
// size for kernel's dynamically allocated array // size for kernel's dynamically allocated array
// the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double // the size FOR EACH array is calculated as BLOCK_SIZE * BLOCK_SIZE * size_of_double

21
mean_shift_cuda_shared_mem/meanshift_kernels.cu

@ -67,12 +67,13 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
int col = threadIdx.y; int col = threadIdx.y;
// performs calculations only if thread's indexes are within matrix bounds // performs calculations only if thread's indexes are within matrix bounds
if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){ if ((BLOCK_SIZE * block_row + row) >= new_shift.height ||
(BLOCK_SIZE * block_col + col) >= new_shift.width){
return; return;
} }
// each thread block computes one sub-matrix sub_new_shift of C // each thread block computes one sub-matrix sub_new_shift of C
Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE); Matrix sub_new_shift = get_sub_matrix(new_shift, block_row, block_col, BLOCK_SIZE);
// dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points // dynamically allocated shared memory used to store sub_kernel_matrix and sub_original_points
// respectively // respectively
@ -84,12 +85,14 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
// loops over all the sub-matrices of kernel_matrix and original_points that are required to // loops over all the sub-matrices of kernel_matrix and original_points that are required to
// compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results // compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) { for (int sub_matrix_index = 0;
sub_matrix_index < ((kernel_matrix.width + BLOCK_SIZE - 1) / BLOCK_SIZE);
++sub_matrix_index) {
// gets sub-matrix sub_kernel_matrix of kernel_matrix // gets sub-matrix sub_kernel_matrix of kernel_matrix
Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE); Matrix sub_kernel_matrix = get_sub_matrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE);
// gets sub-matrix sub_original_points of original_points // gets sub-matrix sub_original_points of original_points
Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE); Matrix sub_original_points = get_sub_matrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE);
// loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared
//memory, each thread loads one element of each sub-matrix //memory, each thread loads one element of each sub-matrix
@ -116,7 +119,6 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
sub_new_shift.elements[row * sub_new_shift.stride + col] = sub_new_shift.elements[row * sub_new_shift.stride + col] =
cell_value / denominator.elements[block_row * BLOCK_SIZE + row]; cell_value / denominator.elements[block_row * BLOCK_SIZE + row];
int cell_row = block_row * BLOCK_SIZE + row; int cell_row = block_row * BLOCK_SIZE + row;
int cell_col = block_col * BLOCK_SIZE + col; int cell_col = block_col * BLOCK_SIZE + col;
mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] = mean_shift_vector.elements[cell_row * mean_shift_vector.stride + cell_col] =
@ -124,10 +126,9 @@ __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix
shifted_points.elements[cell_row * shifted_points.stride + cell_col]; shifted_points.elements[cell_row * shifted_points.stride + cell_col];
} }
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is // gets the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is located col sub-matrices to the right
// located col sub-matrices to the right and row sub-matrices down // and row sub-matrices down from the upper-left corner of A
// from the upper-left corner of A __device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE){
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
Matrix Asub; Matrix Asub;
Asub.width = BLOCK_SIZE; Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE; Asub.height = BLOCK_SIZE;

2
mean_shift_cuda_shared_mem/meanshift_kernels.h

@ -24,6 +24,6 @@ __global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix, __global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector); Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE); __device__ Matrix get_sub_matrix(Matrix A, int row, int col, int BLOCK_SIZE);
#endif //SERIAL_KERNELS_H #endif //SERIAL_KERNELS_H
Loading…
Cancel
Save