Browse Source

Add calculate_kernel_matrix_kernel

master
Apostolos Fanakis 7 years ago
parent
commit
2ffa434bdc
  1. BIN
      mean_shift_cuda/meanshift
  2. 31
      mean_shift_cuda/meanshift_kernels.cu
  3. 3
      mean_shift_cuda/meanshift_kernels.h
  4. 105
      mean_shift_cuda/meanshift_utils.cu
  5. 6
      mean_shift_cuda/meanshift_utils.h

BIN
mean_shift_cuda/meanshift

Binary file not shown.

31
mean_shift_cuda/meanshift_kernels.cu

@ -16,3 +16,34 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
output.elements[row * output.width + col] = cell_value; output.elements[row * output.width + col] = cell_value;
} }
} }
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
, double deviation, Matrix kernel_matrix){
// Each thread calculates one element of kernel_matrix
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row * kernel_matrix.width + col > kernel_matrix.width * kernel_matrix.height){
return;
}
int dimensions = shifted_points.width;
double sum = 0, dif;
for (int i=0; i<dimensions; i++){
dif = shifted_points.elements[row * dimensions + i] - original_points.elements[col * dimensions + i];
sum += dif * dif;
}
double distance = sqrt(sum);
double deviation_square = deviation*deviation;
if (distance < deviation_square){
// computes kernel matrix
double pow = ((-1)*(distance * distance))/(2*(deviation_square));
kernel_matrix.elements[row * kernel_matrix.width + col] = exp(pow);
} else {
kernel_matrix.elements[row * kernel_matrix.width + col] = 0;
}
if (row == col){
kernel_matrix.elements[row * kernel_matrix.width + col] += 1;
}
}

3
mean_shift_cuda/meanshift_kernels.h

@ -10,4 +10,7 @@ typedef struct{
//Function multiply_kernel calculates the product of matrices 1 and 2 into output. //Function multiply_kernel calculates the product of matrices 1 and 2 into output.
__global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output); __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output);
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points
, double deviation, Matrix kernel_matrix);
#endif //SERIAL_KERNELS_H #endif //SERIAL_KERNELS_H

105
mean_shift_cuda/meanshift_utils.cu

@ -8,7 +8,6 @@
#include "meanshift_kernels.h" #include "meanshift_kernels.h"
#define OUTPUT_PREFIX "../output/output_" #define OUTPUT_PREFIX "../output/output_"
int BLOCK_SIZE = 16;
cudaDeviceProp device_properties; cudaDeviceProp device_properties;
@ -151,14 +150,13 @@ void set_Gpu(){
} }
// sets the device // sets the device
gpuErrchk( cudaSetDevice(max_device) ); gpuErrchk( cudaSetDevice(max_device) );
BLOCK_SIZE = device_properties.maxThreadsPerBlock;
if (params.verbose){ if (params.verbose){
printf("Device chosen is \"%s\"\n" printf("Device chosen is \"%s\"\n"
"Device has %d multi processors and compute capability %d.%d\n" "Device has %d multi processors and compute capability %d.%d\n"
"Setting BLOCK_SIZE to max threads per block supported (%d)\n\n" "Max threads per block supported are %d\n\n"
, device_properties.name , device_properties.name
, device_properties.multiProcessorCount, device_properties.major, device_properties.minor , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
, BLOCK_SIZE); , device_properties.maxThreadsPerBlock);
} }
} }
@ -188,23 +186,11 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
// finds pairwise distance matrix (inside radius) // finds pairwise distance matrix (inside radius)
// [I, D] = rangesearch(x,y,h); // [I, D] = rangesearch(x,y,h);
calculate_kernel_matrix((*shifted_points), original_points, deviation, &kernel_matrix);
// calculate denominator
for (int i=0; i<NUMBER_OF_POINTS; i++){ for (int i=0; i<NUMBER_OF_POINTS; i++){
double sum = 0; double sum = 0;
for (int j=0; j<NUMBER_OF_POINTS; j++){ for (int j=0; j<NUMBER_OF_POINTS; j++){
double distance = calculateDistance((*shifted_points)[i]
, original_points[j]);
double deviation_square = deviation*deviation;
if (distance < deviation_square){
// computes kernel matrix
double pow = ((-1)*(distance * distance))/(2*(deviation_square));
kernel_matrix[i][j] = exp(pow);
} else {
kernel_matrix[i][j] = 0;
}
if (i == j){
kernel_matrix[i][j] += 1;
}
sum = sum + kernel_matrix[i][j]; sum = sum + kernel_matrix[i][j];
} }
denominator[i] = sum; denominator[i] = sum;
@ -271,8 +257,78 @@ double norm(double **matrix, int rows, int cols){
return norm; return norm;
} }
void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
, double ***kernel_matrix){
static bool first_iter = true;
// allocates memory for shifted_points in GPU and copies the array
Matrix d_shifted_points;
d_shifted_points.width = DIMENSIONS;
d_shifted_points.height = NUMBER_OF_POINTS;
int size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_shifted_points.elements, size) );
gpuErrchk( cudaMemcpy(d_shifted_points.elements, &(shifted_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for original_points in GPU and copies the array
Matrix d_original_points;
d_original_points.width = DIMENSIONS;
d_original_points.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
, size, cudaMemcpyHostToDevice) );
// allocates memory for kernel_matrix in GPU
Matrix d_kernel_matrix;
d_kernel_matrix.width = NUMBER_OF_POINTS;
d_kernel_matrix.height = NUMBER_OF_POINTS;
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
// get max sizes supported from the device
int max_block_size = (int)sqrt(device_properties.maxThreadsPerBlock);
int requested_block_size = max_block_size;
bool block_size_too_big = true;
dim3 dimBlock;
dim3 dimGrid;
do {
dimBlock.x = requested_block_size;
dimBlock.y = requested_block_size;
dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;
calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
, deviation, d_kernel_matrix);
if (cudaGetLastError() != cudaSuccess){
--requested_block_size;
} else {
block_size_too_big = false;
gpuErrchk( cudaDeviceSynchronize() );
}
} while(block_size_too_big);
if (first_iter && params.verbose){
printf("calculate_kernel_matrix_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
first_iter = false;
}
size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
, size, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaFree(d_shifted_points.elements) );
gpuErrchk( cudaFree(d_original_points.elements) );
gpuErrchk( cudaFree(d_kernel_matrix.elements) );
}
void multiply(double **kernel_matrix, double **original_points, double ***new_shift){ void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
static bool firstIter = true; static bool first_iter = true;
// allocates memory for kernel_matrix in GPU and copies the array // allocates memory for kernel_matrix in GPU and copies the array
Matrix d_kernel_matrix; Matrix d_kernel_matrix;
@ -299,15 +355,18 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh
size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double); size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
gpuErrchk( cudaMalloc(&d_new_shift.elements, size) ); gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
dim3 dimBlock((d_new_shift.height < sqrt(BLOCK_SIZE)) ? d_new_shift.height : sqrt(BLOCK_SIZE) // get max sizes supported from the device
, (d_new_shift.width < sqrt(BLOCK_SIZE)) ? d_new_shift.width : sqrt(BLOCK_SIZE)); int max_block_size = device_properties.maxThreadsPerBlock;
dim3 dimBlock((d_new_shift.height < sqrt(max_block_size)) ? d_new_shift.height : sqrt(max_block_size)
, (d_new_shift.width < sqrt(max_block_size)) ? d_new_shift.width : sqrt(max_block_size));
dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
, (d_new_shift.width + dimBlock.y - 1) / dimBlock.y); , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
if (firstIter && params.verbose){ if (first_iter && params.verbose){
printf("multiply_kernel called with:\n");
printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y); printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y); printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
firstIter = false; first_iter = false;
} }
multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift); multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);

6
mean_shift_cuda/meanshift_utils.h

@ -7,8 +7,7 @@
//https://stackoverflow.com/a/14038590 //https://stackoverflow.com/a/14038590
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess) if (code != cudaSuccess){
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code); if (abort) exit(code);
} }
@ -47,6 +46,9 @@ int meanshift(double **original_points, double ***shifted_points, int h
//Function norm returns the second norm of matrix of dimensions rowsXcols. //Function norm returns the second norm of matrix of dimensions rowsXcols.
double norm(double **matrix, int rows, int cols); double norm(double **matrix, int rows, int cols);
void calculate_kernel_matrix(double **shifted_points, double **original_points, double deviation
, double ***kernel_matrix);
//Function multiply allocates memory in GPU, sends the data and calls the //Function multiply allocates memory in GPU, sends the data and calls the
//multiply kernel function. //multiply kernel function.
void multiply(double **kernel_matrix, double **original_points void multiply(double **kernel_matrix, double **original_points

Loading…
Cancel
Save