Browse Source

Shared memory implementation init

master
Apostolos Fanakis 7 years ago
parent
commit
649e1d7850
  1. 43
      mean_shift_cuda_shared_mem/Makefile
  2. 45
      mean_shift_cuda_shared_mem/meanshift.cu
  3. 370
      mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu
  4. 58
      mean_shift_cuda_shared_mem/meanshift_gpu_utils.h
  5. 144
      mean_shift_cuda_shared_mem/meanshift_kernels.cu
  6. 29
      mean_shift_cuda_shared_mem/meanshift_kernels.h
  7. 165
      mean_shift_cuda_shared_mem/meanshift_utils.cu
  8. 35
      mean_shift_cuda_shared_mem/meanshift_utils.h

43
mean_shift_cuda_shared_mem/Makefile

@ -0,0 +1,43 @@
SHELL := /bin/bash

# ============================================
# COMMANDS

CC = nvcc
HOST_COMPILER = -ccbin gcc
CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets -lcublas
C_FLAGS = -lm -O3 -I.
COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
OBJ = meanshift.o meanshift_utils.o meanshift_gpu_utils.o meanshift_kernels.o
# meanshift_gpu_utils.h added: objects must rebuild when any project header
# changes, and the sources include this header too
DEPS = meanshift_utils.h meanshift_gpu_utils.h meanshift_kernels.h
RM = rm -f

# ==========================================
# TARGETS

EXECUTABLES = meanshift

.PHONY: all clean

all: $(EXECUTABLES)

# ==========================================
# DEPENDENCIES (HEADERS)

%.o: %.cu $(DEPS)
	$(CC) $(COMPILE_FLAGS) $< -o $@

.PRECIOUS: $(EXECUTABLES) $(OBJ)

# ==========================================
# EXECUTABLE (MAIN)

$(EXECUTABLES): $(OBJ)
	$(CC) $(LINK_FLAGS) $(OBJ) -o $@

clean:
	$(RM) *.o *~ $(EXECUTABLES)

45
mean_shift_cuda_shared_mem/meanshift.cu

@ -0,0 +1,45 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"
// Mean-shift bandwidth (standard deviation h of the Gaussian kernel);
// overridable via get_args
int DEVIATION = 1;
// Dataset dimensions; defaults match ../data/X.bin (600 points x 2 dims)
int NUMBER_OF_POINTS = 600;
int DIMENSIONS = 2;
// Binary input files: points as doubles, labels as 1-byte integers
const char *POINTS_FILENAME = "../data/X.bin";
const char *LABELS_FILENAME = "../data/L.bin";
// Session options (epsilon / verbose / display), shared via extern
parameters params;
// Wall-clock timing of the whole mean-shift run
struct timeval startwtime, endwtime;
double seq_time;
// Entry point: reads the dataset, runs mean-shift on the GPU, reports the
// number of recursions and total wall-clock time, then releases host memory.
int main(int argc, char **argv){
    int recursions = 0;
    double **vectors, **shifted_points;
    char *labels;

    params.epsilon = 0.0001;
    params.verbose = false;
    params.display = true;
    //get_args(argc, argv, &params); //commented out while in development

    // selects the GPU and loads points/labels from disk
    init(&vectors, &labels);

    // tic
    gettimeofday (&startwtime, NULL);

    recursions = meanshift(vectors, &shifted_points, DEVIATION);

    // toc
    gettimeofday (&endwtime, NULL);
    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);

    printf("\nTotal number of recursions = %d\n", recursions);
    printf("%s wall clock time = %f\n","Mean Shift", seq_time);

    // frees host allocations; each matrix is one contiguous data buffer
    // (row 0) plus a row-pointer array
    free(vectors[0]);
    free(vectors);
    free(shifted_points[0]);
    free(shifted_points);
    // labels buffer is malloc'd inside init and was previously leaked
    free(labels);

    return 0;
}

370
mean_shift_cuda_shared_mem/meanshift_gpu_utils.cu

@ -0,0 +1,370 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <sys/time.h>
#include <cublas_v2.h>
#include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"
cudaDeviceProp device_properties;
struct timeval start_w_time, end_w_time;
double seq;
//Based on:
// https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
//Selects the CUDA device with the most multiprocessors among those with
//compute capability >= 2.0 and caches its properties in device_properties.
void set_GPU(){
    int devices_count = 0, best_multiprocessors = 0, best_device = 0;

    // queries the number of CUDA-capable devices; gpuErrchk aborts on errors
    // such as "no device" or "no driver"
    gpuErrchk( cudaGetDeviceCount(&devices_count) );

    for (int index = 0; index < devices_count; ++index){
        cudaDeviceProp candidate;
        gpuErrchk( cudaGetDeviceProperties(&candidate, index) );

        // only devices with compute capability >= 2.0 can run this code;
        // among those, keeps the one with the most multiprocessors
        bool capable = (candidate.major >= 2 && candidate.minor >= 0);
        if (capable && candidate.multiProcessorCount > best_multiprocessors){
            // caches the winning device's properties for later launch tuning
            device_properties = candidate;
            best_multiprocessors = candidate.multiProcessorCount;
            best_device = index;
        }
    }

    gpuErrchk( cudaSetDevice(best_device) );

    if (params.verbose){
        printf("Device chosen is \"%s\"\n"
            "Device has %d multi processors and compute capability %d.%d\n"
            "Max threads per block supported are %d\n\n"
            , device_properties.name
            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
            , device_properties.maxThreadsPerBlock);
    }
}
//Recursively shifts original_points according to the mean-shift algorithm,
//storing the result through shifted_points. deviation is the bandwidth h.
//Returns the total number of recursions performed.
int meanshift(double **original_points, double ***shifted_points, int deviation){
    // host variables
    int size = 0;
    static int recursion = 0;
    // statics persist across recursion levels; w_memcpy_time accumulates the
    // device<->host copy time over the whole run (zero-initialized)
    static double **kernel_matrix, **mean_shift_vector, w_memcpy_time;
    double **new_shift, current_norm = 0, tmp_w_memcpy_time;
    bool is_first_recursion = false;
    // device variables
    static Matrix d_original_points, d_shifted_points, d_kernel_matrix, d_denominator,
        d_mean_shift_vector;
    Matrix d_new_shift;

    // allocates memory and copies original points on first recursion
    if (recursion == 0 || (*shifted_points) == NULL){
        is_first_recursion = true;

        // allocates memory for shifted points array and copies original points into it
        (*shifted_points) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        duplicate(original_points, NUMBER_OF_POINTS, DIMENSIONS, shifted_points);

        // allocates the mean shift vector, initialized to "infinity" so the
        // first convergence check can never pass prematurely
        mean_shift_vector = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
        for (int i=0; i<NUMBER_OF_POINTS; i++){
            for (int j=0; j<DIMENSIONS; j++){
                mean_shift_vector[i][j] = DBL_MAX;
            }
        }

        // allocates memory for kernel_matrix
        kernel_matrix = alloc_double(NUMBER_OF_POINTS, NUMBER_OF_POINTS);

        // tic
        gettimeofday (&start_w_time, NULL);
        // allocates corresponding memory in device
        init_device_memory(original_points, *shifted_points, &d_original_points, &d_shifted_points,
            &d_kernel_matrix, &d_denominator, &d_mean_shift_vector);
        // toc
        gettimeofday (&end_w_time, NULL);
        seq = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
            / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
        if (params.verbose){
            printf("Device memory allocation wall clock time = %f\n\n", seq);
        }
    }

    // finds pairwise distance matrix (inside radius)
    // [I, D] = rangesearch(x,y,h);
    calculate_kernel_matrix(d_shifted_points, d_original_points, d_kernel_matrix, deviation,
        &kernel_matrix, &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;

    // calculates denominator
    calculate_denominator(d_kernel_matrix, d_denominator);

    // creates new y vector, allocating host and device memory in every recursion
    new_shift = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    d_new_shift.width = DIMENSIONS;
    d_new_shift.height = NUMBER_OF_POINTS;
    // row pitch in elements; Matrix.stride is read by shift_points_kernel and
    // was previously left uninitialized
    d_new_shift.stride = DIMENSIONS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_new_shift.elements), size) );

    shift_points(d_kernel_matrix, d_original_points, d_shifted_points, d_new_shift, d_denominator,
        d_mean_shift_vector, kernel_matrix, original_points, &new_shift, &mean_shift_vector,
        &tmp_w_memcpy_time);
    w_memcpy_time += tmp_w_memcpy_time;

    // frees previously shifted points (both the contiguous data buffer and the
    // row-pointer array), they're now garbage
    free((*shifted_points)[0]);
    free(*shifted_points);
    gpuErrchk( cudaFree(d_shifted_points.elements) );
    // updates the caller's pointer to the newly shifted array
    // (was `shifted_points = &new_shift;`, which only reassigned the local
    // parameter and left the caller holding a freed pointer)
    *shifted_points = new_shift;
    d_shifted_points.elements = d_new_shift.elements;

    if (params.display){
        save_matrix((*shifted_points), recursion);
    }

    // calculates norm of the new mean shift vector in GPU using "cuBlas" library function
    // NOTE(review): shift_points_kernel currently has the mean-shift-vector
    // update commented out, so d_mean_shift_vector may hold stale data —
    // confirm before trusting this convergence test
    cublasHandle_t handle;
    cublasStatus_t cublas_status = cublasCreate(&handle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    cublas_status = cublasDnrm2(handle, NUMBER_OF_POINTS * DIMENSIONS, d_mean_shift_vector.elements,
        1, &current_norm);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }
    cublas_status = cublasDestroy(handle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS){
        exit(cublas_status);
    }

    if (params.verbose){
        printf("Recursion n. %d, error\t%f \n", recursion, current_norm);
    }

    // recurses until convergence
    if (current_norm > params.epsilon) {
        ++recursion;
        meanshift(original_points, shifted_points, deviation);
    }

    // the first (outermost) call cleans up everything shared across recursions
    if (is_first_recursion){
        if (params.verbose){
            printf("\nCopying between device and host wall clock time = %f\n", w_memcpy_time);
        }

        // cleans up allocations
        free(mean_shift_vector[0]);
        free(mean_shift_vector);
        free(kernel_matrix[0]);
        free(kernel_matrix);
        free_device_memory(d_original_points, d_kernel_matrix, d_denominator, d_shifted_points);
    }

    return recursion;
}
//Allocates device buffers for all matrices used by the algorithm and uploads
//the original and (initially identical) shifted point sets.
//Also initializes each Matrix's stride (row pitch in elements), which the
//kernels read but was previously left uninitialized.
void init_device_memory(double **original_points, double **shifted_points,
    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
    Matrix *d_denominator, Matrix *d_mean_shift_vector){
    int size;

    // allocates memory for original_points in GPU and copies the array
    d_original_points->width = DIMENSIONS;
    d_original_points->height = NUMBER_OF_POINTS;
    d_original_points->stride = DIMENSIONS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_original_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_original_points->elements, &(original_points[0][0])
        , size, cudaMemcpyHostToDevice) );

    // allocates memory for shifted_points in GPU and copies the array
    d_shifted_points->width = DIMENSIONS;
    d_shifted_points->height = NUMBER_OF_POINTS;
    d_shifted_points->stride = DIMENSIONS;
    size = DIMENSIONS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_shifted_points->elements), size) );
    gpuErrchk( cudaMemcpy(d_shifted_points->elements, &(shifted_points[0][0])
        , size, cudaMemcpyHostToDevice) );

    // allocates memory for kernel_matrix in GPU (NxN, computed on device)
    d_kernel_matrix->width = NUMBER_OF_POINTS;
    d_kernel_matrix->height = NUMBER_OF_POINTS;
    d_kernel_matrix->stride = NUMBER_OF_POINTS;
    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_kernel_matrix->elements), size) );

    // allocates memory for denominator in GPU (one value per point)
    d_denominator->width = 1;
    d_denominator->height = NUMBER_OF_POINTS;
    d_denominator->stride = 1;
    size = NUMBER_OF_POINTS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_denominator->elements), size) );

    // allocates memory for mean_shift_vector in GPU
    d_mean_shift_vector->width = DIMENSIONS;
    d_mean_shift_vector->height = NUMBER_OF_POINTS;
    d_mean_shift_vector->stride = DIMENSIONS;
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&(d_mean_shift_vector->elements), size) );
}
//Wrapper for calculate_kernel_matrix_kernel: picks a square block size that
//the device accepts (shrinking on launch failure), runs the kernel and copies
//the resulting NxN kernel matrix back to the host, timing only the copy.
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time){
    int size;
    static bool first_iter = true;
    // gets max block size supported from the device; statics keep the working
    // block size across recursions so the shrink loop runs only once
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)sqrt(max_block_size);
    bool block_size_too_big = true;
    dim3 dimBlock;
    dim3 dimGrid;

    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = requested_block_size;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = (d_kernel_matrix.width + dimBlock.y - 1) / dimBlock.y;

        calculate_kernel_matrix_kernel<<<dimGrid, dimBlock>>>(d_shifted_points, d_original_points
            , deviation, d_kernel_matrix);
        if (cudaGetLastError() != cudaSuccess){
            // retries with a smaller block; previously this decremented without
            // a floor and could spin forever once the size reached zero
            if (--requested_block_size < 1){
                fprintf(stderr, "calculate_kernel_matrix: no block size accepted by device\n");
                exit(EXIT_FAILURE);
            }
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);

    if (first_iter && params.verbose){
        printf("calculate_kernel_matrix_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
    // tic
    gettimeofday (&start_w_time, NULL);
    gpuErrchk( cudaMemcpy(&((*kernel_matrix)[0][0]), d_kernel_matrix.elements
        , size, cudaMemcpyDeviceToHost) );
    // toc
    gettimeofday (&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
}
//Wrapper for denominator_kernel: launches one thread per row of the kernel
//matrix (1-D grid), shrinking the block size on launch failure.
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator){
    static bool first_iter = true;
    // gets max block size supported from the device; static so the working
    // size is remembered across recursions
    static int requested_block_size = device_properties.maxThreadsPerBlock;
    bool block_size_too_big = true;
    dim3 dimBlock;
    dim3 dimGrid;

    do {
        dimBlock.x = requested_block_size;
        dimBlock.y = 1;
        dimGrid.x = (d_kernel_matrix.height + dimBlock.x - 1) / dimBlock.x;
        dimGrid.y = 1;

        denominator_kernel<<<dimGrid, dimBlock>>>(d_denominator, d_kernel_matrix);
        if (cudaGetLastError() != cudaSuccess){
            // retries with a smaller block; previously this decremented without
            // a floor and could spin forever once the size reached zero
            if (--requested_block_size < 1){
                fprintf(stderr, "calculate_denominator: no block size accepted by device\n");
                exit(EXIT_FAILURE);
            }
        } else {
            block_size_too_big = false;
            gpuErrchk( cudaDeviceSynchronize() );
        }
    } while(block_size_too_big);

    if (first_iter && params.verbose){
        printf("calculate_denominator called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }
}
//Wrapper for shift_points_kernel: launches the tiled point-shift kernel and
//copies the new point positions and the mean shift vector back to the host,
//timing only the copies.
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
    double **original_points, double ***new_shift, double ***mean_shift_vector,
    double *w_memcpy_time){
    int size;
    static bool first_iter = true;
    // gets max block size supported from the device (kept for when the
    // commented-out adaptive block sizing below is re-enabled)
    static int max_block_size = device_properties.maxThreadsPerBlock;
    static int requested_block_size = (int)(max_block_size / d_new_shift.width);
    dim3 dimBlock;
    dim3 dimGrid;

    /*dimBlock.x = requested_block_size;
    dimBlock.y = d_new_shift.width;*/
    // NOTE(review): block dimensions are hard-coded to 2x2 while in
    // development; the adaptive sizing above is commented out
    dimBlock.x = 2;
    dimBlock.y = 2;
    dimGrid.x = (d_denominator.height + dimBlock.x - 1) / dimBlock.x;
    dimGrid.y = 1;

    shift_points_kernel<<<dimGrid, dimBlock>>>(d_original_points, d_kernel_matrix, d_shifted_points,
        d_new_shift, d_denominator, d_mean_shift_vector);
    if (cudaGetLastError() != cudaSuccess){
        // with fixed block dimensions a failed launch can never succeed on a
        // retry; the previous retry loop decremented requested_block_size
        // (which the launch ignores) and spun forever
        fprintf(stderr, "shift_points: kernel launch failed\n");
        exit(EXIT_FAILURE);
    }
    gpuErrchk( cudaDeviceSynchronize() );

    if (first_iter && params.verbose){
        printf("shift_points_kernel called with:\n");
        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
        first_iter = false;
    }

    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    // tic
    gettimeofday (&start_w_time, NULL);
    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
        , size, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&((*mean_shift_vector)[0][0]), d_mean_shift_vector.elements
        , size, cudaMemcpyDeviceToHost) );
    // toc
    gettimeofday (&end_w_time, NULL);
    *w_memcpy_time = (double)((end_w_time.tv_usec - start_w_time.tv_usec)
        / 1.0e6 + end_w_time.tv_sec - start_w_time.tv_sec);
}
//Releases every device buffer previously allocated by init_device_memory
//(d_shifted_points holds the most recent d_new_shift allocation by the time
//this runs). The frees are independent, so order is irrelevant.
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
    Matrix d_shifted_points){
    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
    gpuErrchk( cudaFree(d_denominator.elements) );
    gpuErrchk( cudaFree(d_shifted_points.elements) );
    gpuErrchk( cudaFree(d_original_points.elements) );
}

58
mean_shift_cuda_shared_mem/meanshift_gpu_utils.h

@ -0,0 +1,58 @@
#ifndef SERIAL_GPU_UTILS_H /* Include guard */
#define SERIAL_GPU_UTILS_H

#include "meanshift_kernels.h"

//GPU error check snippet taken from:
// https://stackoverflow.com/a/14038590
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
    if (code != cudaSuccess){
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/* Global variables */
extern int DEVIATION;
extern int NUMBER_OF_POINTS;
extern int DIMENSIONS;
extern const char* POINTS_FILENAME;
extern const char* LABELS_FILENAME;
extern Parameters params;
extern cudaDeviceProp device_properties;

//Function set_GPU parses available GPU devices, selects the one with the most multi-processors for
//usage and stores its properties in global struct device_properties
void set_GPU();

//Function meanshift recursively shifts original points according to the mean-shift algorithm saving
//the result to shifted_points, h is the desirable deviation.
//Returns the total number of recursions performed.
int meanshift(double **original_points, double ***shifted_points, int h);

//Function init_device_memory allocates memory for necessary arrays in the device and uploads the
//point sets (last parameter renamed to match the definition, which takes the
//mean shift vector, not the new shift matrix)
void init_device_memory(double **original_points, double **shifted_points,
    Matrix *d_original_points, Matrix *d_shifted_points, Matrix *d_kernel_matrix,
    Matrix *d_denominator, Matrix *d_mean_shift_vector);

//Function calculate_kernel_matrix is a wrapper for the kernel call of the corresponding kernel
//"calculate_kernel_matrix_kernel" that calculates the kernel matrix
void calculate_kernel_matrix(Matrix d_shifted_points, Matrix d_original_points,
    Matrix d_kernel_matrix, double deviation, double ***kernel_matrix, double *w_memcpy_time);

//Function calculate_denominator is a wrapper for the kernel call of the corresponding kernel
//"calculate_denominator_kernel" that calculates the denominator of shifted points fraction
void calculate_denominator(Matrix d_kernel_matrix, Matrix d_denominator);

//Function shift_points is a wrapper for the kernel call of the corresponding kernel
//"shift_points_kernel" that shifts the positions of all points
void shift_points(Matrix d_kernel_matrix, Matrix d_original_points, Matrix d_shifted_points,
    Matrix d_new_shift, Matrix d_denominator, Matrix d_mean_shift_vector, double **kernel_matrix,
    double **original_points, double ***new_shift, double ***mean_shift_vector,
    double *w_memcpy_time);

//Function free_device_memory frees device's previously allocated memory
void free_device_memory(Matrix d_original_points, Matrix d_kernel_matrix, Matrix d_denominator,
    Matrix d_shifted_points);

#endif //SERIAL_GPU_UTILS_H

144
mean_shift_cuda_shared_mem/meanshift_kernels.cu

@ -0,0 +1,144 @@
#include "meanshift_kernels.h"
#include <stdio.h>
#include <stdlib.h>
//Each thread computes one element [row][col] of the NxN kernel matrix: a
//Gaussian weight of the distance between shifted point `row` and original
//point `col`, with +1 added on the diagonal.
//Expects a 2-D launch where blockIdx.x/threadIdx.x index rows and
//blockIdx.y/threadIdx.y index columns.
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
    double deviation, Matrix kernel_matrix){
    // each thread calculates one element of kernel_matrix
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;

    // performs calculations only if thread's indexes are within matrix bounds;
    // the previous flattened check (row*width + col >= width*height) let
    // threads with col >= width but small row through, writing into the
    // wrong row of the matrix
    if (row >= kernel_matrix.height || col >= kernel_matrix.width){
        return;
    }

    int dimensions = shifted_points.width;

    // calculate euclidean distance between the two points
    double sum = 0, dif;
    for (int i=0; i<dimensions; i++){
        dif = shifted_points.elements[row * dimensions + i]
            - original_points.elements[col * dimensions + i];
        sum += dif * dif;
    }
    double distance = sqrt(sum);

    double deviation_square = deviation*deviation;
    // NOTE(review): the (non-squared) distance is compared against the
    // SQUARED deviation here — confirm this matches the intended
    // rangesearch radius h
    if (distance < deviation_square){
        // computes kernel matrix (Gaussian kernel)
        double pow = ((-1)*(distance * distance))/(2*(deviation_square));
        kernel_matrix.elements[row * kernel_matrix.width + col] = exp(pow);
    } else {
        kernel_matrix.elements[row * kernel_matrix.width + col] = 0;
    }
    // a point always fully contributes to itself
    if (row == col){
        kernel_matrix.elements[row * kernel_matrix.width + col] += 1;
    }
}
//One thread per row: sums that row of kernel_matrix and stores the result in
//denominator.elements[row]. Expects a 1-D launch over denominator.height.
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    // threads past the last row do nothing
    if (row >= denominator.height){
        return;
    }

    // accumulates the row sum locally before a single global write
    double row_sum = 0;
    for (int col = 0; col < kernel_matrix.width; ++col){
        row_sum += kernel_matrix.elements[row * kernel_matrix.width + col];
    }
    denominator.elements[row] = row_sum;
}
//Tiled matrix multiplication new_shift = (kernel_matrix x original_points) / denominator,
//modeled on the CUDA Programming Guide shared-memory matmul example.
//Each block computes one BLOCK_SIZE x BLOCK_SIZE tile of new_shift.
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){
    // tile edge length taken from the launch configuration
    // (assumes blockDim.x == blockDim.y == BLOCK_SIZE — TODO confirm)
    int BLOCK_SIZE = blockDim.y;
    int block_row = blockIdx.x;
    int block_col = blockIdx.y;
    // each thread computes one element of new_shift by accumulating results into cell_value
    double cell_value = 0;
    // Thread row and column within sub_new_shift
    int row = threadIdx.x;
    int col = threadIdx.y;

    // performs calculations only if thread's indexes are within matrix bounds
    //if (row * new_shift.width + col >= new_shift.width * new_shift.height){
    /*if (new_shift.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col >=
        new_shift.width * new_shift.height){*/
    if (BLOCK_SIZE * block_row >= new_shift.height || BLOCK_SIZE * block_col >= new_shift.width){
        return;
    }

    // Each thread block computes one sub-matrix sub_new_shift of C
    Matrix sub_new_shift = GetSubMatrix(new_shift, block_row, block_col, BLOCK_SIZE);

    // shared memory used to store sub_kernel_matrix and sub_original_points respectively
    // NOTE(review): only the POINTERS are __shared__ here; every thread then
    // overwrites them with its own device-heap malloc, so (a) the threads race
    // on the shared pointer, (b) the buffers actually live in global memory,
    // not shared memory, and (c) the allocations are never freed (the frees
    // below are commented out), leaking device heap on every launch. A
    // fixed-size __shared__ array or dynamic shared memory via the third
    // launch parameter appears to be the intent — TODO confirm and fix.
    __shared__ double *s_sub_kernel_matrix;
    s_sub_kernel_matrix = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));
    __shared__ double *s_sub_original_points;
    s_sub_original_points = (double*)malloc(BLOCK_SIZE * BLOCK_SIZE * sizeof(double));

    // loops over all the sub-matrices of kernel_matrix and original_points that are required to
    //compute sub_new_shift, multiplies each pair of sub-matrices and accumulates the results
    // NOTE(review): the loop count kernel_matrix.width / BLOCK_SIZE truncates,
    // so a remainder tile is skipped when width is not a multiple of
    // BLOCK_SIZE — verify for general matrix sizes
    for (int sub_matrix_index = 0; sub_matrix_index < (kernel_matrix.width / BLOCK_SIZE); ++sub_matrix_index) {
        // gets sub-matrix sub_kernel_matrix of kernel_matrix
        Matrix sub_kernel_matrix = GetSubMatrix(kernel_matrix, block_row, sub_matrix_index, BLOCK_SIZE);
        // gets sub-matrix sub_original_points of original_points
        Matrix sub_original_points = GetSubMatrix(original_points, sub_matrix_index, block_col, BLOCK_SIZE);

        // loads s_sub_kernel_matrix and s_sub_original_points from device global memory to shared
        //memory, each thread loads one element of each sub-matrix
        // (the .stride used below equals BLOCK_SIZE, as set by GetSubMatrix)
        s_sub_kernel_matrix[row * BLOCK_SIZE + col] =
            sub_kernel_matrix.elements[row * sub_kernel_matrix.stride + col];
        s_sub_original_points[row * BLOCK_SIZE + col] =
            sub_original_points.elements[row * sub_original_points.stride + col];

        // synchronizes to make sure the sub-matrices are loaded before starting the computation
        __syncthreads();

        // multiplies sub_kernel_matrix and sub_original_points
        for (int element_index = 0; element_index < BLOCK_SIZE; ++element_index){
            cell_value += s_sub_kernel_matrix[row * sub_kernel_matrix.stride + element_index] *
                s_sub_original_points[element_index * sub_original_points.stride + col];
        }

        // synchronizes to make sure that the preceding computation is done before loading two new
        // sub-matrices of kernel_matrix and original_points in the next iteration
        __syncthreads();
    }

    // new_shift elements are calculated by dividing with the denominator
    int cell_row = (block_row * BLOCK_SIZE + row) * new_shift.stride;
    int cell_col = block_col * BLOCK_SIZE + col;
    //sub_new_shift.elements[cell_row + cell_col] = cell_value / denominator.elements[cell_row];
    sub_new_shift.elements[row * sub_new_shift.stride + col] =
        cell_value / denominator.elements[block_row * BLOCK_SIZE + row];

    // calculates mean-shift vector
    // NOTE(review): this update is commented out, so the mean_shift_vector
    // output parameter is never written by this kernel — the host-side
    // convergence norm is computed on whatever the buffer already held
    /*mean_shift_vector.elements[(block_row * BLOCK_SIZE + row) * mean_shift_vector.stride
        + (block_col * BLOCK_SIZE + col)] =
        sub_new_shift.elements[row * sub_new_shift.stride + col] -
        shifted_points.elements[(block_row * BLOCK_SIZE + row) * shifted_points.stride
        + (block_col * BLOCK_SIZE + col)];*/

    /*free(s_sub_kernel_matrix);
    free(s_sub_original_points);*/
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
// NOTE(review): the CUDA Programming Guide version of this helper sets
// Asub.stride = A.stride so that indexing rows inside the sub-matrix uses the
// parent's row pitch; with BLOCK_SIZE here, `elements[r * stride + c]` only
// addresses the parent correctly when A.stride == BLOCK_SIZE — verify against
// the callers in shift_points_kernel.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE){
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = BLOCK_SIZE;
    // the base-pointer offset DOES use the parent pitch A.stride
    Asub.elements = &(A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]);
    return Asub;
}

29
mean_shift_cuda_shared_mem/meanshift_kernels.h

@ -0,0 +1,29 @@
#ifndef SERIAL_KERNELS_H /* Include guard */
#define SERIAL_KERNELS_H

/* Structures */
//Matrix is used to describe matrices stored as a single contiguous row-major
//buffer of width*height doubles (host or device)
typedef struct {
    int width;      // number of columns
    int height;     // number of rows
    int stride;     // row pitch in elements; read by shift_points_kernel and
                    // GetSubMatrix — NOTE(review): ensure every Matrix has
                    // this field initialized before a kernel reads it
    double *elements;   // pointer to the width*height element buffer
} Matrix;

//Kernel calculate_kernel_matrix_kernel calculates the current kernel matrix
//(one thread per matrix element; 2-D launch over height x width)
__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,
    double deviation, Matrix kernel_matrix);

//Kernel denominator_kernel calculates the sum in the denominator of the fraction used to find new
//(shifted) positions of the points (one thread per row; 1-D launch)
__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix);

//Kernel shift_points_kernel shifts the positions of all points and calculates the new mean shift
//vector according to the new point array (tiled matrix multiply; one block
//per BLOCK_SIZE x BLOCK_SIZE tile of new_shift)
__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,
    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector);

//Device helper returning the (row, col) BLOCK_SIZE x BLOCK_SIZE sub-matrix view of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col, int BLOCK_SIZE);

#endif //SERIAL_KERNELS_H

165
mean_shift_cuda_shared_mem/meanshift_utils.cu

@ -0,0 +1,165 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include "meanshift_utils.h"
#include "meanshift_gpu_utils.h"
#define OUTPUT_PREFIX "../output/output_"
//Parses command line arguments into the global configuration and params.
//Exits with a usage message when mandatory arguments are missing, and rejects
//non-positive numeric values (atoi/atof silently return 0 on parse failure,
//which previously slipped through as a valid configuration).
void get_args(int argc, char **argv, parameters *params){
    if (argc < 7) {
        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
            "\th is the variance\n"
            "\te is the min distance, between two points, that is taken into account in computations\n"
            "\tN is the the number of points\n"
            "\tD is the number of dimensions of each point\n"
            "\tPd is the path of the dataset file\n"
            "\tPl is the path of the labels file\n"
            "\n\t--verbose | -v is an optional flag to enable execution information output"
            "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]);
        exit(1);
    }

    DEVIATION = atoi(argv[1]);
    params->epsilon = atof(argv[2]);
    NUMBER_OF_POINTS = atoi(argv[3]);
    DIMENSIONS = atoi(argv[4]);
    POINTS_FILENAME = argv[5];
    LABELS_FILENAME = argv[6];

    // rejects unparsable or non-positive numeric arguments
    if (DEVIATION <= 0 || params->epsilon <= 0 || NUMBER_OF_POINTS <= 0 || DIMENSIONS <= 0){
        printf("Invalid numeric argument: h, e, N and D must all be positive numbers.\n");
        exit(EXIT_FAILURE);
    }

    params->verbose = false;
    params->display = false;
    // optional flags may follow the six mandatory arguments in any order
    if (argc > 7){
        for (int index=7; index<argc; ++index){
            if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
                params->verbose = true;
            } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
                params->display = true;
            } else {
                printf("Couldn't parse argument %d: %s\n", index, argv[index]);
                exit(EXIT_FAILURE);
            }
        }
    }
}
//Selects the GPU, then reads the points dataset (NUMBER_OF_POINTS x
//DIMENSIONS doubles) into *vectors and the label bytes into *labels.
//Exits on any I/O failure.
void init(double ***vectors, char **labels){
    int bytes_read = 0;

    set_GPU();

    if (params.verbose){
        printf("Reading dataset and labels...\n");
    }

    // initializes vectors
    FILE *points_file = fopen(POINTS_FILENAME, "rb");
    if (points_file == NULL){
        printf("Error reading dataset file.\n");
        exit(EXIT_FAILURE);
    }
    // allocates memory for the array
    (*vectors) = alloc_double(NUMBER_OF_POINTS, DIMENSIONS);
    // reads vectors dataset from file, one row (point) at a time
    for (int i=0; i<NUMBER_OF_POINTS; i++){
        bytes_read = fread((*vectors)[i], sizeof(double), DIMENSIONS, points_file);
        if ( bytes_read != DIMENSIONS ){
            if(feof(points_file)){
                printf("Premature end of file reached.\n");
            } else{
                printf("Error reading points file.");
            }
            fclose(points_file);
            exit(EXIT_FAILURE);
        }
    }
    fclose(points_file);

    // initializes file that will contain the labels (train)
    FILE *labels_file = fopen(LABELS_FILENAME, "rb");
    if (labels_file == NULL){
        // previously this fell through to fclose(NULL) — undefined behavior
        printf("Error reading labels file.\n");
        exit(EXIT_FAILURE);
    }
    // NOTE : Labels were classified as <class 'numpy.uint8'>
    // variables of type uint8 are stored as 1-byte (8-bit) unsigned integers
    // gets number of labels from the file size
    fseek(labels_file, 0L, SEEK_END);
    long int pos = ftell(labels_file);
    rewind(labels_file);
    int label_elements = pos/ sizeof(char);

    // allocates memory for the array
    *labels = (char*)malloc(label_elements* sizeof(char));
    fseek(labels_file, 0L, SEEK_SET);
    bytes_read = fread((*labels), sizeof(char), label_elements, labels_file);
    if ( bytes_read != label_elements ){
        // previously checked feof(points_file) (already closed) and reported
        // "points file" — both now refer to the labels file
        if(feof(labels_file)){
            printf("Premature end of file reached.\n");
        } else{
            printf("Error reading labels file.");
        }
        fclose(labels_file);
        exit(EXIT_FAILURE);
    }
    fclose(labels_file);

    if (params.verbose){
        printf("Done.\n\n");
    }
}
//Allocates a rows x cols matrix of doubles as one contiguous data buffer plus
//an array of row pointers, so it can be freed (or copied to the device) via
//array[0] in a single operation. Exits on allocation failure, which was
//previously unchecked and would crash later at first use.
double **alloc_double(int rows, int cols) {
    double *data = (double *) malloc(rows*cols*sizeof(double));
    double **array = (double **) malloc(rows*sizeof(double*));
    if (data == NULL || array == NULL){
        fprintf(stderr, "alloc_double: out of memory allocating %dx%d doubles\n", rows, cols);
        exit(EXIT_FAILURE);
    }
    for (int i=0; i<rows; i++)
        array[i] = &(data[cols*i]);
    return array;
}
//Copies every element of the rows x cols matrix source into the
//already-allocated matrix *dest, element by element.
void duplicate(double **source, int rows, int cols, double ***dest){
    for (int row = 0; row < rows; ++row){
        for (int col = 0; col < cols; ++col){
            (*dest)[row][col] = source[row][col];
        }
    }
}
//Prints the rows x cols matrix to the console, transposed: each output line
//corresponds to one COLUMN of `array` (column-major display).
void print_matrix(double **array, int rows, int cols){
    for (int col = 0; col < cols; ++col){
        for (int row = 0; row < rows; ++row){
            printf("%f ", array[row][col]);
        }
        printf("\n");
    }
}
//Writes the current NUMBER_OF_POINTS x DIMENSIONS matrix as CSV to
//"../output/output_<iteration>", overwriting any existing file.
void save_matrix(double **matrix, int iteration){
    char filename[50];
    snprintf(filename, sizeof(filename), "%s%d", "../output/output_", iteration);

    FILE *file = fopen(filename, "w");
    if (file == NULL){
        // best-effort: skip saving this iteration instead of crashing on
        // fprintf(NULL, ...) as the unchecked version would
        fprintf(stderr, "save_matrix: could not open %s for writing\n", filename);
        return;
    }

    for (int rows=0; rows<NUMBER_OF_POINTS; ++rows){
        for (int cols=0; cols<DIMENSIONS; ++cols){
            fprintf(file, "%f", matrix[rows][cols]);
            if (cols != DIMENSIONS - 1){
                fprintf(file, ",");
            }
        }
        fprintf(file, "\n");
    }

    // the stream was previously leaked (never closed), risking lost output
    fclose(file);
}

35
mean_shift_cuda_shared_mem/meanshift_utils.h

@ -0,0 +1,35 @@
#ifndef SERIAL_UTILS_H /* Include guard */
#define SERIAL_UTILS_H

#include <stdbool.h>

/* Structures */
//Parameters is used to store session specific variables in an orderly way
typedef struct parameters {
    double epsilon;     // convergence threshold for the mean shift vector norm
    bool verbose;       // when true, prints execution information
    bool display;       // when true, saves the points to disk on each iteration
} Parameters;

//Function get_args parses command line arguments
void get_args(int argc, char **argv, Parameters *params);

//Function init sets up the GPU for later use, gets its properties and reads the dataset and label
//arrays from the corresponding files
void init(double ***vectors, char **labels);

//Function alloc_double allocates a rows x cols matrix of doubles as one contiguous
//data buffer plus a row-pointer array (free via array[0] and array)
double **alloc_double(int rows, int cols);

//Function duplicate copies the values of source array to dest array
void duplicate(double **source, int rows, int cols, double ***dest);

//Function print_matrix prints array of dimensions <rows X cols> to the console
void print_matrix(double **array, int rows, int cols);

//Function save_matrix stores matrix in a csv file with path/filename "../output/output_iteration".
//Any existing file with the same name is overwritten (the file is opened with mode "w")
void save_matrix(double **matrix, int iteration);

#endif //SERIAL_UTILS_H
Loading…
Cancel
Save