Minor tweaks

8 years ago · 106a7f449f
6 changed files with 150 additions and 46 deletions
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@ -3,13 +3,19 @@ SHELL := /bin/bash
 # ============================================
 # COMMANDS

-CC = /usr/local/cuda/bin/nvcc
-RM = rm -f
+CC = nvcc
 HOST_COMPILER = -ccbin gcc
-CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets
+CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets 
+C_FLAGS = -lm -O3 -I.
+
+COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
+LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
+
 OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
 DEPS = meanshift_utils.h meanshift_kernels.h

+RM = rm -f
+
 # ==========================================
 # TARGETS

@ -23,7 +29,7 @@ all: $(EXECUTABLES)
 # DEPENDENCIES (HEADERS)

 %.o: %.cu $(DEPS)
-	$(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@
+	$(CC) $(COMPILE_FLAGS) $< -o $@

 .PRECIOUS: $(EXECUTABLES) $(OBJ)

@ -31,7 +37,7 @@ all: $(EXECUTABLES)
 # EXECUTABLE (MAIN)

 $(EXECUTABLES): $(OBJ)
-	$(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@
+	$(CC) $(LINK_FLAGS) $(OBJ) -o $@

 clean:
 	$(RM) *.o *~ $(EXECUTABLES)
--- a/mean_shift_cuda/meanshift
+++ b/mean_shift_cuda/meanshift
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@ -9,30 +9,38 @@ int NUMBER_OF_POINTS = 600;
 int DIMENSIONS = 2;
 char* POINTS_FILENAME = "../data/X.bin";
 char* LABELS_FILENAME = "../data/L.bin";
+parameters params;

 struct timeval startwtime, endwtime;
 double seq_time;

 int main(int argc, char **argv){
+    int iterations;
    double **vectors, **shifted_points;
    char *labels;
-    parameters params;

-    //get_args(argc, argv); commented out while in development
-    init(&vectors, &labels, &params);
+    params.epsilon = 0.0001;
+    params.verbose = true;
+    params.display = true;
+
+    //get_args(argc, argv, &params); //commented out while in development
+    init(&vectors, &labels);

    //save_matrix(vectors, 0);

    // tic
    gettimeofday (&startwtime, NULL);

-    int iterations = meanshift(vectors, &shifted_points, DEVIATION, &params);
-    printf("Total iterations = %d\n", iterations);
+    iterations = meanshift(vectors, &shifted_points, DEVIATION, &params);

    // toc
    gettimeofday (&endwtime, NULL);
    seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
+    
+    if (params.verbose){
+        printf("\nTotal number of iterations = %d\n", iterations);
        printf("%s wall clock time = %f\n","Mean Shift", seq_time);
+    }

    //TODO write output points to file -> plot later
    //save_matrix(shifted_points, iterations);
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@ -5,8 +5,8 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
    // Each thread computes one element of output
    // by accumulating results into cell_value
    double cell_value = 0;
-    int col = blockIdx.y * blockDim.y + threadIdx.y;
    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;

    if (row + col < output.height * output.width){
        for (int element_index = 0; element_index < matrix1.width; ++element_index){
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@ -8,28 +8,66 @@
 #include "meanshift_kernels.h"

 #define OUTPUT_PREFIX "../output/output_"
-#define BLOCK_SIZE 8
-
-void get_args(int argc, char **argv){
-    if (argc != 6) {
-        printf("Usage: %s h N D Pd Pl\nwhere:\n", argv[0]);
-        printf("\th is the variance\n");
-        printf("\tN is the the number of points\n");
-        printf("\tD is the number of dimensions of each point\n");
-        printf("\tPd is the path of the dataset file\n");
-        printf("\tPl is the path of the labels file\n");
+int BLOCK_SIZE = 16;
+
+cudaDeviceProp device_properties;
+
+void get_args(int argc, char **argv, parameters *params){
+    if (argc < 7) {
+        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
+        "\th is the variance\n"
+        "\te is the min distance, between two points, that is taken into account in computations\n"
+        "\tN is the the number of points\n"
+        "\tD is the number of dimensions of each point\n"
+        "\tPd is the path of the dataset file\n"
+        "\tPl is the path of the labels file\n"
+        "\n\t--verbose | -v is an optional flag to enable execution information output"
+        "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]);
        exit(1);
    }

    DEVIATION = atoi(argv[1]);
-    NUMBER_OF_POINTS = atoi(argv[2]);
-    DIMENSIONS = atoi(argv[3]);
-    POINTS_FILENAME = argv[4];
-    LABELS_FILENAME = argv[5];
+    params->epsilon = atof(argv[2]);
+    NUMBER_OF_POINTS = atoi(argv[3]);
+    DIMENSIONS = atoi(argv[4]);
+    POINTS_FILENAME = argv[5];
+    LABELS_FILENAME = argv[6];
+    params->verbose = false;
+    params->display = false;
+
+    if (argc > 7){
+        for (int index=7; index<argc; ++index){
+            if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
+                params->verbose = true;
+            } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
+                params->display = true;
+            } else {
+                printf("Couldn't parse argument %d: %s\n", index, argv[index]);
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+
+    /*printf("DEVIATION = %d\n"
+        "epsilon = %f\n"
+        "NUMBER_OF_POINTS = %d\n"
+        "DIMENSIONS = %d\n"
+        "POINTS_FILENAME = %s\n"
+        "LABELS_FILENAME = %s\n"
+        "verbose = %d\n"
+        "display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME
+            , LABELS_FILENAME, params->verbose, params->display);*/
 }

-void init(double ***vectors, char **labels, parameters *params){
+void init(double ***vectors, char **labels){
    int bytes_read = 0;
+
+    set_Gpu();
+
+    if (params.verbose){
+        printf("Reading dataset and labels...\n");
+    }
+
    // initializes vectors
    FILE *points_file;
    points_file = fopen(POINTS_FILENAME, "rb");
@ -83,10 +121,45 @@ void init(double ***vectors, char **labels, parameters *params){
    }
    fclose(labels_file);

-    // MEAN SHIFT OPTIONS
-    params->epsilon = 0.0001;
-    params->verbose = false;
-    params->display = false;
+    if (params.verbose){
+        printf("Done.\n\n");
+    }
+}
+
+//Based on https://stackoverflow.com/a/28113186
+//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
+void set_Gpu(){
+    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
+
+    // gets devices count checking for errors like no devices or no drivers to check for
+    // devices available
+    gpuErrchk( cudaGetDeviceCount(&devices_count) );
+    for(int device_index = 0; device_index < devices_count; ++device_index){
+        // gets current index device's properties
+        cudaDeviceProp this_device_properties;
+        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
+
+        // stores best available device's index
+        // only devices with compute capability >= 2.0 are able to run the code
+        if (max_multiprocessors < this_device_properties.multiProcessorCount
+            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
+            // stores devices properties for later use
+            device_properties = this_device_properties;
+            max_multiprocessors = this_device_properties.multiProcessorCount;
+            max_device = device_index;
+        }
+    }
+    // sets the device
+    gpuErrchk( cudaSetDevice(max_device) );
+    BLOCK_SIZE = device_properties.maxThreadsPerBlock;
+    if (params.verbose){
+        printf("Device chosen is \"%s\"\n"
+            "Device has %d multi processors and compute capability %d.%d\n"
+            "Setting BLOCK_SIZE to max threads per block supported (%d)\n\n"
+            , device_properties.name
+            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
+            , BLOCK_SIZE);
+    }
 }

 int meanshift(double **original_points, double ***shifted_points, int deviation
@ -157,11 +230,15 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
    // updates shifted points pointer to the new array address
    shifted_points = &new_shift;

+    if (params.display){
        save_matrix((*shifted_points), iteration);
+    }

    // calculates norm of the new mean shift vector
    double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
+    if (params.verbose){
        printf("Iteration n. %d, error %f \n", iteration, current_norm);
+    }

    /** iterates until convergence **/
    if (current_norm > opt->epsilon) {
@ -170,7 +247,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
    }

    if (iteration == 0){
-        // cleans up this iteration's allocations
+        // cleans up allocations
        free(mean_shift_vector[0]);
        free(mean_shift_vector);
        free(kernel_matrix[0]);
@ -195,6 +272,8 @@ double norm(double **matrix, int rows, int cols){
 }

 void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
+    static bool firstIter = true;
+
 	// allocates memory for kernel_matrix in GPU and copies the array
 	Matrix d_kernel_matrix;
    d_kernel_matrix.width = NUMBER_OF_POINTS;
@ -220,11 +299,16 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh
    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
    gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );

-    //dim3 dimBlock(16, 16);
-    //dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
+    dim3 dimBlock((d_new_shift.height < sqrt(BLOCK_SIZE)) ? d_new_shift.height : sqrt(BLOCK_SIZE)
+        , (d_new_shift.width < sqrt(BLOCK_SIZE)) ? d_new_shift.width : sqrt(BLOCK_SIZE));
+    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
+        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);

-    dim3 dimBlock(10, 2);
-    dim3 dimGrid(60, 1);
+    if (firstIter && params.verbose){
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        firstIter = false;
+    }

    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
    gpuErrchk( cudaPeekAtLastError() );
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@ -14,23 +14,29 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
   }
 }

-extern int DEVIATION;
-extern int NUMBER_OF_POINTS;
-extern int DIMENSIONS;
-extern char* POINTS_FILENAME;
-extern char* LABELS_FILENAME;
-
+/*        Structs       */
 typedef struct parameters {
    double epsilon;
    bool verbose;
    bool display;
 } parameters;

+/*        Global variables        */
+extern int DEVIATION;
+extern int NUMBER_OF_POINTS;
+extern int DIMENSIONS;
+extern char* POINTS_FILENAME;
+extern char* LABELS_FILENAME;
+extern parameters params;
+extern cudaDeviceProp device_properties;
+
 //Function get_args parses command line arguments.
-void get_args(int argc, char **argv);
+void get_args(int argc, char **argv, parameters *params);

 //Function init reads the dataset and label arrays from the corresponding files.
-void init(double ***vectors, char **labels, parameters *params);
+void init(double ***vectors, char **labels);
+
+void set_Gpu();

 //Function meanshift recursively shifts original points according to th
 //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user