diff --git a/mean_shift_cuda/Makefile b/mean_shift_cuda/Makefile
index b244f3d..b2fb2c8 100644
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@@ -3,13 +3,19 @@ SHELL := /bin/bash
 # ============================================
 # COMMANDS
 
-CC = /usr/local/cuda/bin/nvcc
-RM = rm -f
+CC = nvcc
 HOST_COMPILER = -ccbin gcc
-CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets
+CUDA_FLAGS = -arch=sm_21 -Wno-deprecated-gpu-targets 
+C_FLAGS = -lm -O3 -I.
+
+COMPILE_FLAGS = $(HOST_COMPILER) -x cu $(CUDA_FLAGS) -dc $(C_FLAGS)
+LINK_FLAGS = $(HOST_COMPILER) $(CUDA_FLAGS) $(C_FLAGS)
+
 OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
 DEPS = meanshift_utils.h meanshift_kernels.h
 
+RM = rm -f
+
 # ==========================================
 # TARGETS
 
@@ -23,7 +29,7 @@ all: $(EXECUTABLES)
 # DEPENDENCIES (HEADERS)
 
 %.o: %.cu $(DEPS)
-	$(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@
+	$(CC) $(COMPILE_FLAGS) $< -o $@
 
 .PRECIOUS: $(EXECUTABLES) $(OBJ)
 
@@ -31,7 +37,7 @@ all: $(EXECUTABLES)
 # EXECUTABLE (MAIN)
 
 $(EXECUTABLES): $(OBJ)
-	$(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@
+	$(CC) $(LINK_FLAGS) $(OBJ) -o $@
 
 clean:
 	$(RM) *.o *~ $(EXECUTABLES)
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift
index 44369bc..02e8444 100644
Binary files a/mean_shift_cuda/meanshift and b/mean_shift_cuda/meanshift differ
diff --git a/mean_shift_cuda/meanshift.cu b/mean_shift_cuda/meanshift.cu
index f312238..778d081 100644
--- a/mean_shift_cuda/meanshift.cu
+++ b/mean_shift_cuda/meanshift.cu
@@ -9,30 +9,38 @@ int NUMBER_OF_POINTS = 600;
 int DIMENSIONS = 2;
 char* POINTS_FILENAME = "../data/X.bin";
 char* LABELS_FILENAME = "../data/L.bin";
+parameters params;
 
 struct timeval startwtime, endwtime;
 double seq_time;
 
 int main(int argc, char **argv){
+    int iterations;
     double **vectors, **shifted_points;
     char *labels;
-    parameters params;
 
-    //get_args(argc, argv); commented out while in development
-    init(&vectors, &labels, &params);
+    params.epsilon = 0.0001;
+    params.verbose = true;
+    params.display = true;
+
+    //get_args(argc, argv, &params); //commented out while in development
+    init(&vectors, &labels);
 
     //save_matrix(vectors, 0);
 
     // tic
     gettimeofday (&startwtime, NULL);
 
-    int iterations = meanshift(vectors, &shifted_points, DEVIATION, &params);
-    printf("Total iterations = %d\n", iterations);
+    iterations = meanshift(vectors, &shifted_points, DEVIATION, &params);
 
     // toc
     gettimeofday (&endwtime, NULL);
     seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6 + endwtime.tv_sec - startwtime.tv_sec);
-    printf("%s wall clock time = %f\n","Mean Shift", seq_time);
+    
+    if (params.verbose){
+        printf("\nTotal number of iterations = %d\n", iterations);
+        printf("%s wall clock time = %f\n","Mean Shift", seq_time);
+    }
 
     //TODO write output points to file -> plot later
     //save_matrix(shifted_points, iterations);
diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
index 69ebe70..ff284b8 100644
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -5,8 +5,8 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
     // Each thread computes one element of output
     // by accumulating results into cell_value
     double cell_value = 0;
-    int col = blockIdx.y * blockDim.y + threadIdx.y;
     int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (row + col < output.height * output.width){
         for (int element_index = 0; element_index < matrix1.width; ++element_index){
diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu
index 6b2f30e..754c5e5 100644
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@@ -8,28 +8,66 @@
 #include "meanshift_kernels.h"
 
 #define OUTPUT_PREFIX "../output/output_"
-#define BLOCK_SIZE 8
-
-void get_args(int argc, char **argv){
-    if (argc != 6) {
-        printf("Usage: %s h N D Pd Pl\nwhere:\n", argv[0]);
-        printf("\th is the variance\n");
-        printf("\tN is the the number of points\n");
-        printf("\tD is the number of dimensions of each point\n");
-        printf("\tPd is the path of the dataset file\n");
-        printf("\tPl is the path of the labels file\n");
+int BLOCK_SIZE = 16;
+
+cudaDeviceProp device_properties;
+
+void get_args(int argc, char **argv, parameters *params){
+    if (argc < 7) {
+        printf("Usage: %s h e N D Pd Pl\nwhere:\n"
+        "\th is the variance\n"
+        "\te is the min distance, between two points, that is taken into account in computations\n"
+        "\tN is the the number of points\n"
+        "\tD is the number of dimensions of each point\n"
+        "\tPd is the path of the dataset file\n"
+        "\tPl is the path of the labels file\n"
+        "\n\t--verbose | -v is an optional flag to enable execution information output"
+        "\n\t--output | -o is an optional flag to enable points output in each iteration", argv[0]);
         exit(1);
     }
 
     DEVIATION = atoi(argv[1]);
-    NUMBER_OF_POINTS = atoi(argv[2]);
-    DIMENSIONS = atoi(argv[3]);
-    POINTS_FILENAME = argv[4];
-    LABELS_FILENAME = argv[5];
+    params->epsilon = atof(argv[2]);
+    NUMBER_OF_POINTS = atoi(argv[3]);
+    DIMENSIONS = atoi(argv[4]);
+    POINTS_FILENAME = argv[5];
+    LABELS_FILENAME = argv[6];
+    params->verbose = false;
+    params->display = false;
+
+    if (argc > 7){
+        for (int index=7; index<argc; ++index){
+            if (!strcmp(argv[index], "--verbose") || !strcmp(argv[index], "-v")){
+                params->verbose = true;
+            } else if (!strcmp(argv[index], "--output") || !strcmp(argv[index], "-o")){
+                params->display = true;
+            } else {
+                printf("Couldn't parse argument %d: %s\n", index, argv[index]);
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+
+    /*printf("DEVIATION = %d\n"
+        "epsilon = %f\n"
+        "NUMBER_OF_POINTS = %d\n"
+        "DIMENSIONS = %d\n"
+        "POINTS_FILENAME = %s\n"
+        "LABELS_FILENAME = %s\n"
+        "verbose = %d\n"
+        "display = %d\n", DEVIATION, params->epsilon, NUMBER_OF_POINTS, DIMENSIONS, POINTS_FILENAME
+            , LABELS_FILENAME, params->verbose, params->display);*/
 }
 
-void init(double ***vectors, char **labels, parameters *params){
+void init(double ***vectors, char **labels){
     int bytes_read = 0;
+
+    set_Gpu();
+
+    if (params.verbose){
+        printf("Reading dataset and labels...\n");
+    }
+
     // initializes vectors
     FILE *points_file;
     points_file = fopen(POINTS_FILENAME, "rb");
@@ -83,10 +121,45 @@ void init(double ***vectors, char **labels, parameters *params){
     }
     fclose(labels_file);
 
-    // MEAN SHIFT OPTIONS
-    params->epsilon = 0.0001;
-    params->verbose = false;
-    params->display = false;
+    if (params.verbose){
+        printf("Done.\n\n");
+    }
+}
+
+//Based on https://stackoverflow.com/a/28113186
+//Poio psagmeno link https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Choosing_a_GPU
+void set_Gpu(){
+    int devices_count = 0, max_multiprocessors = 0, max_device = 0;
+
+    // gets devices count checking for errors like no devices or no drivers to check for
+    // devices available
+    gpuErrchk( cudaGetDeviceCount(&devices_count) );
+    for(int device_index = 0; device_index < devices_count; ++device_index){
+        // gets current index device's properties
+        cudaDeviceProp this_device_properties;
+        gpuErrchk( cudaGetDeviceProperties(&this_device_properties, device_index) );
+
+        // stores best available device's index
+        // only devices with compute capability >= 2.0 are able to run the code
+        if (max_multiprocessors < this_device_properties.multiProcessorCount
+            && this_device_properties.major >= 2 && this_device_properties.minor >= 0){
+            // stores devices properties for later use
+            device_properties = this_device_properties;
+            max_multiprocessors = this_device_properties.multiProcessorCount;
+            max_device = device_index;
+        }
+    }
+    // sets the device
+    gpuErrchk( cudaSetDevice(max_device) );
+    BLOCK_SIZE = device_properties.maxThreadsPerBlock;
+    if (params.verbose){
+        printf("Device chosen is \"%s\"\n"
+            "Device has %d multi processors and compute capability %d.%d\n"
+            "Setting BLOCK_SIZE to max threads per block supported (%d)\n\n"
+            , device_properties.name
+            , device_properties.multiProcessorCount, device_properties.major, device_properties.minor
+            , BLOCK_SIZE);
+    }
 }
 
 int meanshift(double **original_points, double ***shifted_points, int deviation
@@ -157,11 +230,15 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
     // updates shifted points pointer to the new array address
     shifted_points = &new_shift;
 
-    save_matrix((*shifted_points), iteration);
+    if (params.display){
+        save_matrix((*shifted_points), iteration);
+    }
 
     // calculates norm of the new mean shift vector
     double current_norm = norm(mean_shift_vector, NUMBER_OF_POINTS, DIMENSIONS);
-    printf("Iteration n. %d, error %f \n", iteration, current_norm);
+    if (params.verbose){
+        printf("Iteration n. %d, error %f \n", iteration, current_norm);
+    }
 
     /** iterates until convergence **/
     if (current_norm > opt->epsilon) {
@@ -170,7 +247,7 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
     }
 
     if (iteration == 0){
-        // cleans up this iteration's allocations
+        // cleans up allocations
         free(mean_shift_vector[0]);
         free(mean_shift_vector);
         free(kernel_matrix[0]);
@@ -195,6 +272,8 @@ double norm(double **matrix, int rows, int cols){
 }
 
 void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
+    static bool firstIter = true;
+
 	// allocates memory for kernel_matrix in GPU and copies the array
 	Matrix d_kernel_matrix;
     d_kernel_matrix.width = NUMBER_OF_POINTS;
@@ -220,11 +299,16 @@ void multiply(double **kernel_matrix, double **original_points, double ***new_sh
     size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
     gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
 
-    //dim3 dimBlock(16, 16);
-    //dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
+    dim3 dimBlock((d_new_shift.height < sqrt(BLOCK_SIZE)) ? d_new_shift.height : sqrt(BLOCK_SIZE)
+        , (d_new_shift.width < sqrt(BLOCK_SIZE)) ? d_new_shift.width : sqrt(BLOCK_SIZE));
+    dim3 dimGrid((d_new_shift.height + dimBlock.x - 1) / dimBlock.x
+        , (d_new_shift.width + dimBlock.y - 1) / dimBlock.y);
 
-    dim3 dimBlock(10, 2);
-    dim3 dimGrid(60, 1);
+    if (firstIter && params.verbose){
+        printf("dimBlock.x = %d, dimBlock.y = %d\n", dimBlock.x, dimBlock.y);
+        printf("dimGrid.x = %d, dimGrid.y = %d\n\n", dimGrid.x, dimGrid.y);
+        firstIter = false;
+    }
 
     multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
     gpuErrchk( cudaPeekAtLastError() );
diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h
index a09de91..ae15d02 100644
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@@ -14,23 +14,29 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
    }
 }
 
-extern int DEVIATION;
-extern int NUMBER_OF_POINTS;
-extern int DIMENSIONS;
-extern char* POINTS_FILENAME;
-extern char* LABELS_FILENAME;
-
+/*        Structs       */
 typedef struct parameters {
     double epsilon;
     bool verbose;
     bool display;
 } parameters;
 
+/*        Global variables        */
+extern int DEVIATION;
+extern int NUMBER_OF_POINTS;
+extern int DIMENSIONS;
+extern char* POINTS_FILENAME;
+extern char* LABELS_FILENAME;
+extern parameters params;
+extern cudaDeviceProp device_properties;
+
 //Function get_args parses command line arguments.
-void get_args(int argc, char **argv);
+void get_args(int argc, char **argv, parameters *params);
 
 //Function init reads the dataset and label arrays from the corresponding files.
-void init(double ***vectors, char **labels, parameters *params);
+void init(double ***vectors, char **labels);
+
+void set_Gpu();
 
 //Function meanshift recursively shifts original points according to th
 //mean-shift algorithm saving the result to shiftedPoints. Struct opt has user