diff --git a/mean_shift_cuda/Makefile b/mean_shift_cuda/Makefile
index a931e40..b244f3d 100644
--- a/mean_shift_cuda/Makefile
+++ b/mean_shift_cuda/Makefile
@@ -5,9 +5,10 @@ SHELL := /bin/bash
 
 CC = /usr/local/cuda/bin/nvcc
 RM = rm -f
-CFLAGS= -arch=sm_21 -lm -O0 -I.
-OBJ=meanshift.o meanshift_utils.o meanshift_kernels.o
-DEPS=meanshift_utils.h meanshift_kernels.h
+HOST_COMPILER = -ccbin gcc
+CFLAGS= -arch=sm_21 -lm -O0 -I. -Wno-deprecated-gpu-targets
+OBJ = meanshift.o meanshift_utils.o meanshift_kernels.o
+DEPS = meanshift_utils.h meanshift_kernels.h
 
 # ==========================================
 # TARGETS
@@ -22,7 +23,7 @@ all: $(EXECUTABLES)
 # DEPENDENCIES (HEADERS)
 
 %.o: %.cu $(DEPS)
-	$(CC) -x cu $(CFLAGS) -dc $< -o $@
+	$(CC) $(HOST_COMPILER) -x cu $(CFLAGS) -dc $< -o $@
 
 .PRECIOUS: $(EXECUTABLES) $(OBJ)
 
@@ -30,7 +31,7 @@ all: $(EXECUTABLES)
 # EXECUTABLE (MAIN)
 
 $(EXECUTABLES): $(OBJ)
-	$(CC) $(CFLAGS) $(OBJ) -o $@
+	$(CC) $(HOST_COMPILER) $(CFLAGS) $(OBJ) -o $@
 
 clean:
 	$(RM) *.o *~ $(EXECUTABLES)
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift b/mean_shift_cuda/meanshift
new file mode 100644
index 0000000..44369bc
Binary files /dev/null and b/mean_shift_cuda/meanshift differ
diff --git a/mean_shift_cuda/meanshift_kernels.cu b/mean_shift_cuda/meanshift_kernels.cu
index 79fb49a..69ebe70 100644
--- a/mean_shift_cuda/meanshift_kernels.cu
+++ b/mean_shift_cuda/meanshift_kernels.cu
@@ -5,15 +5,14 @@ __global__ void multiply_kernel(Matrix matrix1, Matrix matrix2, Matrix output){
     // Each thread computes one element of output
     // by accumulating results into cell_value
     double cell_value = 0;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (row < output.height && col < output.width){
+    if (row + col < output.height * output.width){
         for (int element_index = 0; element_index < matrix1.width; ++element_index){
             cell_value += matrix1.elements[row * matrix1.width + element_index]
                 * matrix2.elements[element_index * matrix2.width + col];
         }
-        printf("%f\n", cell_value);
         output.elements[row * output.width + col] = cell_value;
     }
 }
\ No newline at end of file
diff --git a/mean_shift_cuda/meanshift_utils.cu b/mean_shift_cuda/meanshift_utils.cu
index e791067..6b2f30e 100644
--- a/mean_shift_cuda/meanshift_utils.cu
+++ b/mean_shift_cuda/meanshift_utils.cu
@@ -8,7 +8,7 @@
 #include "meanshift_kernels.h"
 
 #define OUTPUT_PREFIX "../output/output_"
-#define BLOCK_SIZE 16
+#define BLOCK_SIZE 8
 
 void get_args(int argc, char **argv){
     if (argc != 6) {
@@ -139,45 +139,9 @@ int meanshift(double **original_points, double ***shifted_points, int deviation
 
     // creates new y vector
     double **new_shift = alloc_2d_double(NUMBER_OF_POINTS, DIMENSIONS);
-//==============================================================================
 
     // builds nominator
-    /*multiply(kernel_matrix, original_points, new_shift);*/
-
-    Matrix d_kernel_matrix;
-    d_kernel_matrix.width = NUMBER_OF_POINTS;
-    d_kernel_matrix.height = NUMBER_OF_POINTS;
-    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
-    cudaMalloc(&d_kernel_matrix.elements, size);
-    cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0]), size, cudaMemcpyHostToDevice);
-
-    Matrix d_original_points;
-    d_original_points.width = DIMENSIONS;
-    d_original_points.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    cudaMalloc(&d_original_points.elements, size);
-    cudaMemcpy(d_original_points.elements, &(original_points[0][0]), size, cudaMemcpyHostToDevice);
-
-    Matrix d_new_shift;
-    d_new_shift.width = DIMENSIONS;
-    d_new_shift.height = NUMBER_OF_POINTS;
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    cudaMalloc(&d_new_shift.elements, size);
-
-    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
-    dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
-
-    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points
-        , d_new_shift);
-
-    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
-    cudaMemcpy(&(new_shift[0][0]), d_new_shift.elements, size, cudaMemcpyDeviceToHost);
-
-    cudaFree(d_kernel_matrix.elements);
-    cudaFree(d_original_points.elements);
-    cudaFree(d_new_shift.elements);
-
-//==============================================================================
+    multiply(kernel_matrix, original_points, &new_shift);
 
     // divides element-wise
     for (int i=0; i<NUMBER_OF_POINTS; i++){
@@ -230,6 +194,51 @@ double norm(double **matrix, int rows, int cols){
     return norm;
 }
 
+void multiply(double **kernel_matrix, double **original_points, double ***new_shift){
+	// allocates memory for kernel_matrix in GPU and copies the array
+	Matrix d_kernel_matrix;
+    d_kernel_matrix.width = NUMBER_OF_POINTS;
+    d_kernel_matrix.height = NUMBER_OF_POINTS;
+    int size = NUMBER_OF_POINTS * NUMBER_OF_POINTS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_kernel_matrix.elements, size) );
+    gpuErrchk( cudaMemcpy(d_kernel_matrix.elements, &(kernel_matrix[0][0])
+    	, size, cudaMemcpyHostToDevice) );
+
+    // allocates memory for original_points in GPU and copies the array
+    Matrix d_original_points;
+    d_original_points.width = DIMENSIONS;
+    d_original_points.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_original_points.elements, size) );
+    gpuErrchk( cudaMemcpy(d_original_points.elements, &(original_points[0][0])
+    	, size, cudaMemcpyHostToDevice) );
+
+	// allocates memory for new_shift in GPU
+    Matrix d_new_shift;
+    d_new_shift.width = DIMENSIONS;
+    d_new_shift.height = NUMBER_OF_POINTS;
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMalloc(&d_new_shift.elements, size) );
+
+    //dim3 dimBlock(16, 16);
+    //dim3 dimGrid(d_original_points.width / dimBlock.x, d_kernel_matrix.height / dimBlock.y);
+
+    dim3 dimBlock(10, 2);
+    dim3 dimGrid(60, 1);
+
+    multiply_kernel<<<dimGrid, dimBlock>>>(d_kernel_matrix, d_original_points, d_new_shift);
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaDeviceSynchronize() );
+
+    size = NUMBER_OF_POINTS * DIMENSIONS * sizeof(double);
+    gpuErrchk( cudaMemcpy(&((*new_shift)[0][0]), d_new_shift.elements
+    	, size, cudaMemcpyDeviceToHost) );
+
+    gpuErrchk( cudaFree(d_kernel_matrix.elements) );
+    gpuErrchk( cudaFree(d_original_points.elements) );
+    gpuErrchk( cudaFree(d_new_shift.elements) );
+}
+
 double calculateDistance(double *y, double *x){
     double sum = 0, dif;
     for (int i=0; i<DIMENSIONS; i++){
diff --git a/mean_shift_cuda/meanshift_utils.h b/mean_shift_cuda/meanshift_utils.h
index 9fa97dc..a09de91 100644
--- a/mean_shift_cuda/meanshift_utils.h
+++ b/mean_shift_cuda/meanshift_utils.h
@@ -3,6 +3,17 @@
 
 #include <stdbool.h>
 
+//GPU error check snippet taken from:
+//https://stackoverflow.com/a/14038590
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+   if (code != cudaSuccess) 
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
 extern int DEVIATION;
 extern int NUMBER_OF_POINTS;
 extern int DIMENSIONS;
@@ -30,6 +41,11 @@ int meanshift(double **original_points, double ***shifted_points, int h
 //Function norm returns the second norm of matrix of dimensions rowsXcols.
 double norm(double **matrix, int rows, int cols);
 
+//Function multiply allocates memory in GPU, sends the data and calls the 
+//multiply kernel function.
+void multiply(double **kernel_matrix, double **original_points
+	, double ***new_shift);
+
 //Function calculateDistance returns the distance between x and y vectors.
 double calculateDistance(double *y, double *x);