diff --git a/cuda_helpers.cu b/cuda_helpers.cu
new file mode 100644
index 0000000..06433d8
--- /dev/null
+++ b/cuda_helpers.cu
@@ -0,0 +1,26 @@
+// Pitched 2D allocation example, adapted from the CUDA C Programming Guide.
+
+// Device code
+__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height) {
+    // Rows are pitch-aligned: step by the byte pitch, not by width*sizeof(float).
+    for (int r = 0; r < height; ++r) {
+        float* row = (float*)((char*)devPtr + r * pitch);
+        for (int c = 0; c < width; ++c) {
+            float element = row[c];
+        }
+    }
+}
+
+// Host code
+void pitchedAllocExample() {
+    int width = 64, height = 64;
+    float* devPtr;
+    size_t pitch;
+    // pitch is returned in bytes and may exceed width*sizeof(float) for alignment.
+    cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
+    MyKernel<<<100, 512>>>(devPtr, pitch, width, height);
+    cudaFree(devPtr);
+}
+
+// Read more at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#ixzz54kTh80mZ
+// Follow us: @GPUComputing on Twitter | NVIDIA on Facebook
diff --git a/mean-shift.cu b/mean-shift.cu
index a14d30a..916c94a 100644
--- a/mean-shift.cu
+++ b/mean-shift.cu
@@ -131,6 +131,7 @@ void meanshift(double **x, int h, struct parameters *opt){
 
 	//Copy vectors from host memory to device memory
 	cudaMemcpy(d_y, y, ROWS * COLUMNS * sizeof(double), cudaMemcpyHostToDevice);
+	// y[i][j] == d_y[COLUMNS*i + j]
 	cudaMemcpy(d_m, m, ROWS * COLUMNS * sizeof(double), cudaMemcpyHostToDevice);
 
@@ -280,8 +281,9 @@ void print_matrix(double ** array, int rows, int cols){
 	}
 }
 
-__global__ void iteration (double norm, double epsilon){
+__global__ void iteration (double* W, double epsilon){
 	// TODO check if they also need cudamalloc
+	// todo find how to keep counter
 	int iter;
 	int i = blockDim.x * blockIdx.x + threadIdx.x;
 	int j = blockDim.x * blockIdx.x + threadIdx.x;