authParallelAndDistributedS.../mean_shift_cuda/meanshift_kernels.cu


								#include "meanshift_kernels.h"

								#include <stdio.h>


								__global__ void calculate_kernel_matrix_kernel(Matrix shifted_points, Matrix original_points,

								    double deviation, Matrix kernel_matrix){

								    // each thread calculates one element of kernel_matrix

								    int row = blockIdx.x * blockDim.x + threadIdx.x;

								    int col = blockIdx.y * blockDim.y + threadIdx.y;


								    // performs calculations only if thread's indexes are within matrix bounds

								    if (row * kernel_matrix.width + col >= kernel_matrix.width * kernel_matrix.height){

								        return;

								    }


								    int dimensions = shifted_points.width;

								    // calculate distance

								    double sum = 0, dif;

								    for (int i=0; i<dimensions; i++){

								        dif = shifted_points.elements[row * dimensions + i]

								            - original_points.elements[col * dimensions + i];

								        sum += dif * dif;

								    }

								    double distance = sqrt(sum);


								    double deviation_square = deviation*deviation;

								    if (distance < deviation_square){

								        // computes kernel matrix

								        double pow = ((-1)*(distance * distance))/(2*(deviation_square));

								        kernel_matrix.elements[row * kernel_matrix.width + col] = exp(pow);

								    } else {

								        kernel_matrix.elements[row * kernel_matrix.width + col] = 0;

								    }

								    if (row == col){

								        kernel_matrix.elements[row * kernel_matrix.width + col] += 1;

								    }

								}


								__global__ void denominator_kernel(Matrix denominator, Matrix kernel_matrix){

								    // each thread computes one element of denominator_kernel

								    // by accumulating results into cell_value

								    double cell_value = 0;

								    int row = blockIdx.x * blockDim.x + threadIdx.x;


								    // performs calculations only if thread's indexes are within matrix bounds

								    if (row >= denominator.height){

								        return;

								    }


								    for (int column = 0; column < kernel_matrix.width; ++column){

								         cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];

								    }

								    denominator.elements[row] = cell_value;

								}


								__global__ void shift_points_kernel(Matrix original_points, Matrix kernel_matrix,

								    Matrix shifted_points, Matrix new_shift, Matrix denominator, Matrix mean_shift_vector){

								    // each thread computes one element of new_shift

								    // by accumulating results into cell_value

								    double cell_value = 0;

								    int row = blockIdx.x * blockDim.x + threadIdx.x;

								    int col = blockIdx.y * blockDim.y + threadIdx.y;


								    // performs calculations only if thread's indexes are within matrix bounds

								    if (row * new_shift.width + col >= new_shift.width * new_shift.height){

								        return;

								    }


								    // calculates new_shift

								    // builds nominator by multiplying kernel_matrix and original_points

								    for (int element_index = 0; element_index < kernel_matrix.width; ++element_index){

								        cell_value += kernel_matrix.elements[row * kernel_matrix.width + element_index]

								            * original_points.elements[element_index * original_points.width + col];

								    }

								    // new_shift elements are calculated by dividing with the denominator

								    new_shift.elements[row * new_shift.width + col] =

								        cell_value / denominator.elements[row];


								    // calculates mean-shift vector

								    mean_shift_vector.elements[row * new_shift.width + col] =

								        new_shift.elements[row * new_shift.width + col] -

								        shifted_points.elements[row * new_shift.width + col];

								}


								__global__ void norm(Matrix mean_shift_vector, double *current_norm) {

								    // each thread computes one element of new_shift

								    // by accumulating results into cell_value

								    double cell_value = 0;

								    int row = blockIdx.x * blockDim.x + threadIdx.x;


								    // performs calculations only if thread's indexes are within matrix bounds

								    if (row >= mean_shift_vector.height){

								        return;

								    }


								//    for (int column = 0; column < kernel_matrix.width; ++column){

								//        cell_value += kernel_matrix.elements[row * kernel_matrix.width + column];

								//    }

								    // performs calculations only if thread's indexes are within matrix bounds

								//    if (row * mean_shift_vector.width + col >= mean_shift_vector.width * mean_shift_vector.height){

								//        return;

								//    }


								    for (int element_index = 0; element_index < mean_shift_vector.width; ++element_index){

								        cell_value += mean_shift_vector.elements[row * mean_shift_vector.width + element_index]

								                      * mean_shift_vector.elements[row * mean_shift_vector.width + element_index];

								    }


								    *current_norm = sqrt(cell_value);


								//    // new_shift elements are calculated by dividing with the denominator

								//    new_shift.elements[row * new_shift.width + col] =

								//            cell_value / denominator.elements[row];

								//

								//    // calculates mean-shift vector

								//    mean_shift_vector.elements[row * new_shift.width + col] =

								//            new_shift.elements[row * new_shift.width + col] -

								//            shifted_points.elements[row * new_shift.width + col];


								//    int n_tid = 2 * (threadIdx.x + blockIdx.x * blockDim.x);

								//    int i = 1;

								//    int initial_tid = n_tid / 2;

								//    int limit = gridDim.x * blockDim.x;


								//    int block_end = 2 * (blockIdx.x * blockDim.x + blockDim.x) - 1;

								//

								//    if (n_tid < (2 * limit)){

								//

								//        while ( (i < (2 * blockDim.x)) && n_tid < block_end &&

								//                (n_tid + i) <= block_end){

								//

								//            norms[n_tid] += norms[n_tid + i];

								//            n_tid = n_tid + i * (initial_tid * 2 - 2 * (blockIdx.x * blockDim.x));

								//            i *= 2;

								//            __syncthreads();

								//        }

								//

								//

								//        if (!((initial_tid) % blockDim.x))

								//            norm_per_block[blockIdx.x] = norms[n_tid];

								//

								//    }

								}