%% AUTHOR[1] : Apostolos Fanakis (8261)
%% EMAIL[1]  : apostolof@auth.gr
%% AUTHOR[2] : Charalampos Papadiakos (8302)
%% EMAIL[2]  : charaldp@ece.auth.gr
%% AUTHOR[3] : Hlektra Mitsi ()
%% EMAIL[3]  :
%% $DATE     : 28-December-2018 12:45:00 $
%% $Revision : 1.00 $
%% DEVELOPED : 9.0.0.341360 (R2016a)
%% FILENAME  : spike_sorting.m
%%
%% =================================================================================================
%% S.1
datasetMedians(8) = 0;
datasetFactors(8) = 0;

thresholdFactorInitValue = 3; % k starting value
thresholdFactorEndValue = 12; % k ending value
thresholdFactorStep = 0.01; % k jumping step
numberOfFactors = length(thresholdFactorInitValue:thresholdFactorStep:thresholdFactorEndValue);
numberOfSpikesPerFactor(numberOfFactors) = 0;

for fileIndex=1:8
    fprintf('Loading test dataset no. %d\n', fileIndex);
    filename = sprintf('dataset\\Data_Test_%d.mat', fileIndex);
    Dataset = load(filename);
    data = double(Dataset.data);
    
    %% Q.1.1
    figure();
    plot(data(1:10000));
    xlim([0, 10000]);
    title(['First 10000 samples of dataset #' num2str(fileIndex)]);
    xlabel('Sample #');
    ylabel('Trivial Unit'); %TODO: Is this mVolts?
    drawnow;
    
    %% Q.1.2
    dataMedian = median(abs(data)/0.6745);
    datasetMedians(fileIndex) = dataMedian;
    
    for factorIteration=1:numberOfFactors % runs for each k
        % builds threshold
        thresholdFactor = thresholdFactorInitValue + (factorIteration - 1) * thresholdFactorStep;
        threshold = thresholdFactor * dataMedian;
        
        % calculates number of spikes
        sample = 1;
        while sample <= length(data)
            if data(sample) >= threshold
                % spike found
                numberOfSpikesPerFactor(factorIteration) = numberOfSpikesPerFactor(factorIteration) + 1;
                
                % skips cheking until values are below threshold again
                while sample <= length(data)
                    sample = sample + 1;
                    if (data(sample) <= threshold)
                        break;
                    end
                end
            end
            sample = sample + 1;
        end
    end
    
    figure();
    % trims zeros
    numberOfSpikesTrimmed = numberOfSpikesPerFactor(1:find(numberOfSpikesPerFactor,1,'last'));
    endValue = thresholdFactorInitValue + thresholdFactorStep * (length(numberOfSpikesTrimmed) - 1);
    plot(thresholdFactorInitValue:thresholdFactorStep:endValue, numberOfSpikesTrimmed);
    title(['Number of spikes for different values of k for dataset #' num2str(fileIndex)]);
    xlabel('Threshold factor (k)');
    ylabel('Number of spikes');
    hold on;
    plot([thresholdFactorInitValue endValue], [double(Dataset.spikeNum), double(Dataset.spikeNum)]);
    xlim([thresholdFactorInitValue endValue]);
    drawnow;
    hold off;
    
    % finds dataset's theshold factor k that produces the closest number of
    % spikes ot the ground truth
    [minValue, closestIndex] = min(abs(numberOfSpikesTrimmed-Dataset.spikeNum));
    datasetFactors(fileIndex) = thresholdFactorInitValue + (closestIndex - 1) * thresholdFactorStep;
    
    clear Dataset
    clear data
end
fprintf('\n');

%% Q.1.3
figure();
plot(datasetMedians, datasetFactors, 'o');
title('Polynomial curve fitting on median-threshold factor value pairs');
xlabel('Dataset median');
ylabel('Threshold factor');
hold on;

empiricalRule = polyfit(datasetMedians, datasetFactors, 8);
visualizationX = linspace(0, 0.5, 50);
visualizationY = polyval(empiricalRule, visualizationX);
plot(visualizationX, visualizationY);
hold off

%% =================================================================================================
%% S.2
clearvars = {'closestIndex' 'datasetFactors' 'datasetMedians' 'endValue' 'minValue' ...
    'numberOfFactors' 'numberOfSpikesPerFactor' 'numberOfSpikesTrimmed' 'thresholdFactorEndValue' ...
    'thresholdFactorInitValue' 'thresholdFactorStep' 'visualizationX' 'visualizationY'};
clear(clearvars{:})
clear clearvars

for fileIndex=1:4
    fprintf('Loading evaluation dataset no. %d \n', fileIndex);
    filename = sprintf('dataset\\Data_Eval_E_%d.mat', fileIndex);
    Dataset = load(filename);
    data = double(Dataset.data);
    
    %% Q.2.1 and Q.2.2
    dataMedian = median(abs(data)/0.6745);
    
    %factorEstimation = polyval(empiricalRule, dataMedian);
    factorEstimation = 4;
    threshold = factorEstimation * dataMedian;
    numberOfSpikes = 0;
    spikesTimesEst(2500) = 0;
    spikesEst(2500, 64) = 0;
    
    figure();
    plot(data(1:10000));
    xlim([0, 10000]);
    title(['First 10000 samples of dataset #' num2str(fileIndex)]);
    xlabel('Sample #');
    ylabel('Trivial Unit'); %TODO: Is this mVolts?
    hold on;
    plot([1 10000], [threshold, threshold]);
    drawnow;
    
    % calculates number of spikes
    spikeStartIndex = 1;
    spikeEndIndex = 1;
    sample = 1;
    while sample <= length(data)
        if data(sample) >= threshold
            % spike found
            numberOfSpikes = numberOfSpikes + 1;
            spikeStartIndex = sample;
            
            % skips cheking until values are below threshold again
            while sample <= length(data)
                sample = sample + 1;
                if (data(sample) <= threshold)
                    % finds the index of the max sample for this spike
                    spikeEndIndex = sample;
                    [~, relativeMaxIndex] = max(data(spikeStartIndex:spikeEndIndex));
                    absoluteMaxIndex = spikeStartIndex - 1 + relativeMaxIndex;
                    
                    % defines an area of -41/+22 samples around the max
                    % and searches for the min
                    [~, relativeMinIndex] = min(data(absoluteMaxIndex-41:absoluteMaxIndex+22));
                    absoluteMinIndex = absoluteMaxIndex - 41 + relativeMinIndex;
                    
                    % discernes the extrema (minimum or maximum) that
                    % occurs first
                    firstIndex = min([absoluteMaxIndex absoluteMinIndex]);
                    
                    % Q.2.1
                    spikesTimesEst(numberOfSpikes) = firstIndex;
                    % Q.2.2
                    spikesEst(numberOfSpikes, :) = data(firstIndex-34:firstIndex+29);
                    break;
                end
            end
        end
        sample = sample + 1;
    end
    
    fprintf('%d spikes found for dataset #%d\n', numberOfSpikes, fileIndex);
    fprintf('actual number of spikes = %d\n', length(Dataset.spikeTimes));
    fprintf('diff = %d\n\n', numberOfSpikes - length(Dataset.spikeTimes));
    
    figure();
    hold on;
    for spike=1:numberOfSpikes
        plot(1:64, spikesEst(spike, :));
    end
    title(['Spikes of dataset #' num2str(fileIndex) ' aligned at the first extrema']);
    xlabel('Samples');
    ylabel('Trivial Unit');
    drawnow;
    hold off;
    
    %% Q.2.3
    realSpikeIndex = double(Dataset.spikeTimes);
    numberOfCorrectEstimations = 0;
    numberOfUndetectedSpikes = 0;
    averageEstimationError = 0;
    
    correctSpikes(2500, 64) = 0;
    spikeClass(2500) = 0;
    
    for trueSpikeIndex=1:length(realSpikeIndex)
        [estimationError, closestIndex] = min(abs(spikesTimesEst-realSpikeIndex(trueSpikeIndex)));
        if estimationError < 32
            numberOfCorrectEstimations = numberOfCorrectEstimations + 1;
            averageEstimationError = averageEstimationError + estimationError;
            
            correctSpikes(numberOfCorrectEstimations, :) = spikesEst(closestIndex, :);
            spikeClass(numberOfCorrectEstimations) = double(Dataset.spikeClass(trueSpikeIndex));
        else
            numberOfUndetectedSpikes = numberOfUndetectedSpikes + 1;
        end
    end
    
    averageEstimationError = averageEstimationError / numberOfSpikes;
    
    fprintf('Number of correct spike detections is %d\n', numberOfCorrectEstimations);
    if numberOfSpikes-numberOfCorrectEstimations > 0
        fprintf('Number of uncorrect spike detections is %d\n', ...
            numberOfSpikes - numberOfCorrectEstimations);
    end
    fprintf('Number of undetected spikes is %d\n', numberOfUndetectedSpikes);
    fprintf('Average error of spike index estimation is %.2f\n\n', averageEstimationError);
    
    %% Q.2.4
    features(numberOfCorrectEstimations, 7) = 0;
    pcaCoefficients = pca(correctSpikes, 'NumComponents', 2);
    
    for spike=1:numberOfCorrectEstimations
        % finds index of max
        [maxVal, features(spike, 1)] = max(correctSpikes(spike, :));
        % calculates Vpeak-to-peak
        features(spike, 2) = maxVal - min(correctSpikes(spike, :));
        % calculates ZCR
        features(spike, 3) = mean(abs(diff(sign(correctSpikes(spike, :)))));
        % calculates the signal energy
        features(spike, 4) = sum(correctSpikes(spike, :).^2);
        % calculates the fft of the signal
        asud = fft(correctSpikes(spike, :));
        features(spike, 5) = asud(1)^2;
        
        features(spike, 6) = correctSpikes(spike, :) * pcaCoefficients(:, 1);
        features(spike, 7) = correctSpikes(spike, :) * pcaCoefficients(:, 2);
    end
    
    figure();
    scatter(features(:, 1), features(:, 2));
    title(['Dataset #' num2str(fileIndex) ' feature plot']);
    xlabel('Index of max value');
    ylabel('Peak-to-peak amplitude');
    figure();
    scatter(features(:, 6), features(:, 7));
    title(['Dataset #' num2str(fileIndex) ' feature plot']);
    xlabel('PCA feature 1');
    ylabel('PCA feature 2');
    
    %% Q.2.5
    accuracy = MyClassify(features, spikeClass');
    fprintf('Accuracy achieved is %.2f%%\n\n', accuracy);
    
    % clustering using DB-SCAN algorithm
    % code for DB-SCAN downloaded from here:
    % https://www.peterkovesi.com/matlabfns/
    if fileIndex == 1
        distThreshold = 0.4;
        minPts = 20;
    elseif fileIndex == 2
        distThreshold = 0.15;
        minPts = 30;
    elseif fileIndex == 3
        distThreshold = 0.32;
        minPts = 30;
    else
        distThreshold = 0.36;
        minPts = 35;
    end
    
    [~, dbScanClasses, ~] = dbscan(features(:, 6:7)', distThreshold, minPts);
    % fixes classes enumeration
    dbScanClasses(dbScanClasses==1) = 7;
    dbScanClasses(dbScanClasses==3) = 1;
    dbScanClasses(dbScanClasses==7) = 3;
    
    figure();
    scatter(features(dbScanClasses == 0, 6), features(dbScanClasses == 0, 7), [], 'k', 'o');
    hold on;
    scatter(features(dbScanClasses == 1, 6), features(dbScanClasses == 1, 7), [], 'b', '*');
    scatter(features(dbScanClasses == 2, 6), features(dbScanClasses == 2, 7), [], 'r', '*');
    scatter(features(dbScanClasses == 3, 6), features(dbScanClasses == 3, 7), [], 'g', '*');
    title(['Dataset #' num2str(fileIndex) ' feature plot after clustering with DB-SCAN']);
    xlabel('PCA feature 1');
    ylabel('PCA feature 2');
    accuracy = classperf(spikeClass',dbScanClasses);
    fprintf('Accuracy achieved with DB-SCAN is %.2f%%\n\n', accuracy.CorrectRate*100);
    
    % hierarchical clustering
    distances = pdist(features(:, 6:7));
    linkages = linkage(distances, 'ward');
    hierarchicalClusters = cluster(linkages, 'maxclust', 3);
    % fixes classes enumeration
    hierarchicalClusters(hierarchicalClusters==1) = 7;
    hierarchicalClusters(hierarchicalClusters==2) = 1;
    hierarchicalClusters(hierarchicalClusters==7) = 2;
    
    figure();
    scatter(features(hierarchicalClusters == 0, 6), features(hierarchicalClusters == 0, 7), [], 'k', 'o');
    hold on;
    scatter(features(hierarchicalClusters == 2, 6), features(hierarchicalClusters == 2, 7), [], 'r', '*');
    scatter(features(hierarchicalClusters == 3, 6), features(hierarchicalClusters == 3, 7), [], 'g', '*');
    scatter(features(hierarchicalClusters == 1, 6), features(hierarchicalClusters == 1, 7), [], 'b', '*');
    title(['Dataset #' num2str(fileIndex) ' feature plot after clustering with hierarchical clustering']);
    xlabel('PCA feature 1');
    ylabel('PCA feature 2');
    accuracy = classperf(spikeClass',hierarchicalClusters);
    fprintf('Accuracy achieved with hierarchical clustering is %.2f%%\n\n', accuracy.CorrectRate*100);
    
    % clustering using kmeans algorithm
    rng(1); % For reproducibility
    kMeansClasses = kmeans(features(:, 6:7), 3);
    % fixes classes enumeration
    kMeansClasses(kMeansClasses==2) = 7;
    kMeansClasses(kMeansClasses==3) = 2;
    kMeansClasses(kMeansClasses==7) = 3;
    
    figure();
    scatter(features(kMeansClasses == 1, 6), features(kMeansClasses == 1, 7), [], 'b', '*');
    hold on;
    scatter(features(kMeansClasses == 2, 6), features(kMeansClasses == 2, 7), [], 'r', '*');
    scatter(features(kMeansClasses == 3, 6), features(kMeansClasses == 3, 7), [], 'g', '*');
    title(['Dataset #' num2str(fileIndex) ' feature plot after clustering with K-Means']);
    xlabel('PCA feature 1');
    ylabel('PCA feature 2');
    accuracy = classperf(spikeClass',kMeansClasses);
    fprintf('Accuracy achieved with K-Means is %.2f%%\n\n', accuracy.CorrectRate*100);
    
    clearvars = {'spikesTimesEst', 'spikesEst', 'data', 'features', 'realSpikeIndex', ...
        'correctSpikes', 'spikeClass', 'Data', 'Dataset', 'pcaCoefficients', 'accuracy', 'asud', ...
        'dbScanClasses', 'kMeansClasses'};
    clear(clearvars{:})
    clear clearvars
end