diff --git a/classifier/classification_model_training/model_training.py b/classifier/classification_model_training/model_training.py new file mode 100644 index 0000000..786cdcc --- /dev/null +++ b/classifier/classification_model_training/model_training.py @@ -0,0 +1,79 @@ +import numpy as np + +class bcolors: + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + ENDC = '\033[0m' + +# def arrayFromJSON(JSONPath): + +# Prints a nice message to let the user know the module was imported +print(bcolors.BLUE + 'model_training loaded' + bcolors.ENDC) + +# Enables executing the module as a standalone script +if __name__ == "__main__": + import sys + dataset = np.load(sys.argv[1] + 'dataset.npy') + target = np.load(sys.argv[1] + 'target.npy') + featureKeys = np.load(sys.argv[1] + 'featureKeys.npy') + + row_idx = np.r_[0:10956, 13696:24653] + trainingSet = np.copy(dataset[row_idx, :]) + trainingTarget = np.copy(target[row_idx]) + + row_idx = np.r_[10956:13696, 24653:27392] + testSet = np.copy(dataset[row_idx, :]) + testTarget = np.copy(target[row_idx]) + + # ========================================================================== + + # SVM training + from sklearn.svm import SVC + print('Training...') + clf = SVC(gamma='scale') + clf.fit(trainingSet, trainingTarget) + print('Testing...') + print(clf.score(testSet, testTarget)) + + # Without preprocessing => 0.4999087424712539 + # With standardization => 0.8906734805621463 + # With normalization => 0.4999087424712539 + # With stand. then norm. => 0.7873699580215368 + # With varReducedDataset + stand. => 0.8826428180324877 + # With perReducedDataset + stand. => 0.81529476181785 + + # With varReducedDataset + stand. + gamma = scale => 0.8828253330899799 + # With varReducedDataset + stand. + sigmoid kernel => 0.5875159700675305 + # With varReducedDataset + stand.
+ poly kernel dgr 5 => 0.8441321409016244 + + # Decision tree + from sklearn import tree + print('Training...') + clf = tree.DecisionTreeClassifier() + clf.fit(trainingSet, trainingTarget) + print('Testing...') + print(clf.score(testSet, testTarget)) + + # With varReducedDataset + stand. => 0.7541522175579485 + + # Multi-layer Perceptron + from sklearn.neural_network import MLPClassifier + print('Training...') + clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=2) + clf.fit(trainingSet, trainingTarget) + print('Testing...') + print(clf.score(testSet, testTarget)) + + # With varReducedDataset + stand. and rndState = 2 => 0.8647563423982478 + + # Naive Bayes + from sklearn.naive_bayes import GaussianNB + print('Training...') + clf = GaussianNB() + clf.fit(trainingSet, trainingTarget) + print('Testing...') + print(clf.score(testSet, testTarget)) + + # With varReducedDataset + stand. => 0.6557766015696295 \ No newline at end of file diff --git a/classifier/feature_extraction/feature_extractor.py b/classifier/feature_extraction/feature_extractor.py index 131542c..e01c4ee 100644 --- a/classifier/feature_extraction/feature_extractor.py +++ b/classifier/feature_extraction/feature_extractor.py @@ -60,6 +60,7 @@ def extractFeatures(audioPath, outputPath, sampleRate): frameHFC = hfc(frameSpectrum) frameSComp = spcComp(frameSpectrum) + # Computes cepstral features # Discards the bands mfcc_coeffs = mfcc(frameSpectrum)[1] diff --git a/classifier/preprocessing/data_preprocessing.py b/classifier/preprocessing/data_preprocessing.py index 07acacc..40477b4 100644 --- a/classifier/preprocessing/data_preprocessing.py +++ b/classifier/preprocessing/data_preprocessing.py @@ -71,6 +71,7 @@ def featureSelection(dataset, target, featureKeys): selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold))) varReducedDataset = selector.fit_transform(dataset) isRetained = selector.get_support() + varReducedFeatureKeys = 
featureKeys[isRetained] print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC) for index, retain in enumerate(isRetained): @@ -84,35 +85,36 @@ def featureSelection(dataset, target, featureKeys): print('\n') # Selects features based on univariate statistical tests - from sklearn.datasets import load_digits - from sklearn.feature_selection import SelectPercentile, mutual_info_classif - - print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC) - percentileSelector = SelectPercentile(mutual_info_classif, percentile=33) - perReducedDataset = percentileSelector.fit_transform(dataset, target) - isRetained = percentileSelector.get_support() - - print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC) - for index, score in enumerate(percentileSelector.scores_): - print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True) - if index%2: - print('') - print('') - - print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC) - for index, retain in enumerate(isRetained): - if retain and index < featureKeys.size: - print(featureKeys[index], end='\t', flush=True) - - print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC) - for index, retain in enumerate(isRetained): - if not retain and index < featureKeys.size: - print(featureKeys[index], end='\t', flush=True) - print('\n') + # from sklearn.datasets import load_digits + # from sklearn.feature_selection import SelectPercentile, mutual_info_classif + + # print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC) + # percentileSelector = SelectPercentile(mutual_info_classif, percentile=33) + # perReducedDataset = percentileSelector.fit_transform(dataset, target) + # isRetained = percentileSelector.get_support() + # perReducedFeatureKeys = featureKeys[isRetained] + + # print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC) + # for index, score in enumerate(percentileSelector.scores_): + # print(featureKeys[index] + ' => ' + 
str(score), end='\t\t', flush=True) + # if index%2: + # print('') + # print('') + + # print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC) + # for index, retain in enumerate(isRetained): + # if retain and index < featureKeys.size: + # print(featureKeys[index], end='\t', flush=True) + + # print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC) + # for index, retain in enumerate(isRetained): + # if not retain and index < featureKeys.size: + # print(featureKeys[index], end='\t', flush=True) + # print('\n') # TODO: change the return value after the values of the parameters are decided # and the feature selection is complete - return dataset + return varReducedDataset, varReducedFeatureKeys # Details about this part can be found in the link bellow: # https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing @@ -129,7 +131,7 @@ def standardization(dataset): # TODO: change the return value after the values of the parameters are decided # and the feature selection is complete - return dataset + return scaledDataset # Details about this part can be found in the link bellow: # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA @@ -151,8 +153,10 @@ print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC) if __name__ == "__main__": import sys dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2]) - PCA(standardization(featureSelection(dataset, target, featureKeys))) + dataset, featureKeys = featureSelection(dataset, target, featureKeys) + newDataset = PCA(standardization(dataset)) + print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC) - np.save('dataset.npy', dataset) + np.save('dataset.npy', newDataset) np.save('target.npy', target) np.save('featureKeys.npy', featureKeys) \ No newline at end of file diff --git a/classifier/preprocessing/dataset.npy b/classifier/preprocessing/dataset.npy index e6cd7cf..beba82d 100644 Binary files 
a/classifier/preprocessing/dataset.npy and b/classifier/preprocessing/dataset.npy differ diff --git a/classifier/preprocessing/featureKeys.npy b/classifier/preprocessing/featureKeys.npy index 92335c3..aa19f66 100644 Binary files a/classifier/preprocessing/featureKeys.npy and b/classifier/preprocessing/featureKeys.npy differ