
Init models training, other minor fixes

Apostolos Fanakis committed 6 years ago · branch: master · commit 30bacf953f
Changed files:
  classifier/classification_model_training/model_training.py (79)
  classifier/feature_extraction/feature_extractor.py (1)
  classifier/preprocessing/data_preprocessing.py (62)
  classifier/preprocessing/dataset.npy (BIN)
  classifier/preprocessing/featureKeys.npy (BIN)

classifier/classification_model_training/model_training.py (79)

@@ -0,0 +1,79 @@
import numpy as np

class bcolors:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'

# def arrayFromJSON(JSONPath):

# Prints a nice message to let the user know the module was imported
print(bcolors.BLUE + 'model_training loaded' + bcolors.ENDC)

# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys

    dataset = np.load(sys.argv[1] + 'dataset.npy')
    target = np.load(sys.argv[1] + 'target.npy')
    featureKeys = np.load(sys.argv[1] + 'featureKeys.npy')

    # Splits the samples into fixed training and test sets by row index
    row_idx = np.r_[0:10956, 13696:24653]
    trainingSet = np.copy(dataset[row_idx, :])
    trainingTarget = np.copy(target[row_idx])

    row_idx = np.r_[10956:13696, 24653:27392]
    testSet = np.copy(dataset[row_idx, :])
    testTarget = np.copy(target[row_idx])

    # ==========================================================================
    # SVM training
    from sklearn.svm import SVC

    print('Training...')
    clf = SVC(gamma='scale')
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # Without preprocessing => 0.4999087424712539
    # With standardization => 0.8906734805621463
    # With normalization => 0.4999087424712539
    # With stand. then norm. => 0.7873699580215368
    # With varReducedDataset + stand. => 0.8826428180324877
    # With perReducedDataset + stand. => 0.81529476181785
    # With varReducedDataset + stand. + gamma = scale => 0.8828253330899799
    # With varReducedDataset + stand. + sigmoid kernel => 0.5875159700675305
    # With varReducedDataset + stand. + poly kernel, degree 5 => 0.8441321409016244

    # Decision tree
    from sklearn import tree

    print('Training...')
    clf = tree.DecisionTreeClassifier()
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # With varReducedDataset + stand. => 0.7541522175579485

    # Multi-layer Perceptron
    from sklearn.neural_network import MLPClassifier

    print('Training...')
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=2)
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # With varReducedDataset + stand. and random_state = 2 => 0.8647563423982478

    # Naive Bayes
    from sklearn.naive_bayes import GaussianNB

    print('Training...')
    clf = GaussianNB()
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # With varReducedDataset + stand. => 0.6557766015696295
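
All four classifiers above repeat the same fit/score pattern on one fixed split, so the comparison can be expressed more compactly. A minimal sketch of that loop, reusing only estimators and parameters already present in the script (the .npy paths are placeholders for the directory passed in sys.argv[1]):

import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

dataset = np.load('dataset.npy')   # placeholder paths; the script reads them
target = np.load('target.npy')     # from the directory given in sys.argv[1]

# Same index-based split as above: two contiguous row ranges per set
trainIdx = np.r_[0:10956, 13696:24653]
testIdx = np.r_[10956:13696, 24653:27392]

models = {
    'SVM': SVC(gamma='scale'),
    'Decision tree': DecisionTreeClassifier(),
    'MLP': MLPClassifier(solver='lbfgs', alpha=1e-5,
                         hidden_layer_sizes=(5, 3), random_state=2),
    'Naive Bayes': GaussianNB(),
}
for name, clf in models.items():
    clf.fit(dataset[trainIdx], target[trainIdx])
    print(name, '=>', clf.score(dataset[testIdx], target[testIdx]))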

classifier/feature_extraction/feature_extractor.py (1)

@@ -60,6 +60,7 @@ def extractFeatures(audioPath, outputPath, sampleRate):
         frameHFC = hfc(frameSpectrum)
         frameSComp = spcComp(frameSpectrum)
         # Computes cepstral features
+        # Discards the bands
         mfcc_coeffs = mfcc(frameSpectrum)[1]
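
The [1] on the mfcc call matches Essentia's MFCC algorithm, which returns a (bands, coefficients) pair per frame; only the coefficients are kept. A minimal standalone sketch, assuming the extractor is built on essentia.standard (the hfc/mfcc/spectral-complexity names suggest it) and using a hypothetical input file:

import essentia.standard as es

# Hypothetical path; in extractFeatures the file comes from audioPath
audio = es.MonoLoader(filename='track.wav', sampleRate=44100)()
window = es.Windowing(type='hann')
spectrum = es.Spectrum()
mfcc = es.MFCC()

frameSpectrum = spectrum(window(audio[0:1024]))
bands, coeffs = mfcc(frameSpectrum)   # bands are computed but discarded here
print(len(coeffs))                    # 13 coefficients by default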

classifier/preprocessing/data_preprocessing.py (62)

@@ -71,6 +71,7 @@ def featureSelection(dataset, target, featureKeys):
     selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold)))
     varReducedDataset = selector.fit_transform(dataset)
     isRetained = selector.get_support()
+    varReducedFeatureKeys = featureKeys[isRetained]
     print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
     for index, retain in enumerate(isRetained):
@@ -84,35 +85,36 @@ def featureSelection(dataset, target, featureKeys):
     print('\n')
     # Selects features based on univariate statistical tests
-    from sklearn.datasets import load_digits
-    from sklearn.feature_selection import SelectPercentile, mutual_info_classif
-    print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
-    percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
-    perReducedDataset = percentileSelector.fit_transform(dataset, target)
-    isRetained = percentileSelector.get_support()
-    print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
-    for index, score in enumerate(percentileSelector.scores_):
-        print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)
-        if index%2:
-            print('')
-    print('')
-    print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
-    for index, retain in enumerate(isRetained):
-        if retain and index < featureKeys.size:
-            print(featureKeys[index], end='\t', flush=True)
-    print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
-    for index, retain in enumerate(isRetained):
-        if not retain and index < featureKeys.size:
-            print(featureKeys[index], end='\t', flush=True)
-    print('\n')
+    # from sklearn.datasets import load_digits
+    # from sklearn.feature_selection import SelectPercentile, mutual_info_classif
+    # print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
+    # percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
+    # perReducedDataset = percentileSelector.fit_transform(dataset, target)
+    # isRetained = percentileSelector.get_support()
+    # perReducedFeatureKeys = featureKeys[isRetained]
+    # print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
+    # for index, score in enumerate(percentileSelector.scores_):
+    #     print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)
+    #     if index%2:
+    #         print('')
+    # print('')
+    # print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
+    # for index, retain in enumerate(isRetained):
+    #     if retain and index < featureKeys.size:
+    #         print(featureKeys[index], end='\t', flush=True)
+    # print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
+    # for index, retain in enumerate(isRetained):
+    #     if not retain and index < featureKeys.size:
+    #         print(featureKeys[index], end='\t', flush=True)
+    # print('\n')
     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
-    return dataset
+    return varReducedDataset, varReducedFeatureKeys
 # Details about this part can be found in the link below:
 # https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
@@ -129,7 +131,7 @@ def standardization(dataset):
     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
-    return dataset
+    return scaledDataset
 # Details about this part can be found in the link below:
 # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
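
For reference, the PCA step linked above reduces the standardized feature matrix. A minimal sketch of that combination using scikit-learn's PCA class directly (distinct from the module's own PCA helper); the random data and the 95% variance target are assumptions, not the project's settings:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

dataset = np.random.default_rng(0).normal(size=(100, 20))  # stand-in matrix

scaledDataset = StandardScaler().fit_transform(dataset)
pca = PCA(n_components=0.95)          # keep 95% of the variance (assumed)
newDataset = pca.fit_transform(scaledDataset)
print(newDataset.shape, pca.explained_variance_ratio_.sum())
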
@@ -151,8 +153,10 @@ print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)
 if __name__ == "__main__":
     import sys
     dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
-    PCA(standardization(featureSelection(dataset, target, featureKeys)))
+    dataset, featureKeys = featureSelection(dataset, target, featureKeys)
+    newDataset = PCA(standardization(dataset))
     print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)
-    np.save('dataset.npy', dataset)
+    np.save('dataset.npy', newDataset)
     np.save('target.npy', target)
     np.save('featureKeys.npy', featureKeys)
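
The new return values keep featureKeys.npy aligned with the columns that survive selection: featureSelection now drops columns from the dataset and the matching entries from the key array in one step. A minimal illustration of that pattern with toy data (the feature names are hypothetical):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

dataset = np.array([[0.0, 1.2, 5.0],
                    [0.0, 0.9, 2.0],
                    [0.0, 1.1, 7.0]])
featureKeys = np.array(['zcr', 'rms', 'centroid'])   # hypothetical keys

selector = VarianceThreshold(threshold=0.1)
varReducedDataset = selector.fit_transform(dataset)
isRetained = selector.get_support()
varReducedFeatureKeys = featureKeys[isRetained]

print(varReducedFeatureKeys)   # ['centroid']: keys track surviving columns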

classifier/preprocessing/dataset.npy (BIN)

Binary file not shown.

classifier/preprocessing/featureKeys.npy (BIN)

Binary file not shown.