
Init models training, Other minor fixes

master
Apostolos Fanakis 6 years ago
parent commit 30bacf953f
  1. classifier/classification_model_training/model_training.py (+79)
  2. classifier/feature_extraction/feature_extractor.py (+1)
  3. classifier/preprocessing/data_preprocessing.py (62 changed lines)
  4. classifier/preprocessing/dataset.npy (BIN)
  5. classifier/preprocessing/featureKeys.npy (BIN)

classifier/classification_model_training/model_training.py (new file, +79)

@@ -0,0 +1,79 @@
import numpy as np

class bcolors:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'

# def arrayFromJSON(JSONPath):

# Prints a nice message to let the user know the module was imported
print(bcolors.BLUE + 'model_training loaded' + bcolors.ENDC)

# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys

    dataset = np.load(sys.argv[1] + 'dataset.npy')
    target = np.load(sys.argv[1] + 'target.npy')
    featureKeys = np.load(sys.argv[1] + 'featureKeys.npy')

    row_idx = np.r_[0:10956, 13696:24653]
    trainingSet = np.copy(dataset[row_idx, :])
    trainingTarget = np.copy(target[row_idx])

    row_idx = np.r_[10956:13696, 24653:27392]
    testSet = np.copy(dataset[row_idx, :])
    testTarget = np.copy(target[row_idx])

    # ==========================================================================
    # SVM training
    from sklearn.svm import SVC

    print('Training...')
    clf = SVC(gamma='scale')
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # Without preprocessing                                   => 0.4999087424712539
    # With standardization                                    => 0.8906734805621463
    # With normalization                                      => 0.4999087424712539
    # With standardization, then normalization                => 0.7873699580215368
    # With varReducedDataset + standardization                => 0.8826428180324877
    # With perReducedDataset + standardization                => 0.81529476181785
    # With varReducedDataset + stand. + gamma='scale'         => 0.8828253330899799
    # With varReducedDataset + stand. + sigmoid kernel        => 0.5875159700675305
    # With varReducedDataset + stand. + poly kernel, degree 5 => 0.8441321409016244

    # Decision tree
    from sklearn import tree

    print('Training...')
    clf = tree.DecisionTreeClassifier()
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # With varReducedDataset + standardization => 0.7541522175579485

    # Multi-layer perceptron
    from sklearn.neural_network import MLPClassifier

    print('Training...')
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=2)
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # With varReducedDataset + standardization and random_state=2 => 0.8647563423982478

    # Naive Bayes
    from sklearn.naive_bayes import GaussianNB

    print('Training...')
    clf = GaussianNB()
    clf.fit(trainingSet, trainingTarget)
    print('Testing...')
    print(clf.score(testSet, testTarget))
    # With varReducedDataset + standardization => 0.6557766015696295
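The hard-coded np.r_ index ranges above carve out a roughly 80/20 per-class split, and the score comments attribute most of the SVM's gain to standardization. A minimal sketch of the same experiment using scikit-learn's train_test_split and a Pipeline (a hypothetical refactor, not part of this commit; the file paths are assumed to match the ones loaded above):

import sys

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Same inputs as the script above (paths are an assumption for illustration)
dataset = np.load(sys.argv[1] + 'dataset.npy')
target = np.load(sys.argv[1] + 'target.npy')

# A stratified 80/20 split replaces the hand-picked np.r_ row ranges
trainingSet, testSet, trainingTarget, testTarget = train_test_split(
    dataset, target, test_size=0.2, stratify=target, random_state=0)

# Standardization is fit on the training fold only, avoiding test-set leakage
clf = make_pipeline(StandardScaler(), SVC(gamma='scale'))
clf.fit(trainingSet, trainingTarget)
print(clf.score(testSet, testTarget))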

classifier/feature_extraction/feature_extractor.py (+1)
@@ -60,6 +60,7 @@ def extractFeatures(audioPath, outputPath, sampleRate):
     frameHFC = hfc(frameSpectrum)
     frameSComp = spcComp(frameSpectrum)
+    # Computes cepstral features
     # Discards the bands
     mfcc_coeffs = mfcc(frameSpectrum)[1]
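The [1] index is what the new comment refers to: assuming the extractor uses Essentia's standard-mode MFCC (consistent with the hfc and mfcc calls in the context lines), the algorithm returns a (band energies, coefficients) pair, and the band energies are discarded. A minimal sketch under that assumption, with a synthetic frame standing in for real audio:

import numpy as np
from essentia.standard import Windowing, Spectrum, MFCC

window = Windowing(type='hann', size=2048)
spectrum = Spectrum()
mfcc = MFCC()  # default inputSize=1025 matches a 2048-sample frame

frame = np.random.randn(2048).astype(np.float32)  # stand-in for one audio frame
bands, coeffs = mfcc(spectrum(window(frame)))     # MFCC outputs (band energies, coefficients)
print(len(coeffs))                                # 13 coefficients by default
# mfcc(frameSpectrum)[1] in the diff therefore keeps the coefficients
# and discards the mel band energies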

classifier/preprocessing/data_preprocessing.py (62 changed lines)
@@ -71,6 +71,7 @@ def featureSelection(dataset, target, featureKeys):
     selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold)))
     varReducedDataset = selector.fit_transform(dataset)
     isRetained = selector.get_support()
+    varReducedFeatureKeys = featureKeys[isRetained]

     print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
     for index, retain in enumerate(isRetained):
@@ -84,35 +85,36 @@ def featureSelection(dataset, target, featureKeys):
     print('\n')

     # Selects features based on univariate statistical tests
-    from sklearn.datasets import load_digits
-    from sklearn.feature_selection import SelectPercentile, mutual_info_classif
-    print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
-    percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
-    perReducedDataset = percentileSelector.fit_transform(dataset, target)
-    isRetained = percentileSelector.get_support()
+    # from sklearn.datasets import load_digits
+    # from sklearn.feature_selection import SelectPercentile, mutual_info_classif
+    # print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
+    # percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
+    # perReducedDataset = percentileSelector.fit_transform(dataset, target)
+    # isRetained = percentileSelector.get_support()
+    # perReducedFeatureKeys = featureKeys[isRetained]

-    print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
-    for index, score in enumerate(percentileSelector.scores_):
-        print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)
-        if index%2:
-            print('')
-    print('')
+    # print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
+    # for index, score in enumerate(percentileSelector.scores_):
+    #     print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)
+    #     if index%2:
+    #         print('')
+    # print('')

-    print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
-    for index, retain in enumerate(isRetained):
-        if retain and index < featureKeys.size:
-            print(featureKeys[index], end='\t', flush=True)
+    # print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
+    # for index, retain in enumerate(isRetained):
+    #     if retain and index < featureKeys.size:
+    #         print(featureKeys[index], end='\t', flush=True)

-    print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
-    for index, retain in enumerate(isRetained):
-        if not retain and index < featureKeys.size:
-            print(featureKeys[index], end='\t', flush=True)
-    print('\n')
+    # print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
+    # for index, retain in enumerate(isRetained):
+    #     if not retain and index < featureKeys.size:
+    #         print(featureKeys[index], end='\t', flush=True)
+    # print('\n')

     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
-    return dataset
+    return varReducedDataset, varReducedFeatureKeys

     # Details about this part can be found in the link bellow:
     # https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
@@ -129,7 +131,7 @@ def standardization(dataset):
     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
-    return dataset
+    return scaledDataset

     # Details about this part can be found in the link bellow:
     # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
@@ -151,8 +153,10 @@ print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)
 if __name__ == "__main__":
     import sys

     dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
-    PCA(standardization(featureSelection(dataset, target, featureKeys)))
+    dataset, featureKeys = featureSelection(dataset, target, featureKeys)
+    newDataset = PCA(standardization(dataset))

     print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)
-    np.save('dataset.npy', dataset)
+    np.save('dataset.npy', newDataset)
     np.save('target.npy', target)
     np.save('featureKeys.npy', featureKeys)
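The substantive change in this file is that featureSelection now returns the variance-reduced dataset together with the matching feature names, using the selector's boolean support mask to keep the two aligned. A minimal, self-contained sketch of that pattern (feature names and toy data are hypothetical):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

featureKeys = np.array(['zcr', 'hfc', 'mfcc0'])   # hypothetical feature names
dataset = np.array([[0.0, 1.0, 2.0],
                    [0.0, 3.0, 3.0],
                    [0.0, 5.0, 1.0]])             # first column is constant

selector = VarianceThreshold(threshold=0.1)
varReducedDataset = selector.fit_transform(dataset)  # drops the constant column
isRetained = selector.get_support()                  # boolean mask over columns
varReducedFeatureKeys = featureKeys[isRetained]      # keeps names aligned, as in the diff
print(varReducedFeatureKeys)                         # ['hfc' 'mfcc0']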

classifier/preprocessing/dataset.npy (BIN)

Binary file not shown.

classifier/preprocessing/featureKeys.npy (BIN)

Binary file not shown.