Apostolos Fanakis
6 years ago
2 changed files with 140 additions and 0 deletions
@@ -0,0 +1,14 @@

# Data preprocessing

The file `data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several data preprocessing techniques (feature selection, standardization/normalization, and PCA) to the previously extracted data.

The module can either be imported, or executed as a script using one of the following commands:

`python data_preprocessing.py <music_data_directory> <speech_data_directory>`

or

`python3 data_preprocessing.py <music_data_directory> <speech_data_directory>`
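
For example, with the extracted feature JSONs in two local folders (hypothetical names), the script could be invoked as:

`python3 data_preprocessing.py music_features/ speech_features/`

Note that the directory arguments need a trailing slash, since the module builds file paths by plain string concatenation (`musicJSONsPath + file`).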

**Dependencies:**

- scikit-learn
- numpy

All dependencies are available for both Python 2 and Python 3, and can be installed using `pip install <package_name>` or `pip3 install <package_name>` respectively.
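
When imported as a module, the same pipeline can also be driven manually. A minimal sketch, using the module's own function names with hypothetical placeholder directories:

```python
import data_preprocessing as dp

# Build one labelled dataset from the two feature directories;
# the last column holds the class (0 = music, 1 = speech)
dataset, featureKeys = dp.createSingleFeaturesArray('music_features/', 'speech_features/')

# Apply the preprocessing steps in the same order as the standalone script
dataset = dp.featureSelection(dataset, featureKeys)
dataset = dp.standardization(dataset)
dataset = dp.PCA(dataset)
```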

@@ -0,0 +1,126 @@

from os import listdir
from os.path import isfile, join
import numpy as np
import json


def arrayFromJSONs(JSONPath):
    # Reads a single JSON file of extracted features and returns the
    # feature names along with a 2D array of the feature values
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)

    keys = np.array([])
    values = np.array([])
    for featureKey, featureValues in rawJSON.items():
        if keys.size == 0 or values.size == 0:
            keys = np.array(featureKey)
            values = np.array(featureValues)
        else:
            keys = np.append(keys, np.array(featureKey))
            values = np.vstack((values, np.array(featureValues)))

    # Transposes so that rows correspond to samples and columns to features
    values = np.transpose(values)
    return keys, values


def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
    dataset = np.array([])
    featureKeys = np.array([])

    # Reads the extracted features for the music class
    featuresFiles = [file for file in listdir(musicJSONsPath)
                     if isfile(join(musicJSONsPath, file))]
    for file in featuresFiles:
        if dataset.size == 0:
            # Gets feature arrays
            featureKeys, musicFeatures = arrayFromJSONs(musicJSONsPath + file)
            # Appends the class to the arrays (0 for music, 1 for speech)
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.copy(musicFeatures)
        else:
            # Gets feature arrays
            musicFeatures = arrayFromJSONs(musicJSONsPath + file)[1]
            # Appends the class to the arrays (0 for music, 1 for speech);
            # the class column is rebuilt here because each file may hold
            # a different number of samples
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.vstack((dataset, musicFeatures))

    # Reads the extracted features for the speech class
    featuresFiles = [file for file in listdir(speechJSONsPath)
                     if isfile(join(speechJSONsPath, file))]
    for file in featuresFiles:
        # Gets feature arrays
        speechFeatures = arrayFromJSONs(speechJSONsPath + file)[1]
        # Appends the class to the arrays (0 for music, 1 for speech)
        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
        speechFeatures = np.c_[speechFeatures, speechClass]
        dataset = np.vstack((dataset, speechFeatures))

    # Rows are samples, columns are features, with the class label
    # appended as the last column
    return dataset, featureKeys


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html
def featureSelection(dataset, featureKeys):
    # Selects features based on a variance threshold
    from sklearn.feature_selection import VarianceThreshold

    # p * (1 - p) is the variance of a Bernoulli(p) variable, the
    # heuristic the scikit-learn guide uses for removing near-constant
    # features
    varianceThreshold = 0.72
    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
    # The class column is excluded so only the actual features are tested
    varReducedDataset = selector.fit_transform(dataset[:, :-1])
    isRetained = selector.get_support()

    print('Retaining features:')
    for index, retain in enumerate(isRetained):
        if retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)

    print('\n\nRemoving features:')
    for index, retain in enumerate(isRetained):
        if not retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)
    print('\n')

    # Selects features based on univariate statistical tests; note that
    # mutual_info_regression treats the labels as a continuous target,
    # mutual_info_classif would be its counterpart for the binary classes
    from sklearn.feature_selection import SelectPercentile, mutual_info_regression

    perReducedDataset = SelectPercentile(mutual_info_regression,
                                         percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
    from sklearn import preprocessing

    # Standardization (zero mean, unit variance per feature)
    scaledDataset = preprocessing.scale(dataset[:, :-1])
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # Normalization (scales each sample to unit L2 norm); note that this
    # starts from the raw features again, overwriting the standardized result
    scaledDataset = preprocessing.normalize(dataset[:, :-1], norm='l2')
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset):
    from sklearn.decomposition import PCA

    # Projects the features (class column excluded) onto the first
    # 10 principal components
    pca = PCA(n_components=10, svd_solver='full')
    transformedDataset = pca.fit(dataset[:, :-1]).transform(dataset[:, :-1])
    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Prints a nice message to let the user know the module was imported
print('data_preprocessing loaded')


# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys
    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
    PCA(standardization(featureSelection(dataset, featureKeys)))