From f2d288d0a443991dfc3761914d8bfa11edd65196 Mon Sep 17 00:00:00 2001
From: Apostolof
Date: Sun, 2 Dec 2018 22:16:32 +0200
Subject: [PATCH] Init data preprocessing

---
 classifier/preprocessing/README.md       |  14 ++
 .../preprocessing/data_preprocessing.py  | 126 ++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 classifier/preprocessing/README.md
 create mode 100644 classifier/preprocessing/data_preprocessing.py

diff --git a/classifier/preprocessing/README.md b/classifier/preprocessing/README.md
new file mode 100644
index 0000000..50c5aef
--- /dev/null
+++ b/classifier/preprocessing/README.md
@@ -0,0 +1,14 @@
+# Data preprocessing
+
+The file `data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several data preprocessing techniques to the previously extracted data.
+
+The module can be imported, or executed as a script using one of the following commands:
+`python data_preprocessing.py <musicJSONsPath> <speechJSONsPath>`
+or
+`python3 data_preprocessing.py <musicJSONsPath> <speechJSONsPath>`
+
+**Dependencies:**
+- scikit-learn
+- numpy
+
+All dependencies are available for both Python 2 and Python 3, and can be installed with `pip install <package>` or `pip3 install <package>` respectively.
\ No newline at end of file
diff --git a/classifier/preprocessing/data_preprocessing.py b/classifier/preprocessing/data_preprocessing.py
new file mode 100644
index 0000000..02f1cb4
--- /dev/null
+++ b/classifier/preprocessing/data_preprocessing.py
@@ -0,0 +1,126 @@
+from os import listdir
+from os.path import isfile, join
+import numpy as np
+import json
+
+def arrayFromJSONs(JSONPath):
+    with open(JSONPath) as jsonFile:
+        rawJSON = json.load(jsonFile)
+
+    keys = np.array([])
+    values = np.array([])
+    for featureKey, featureValues in rawJSON.items():
+        if keys.size == 0 or values.size == 0:
+            keys = np.array(featureKey)
+            values = np.array(featureValues)
+        else:
+            keys = np.append(keys, np.array(featureKey))
+            values = np.vstack((values, np.array(featureValues)))
+
+    values = np.transpose(values)
+    return keys, values
+
+def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
+    dataset = np.array([])
+    featureKeys = np.array([])
+
+    # Reads the extracted features for the music class
+    featuresFiles = [file for file in listdir(musicJSONsPath) if isfile(join(musicJSONsPath, file))]
+    for file in featuresFiles:
+        if dataset.size == 0:
+            # Gets feature arrays
+            featureKeys, musicFeatures = arrayFromJSONs(join(musicJSONsPath, file))
+            # Appends the class to the arrays (0 for music, 1 for speech)
+            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
+            musicFeatures = np.c_[musicFeatures, musicClass]
+            dataset = np.copy(musicFeatures)
+        else:
+            # Gets feature arrays
+            musicFeatures = arrayFromJSONs(join(musicJSONsPath, file))[1]
+            # Appends the class to the arrays (0 for music, 1 for speech)
+            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
+            musicFeatures = np.c_[musicFeatures, musicClass]
+            dataset = np.vstack((dataset, musicFeatures))
+
+    # Reads the extracted features for the speech class
+    featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
+    for file in featuresFiles:
+        # Gets feature arrays
+        speechFeatures = arrayFromJSONs(join(speechJSONsPath, file))[1]
+        # Appends the class to the arrays (0 for music, 1 for speech)
+        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
+        speechFeatures = np.c_[speechFeatures, speechClass]
+        dataset = np.vstack((dataset, speechFeatures))
+
+    return dataset, featureKeys
+
+# Details about this part can be found in the link below:
+# https://scikit-learn.org/stable/modules/feature_selection.html
+def featureSelection(dataset, featureKeys):
+    # Selects features based on a variance threshold (the class column is excluded)
+    from sklearn.feature_selection import VarianceThreshold
+
+    varianceThreshold = 0.72
+    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
+    varReducedDataset = selector.fit_transform(dataset[:, :-1])
+    isRetained = selector.get_support()
+
+    print('Retaining features:')
+    for index, retain in enumerate(isRetained):
+        if retain:
+            print(featureKeys[index], end='\t', flush=True)
+
+    print('\n\nRemoving features:')
+    for index, retain in enumerate(isRetained):
+        if not retain:
+            print(featureKeys[index], end='\t', flush=True)
+    print('\n')
+
+    # Selects features based on univariate statistical tests against the class labels
+    from sklearn.feature_selection import SelectPercentile, mutual_info_classif
+
+    perReducedDataset = SelectPercentile(mutual_info_classif,
+        percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])
+
+    # TODO: change the return value after the values of the parameters are decided
+    # and the feature selection is complete
+    return dataset
+
+# Details about this part can be found in the link below:
+# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
+def standardization(dataset):
+    from sklearn import preprocessing
+
+    # Standardization
+    scaledDataset = preprocessing.scale(dataset[:, :-1])
+    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
+
+    # Normalization (applied to the standardized features, not the raw dataset)
+    scaledDataset = preprocessing.normalize(scaledDataset[:, :-1], norm='l2')
+    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
+
+    # TODO: change the return value after the values of the parameters are decided
+    # and the standardization is complete
+    return dataset
+
+# Details about this part can be found in the link below:
+# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
+def PCA(dataset):
+    from sklearn.decomposition import PCA as sklearnPCA
+
+    pca = sklearnPCA(n_components=10, svd_solver='full')
+    transformedDataset = pca.fit_transform(dataset[:, :-1])
+    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]
+
+    # TODO: change the return value after the values of the parameters are decided
+    # and the dimensionality reduction is complete
+    return dataset
+
+# Prints a nice message to let the user know the module was imported
+print('data_preprocessing loaded')
+
+# Enables executing the module as a standalone script
+if __name__ == "__main__":
+    import sys
+    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
+    PCA(standardization(featureSelection(dataset, featureKeys)))
\ No newline at end of file
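
For reference, `arrayFromJSONs` expects each JSON file to map a feature name to that feature's list of per-frame values, and returns the names alongside a frames-by-features matrix. A minimal sketch of that contract; the feature names and values here are made up for illustration:

```python
import json
from data_preprocessing import arrayFromJSONs

# Hypothetical input file: two features, three frames each
with open('example.json', 'w') as jsonFile:
    json.dump({'zcr': [0.10, 0.20, 0.30], 'energy': [0.50, 0.40, 0.60]}, jsonFile)

keys, values = arrayFromJSONs('example.json')
print(keys)          # ['zcr' 'energy'] -- the JSON keys, in insertion order
print(values.shape)  # (3, 2) -- one row per frame, one column per feature
```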
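A side note on `featureSelection`: the expression `0.72 * (1 - 0.72)` evaluates to a variance cut-off of about 0.2016 and mirrors the `p * (1 - p)` Bernoulli-variance form used in the scikit-learn feature-selection guide for boolean features; whether that form suits these continuous features is what the TODO leaves open. A tiny sketch of the behaviour on made-up data:

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Made-up data: the first column barely varies, the second varies a lot
X = np.array([[0.0, 10.0], [0.1, -10.0], [0.0, 20.0], [0.1, -20.0]])

selector = VarianceThreshold(threshold=0.72 * (1 - 0.72))  # ~0.2016
print(selector.fit_transform(X))  # only the second column survives
print(selector.get_support())     # [False  True]
```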
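Likewise, a usage sketch of the whole pipeline as wired up in the `__main__` block; the directory paths are hypothetical, and with the module's current TODO placeholders each stage still returns the dataset unchanged:

```python
from data_preprocessing import (createSingleFeaturesArray, featureSelection,
                                standardization, PCA)

# Hypothetical directories holding the extracted feature JSONs;
# the last column of `dataset` is the class label (0 music, 1 speech)
dataset, featureKeys = createSingleFeaturesArray('features/music', 'features/speech')

# Mirrors the script entry point: selection, scaling, then PCA
result = PCA(standardization(featureSelection(dataset, featureKeys)))
```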