From f2d288d0a443991dfc3761914d8bfa11edd65196 Mon Sep 17 00:00:00 2001
From: Apostolof
Date: Sun, 2 Dec 2018 22:16:32 +0200
Subject: [PATCH] Init data preprocessing

---
 classifier/preprocessing/README.md       |  14 ++
 .../preprocessing/data_preprocessing.py  | 126 ++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 classifier/preprocessing/README.md
 create mode 100644 classifier/preprocessing/data_preprocessing.py

diff --git a/classifier/preprocessing/README.md b/classifier/preprocessing/README.md
new file mode 100644
index 0000000..50c5aef
--- /dev/null
+++ b/classifier/preprocessing/README.md
@@ -0,0 +1,14 @@
+# Data preprocessing
+
+The file `data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several data preprocessing techniques to the previously extracted data.
+
+The module can be imported, or executed as a script using one of the following commands:
+`python data_preprocessing.py <musicJSONsPath> <speechJSONsPath>`
+or
+`python3 data_preprocessing.py <musicJSONsPath> <speechJSONsPath>`
+
+**Dependencies:**
+- scikit-learn
+- numpy
+
+All dependencies are available for both Python 2 and Python 3, and can be installed with `pip install <package>` or `pip3 install <package>` respectively.
\ No newline at end of file
diff --git a/classifier/preprocessing/data_preprocessing.py b/classifier/preprocessing/data_preprocessing.py
new file mode 100644
index 0000000..02f1cb4
--- /dev/null
+++ b/classifier/preprocessing/data_preprocessing.py
@@ -0,0 +1,126 @@
+from os import listdir
+from os.path import isfile, join
+import numpy as np
+import json
+
+def arrayFromJSONs(JSONPath):
+    with open(JSONPath) as jsonFile:
+        rawJSON = json.load(jsonFile)
+
+    keys = np.array([])
+    values = np.array([])
+    for featureKey, featureValues in rawJSON.items():
+        if keys.size == 0 or values.size == 0:
+            keys = np.array(featureKey)
+            values = np.array(featureValues)
+        else:
+            keys = np.append(keys, np.array(featureKey))
+            values = np.vstack((values, np.array(featureValues)))
+
+    values = np.transpose(values)
+    return keys, values
+
+def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
+    dataset = np.array([])
+    featureKeys = np.array([])
+
+    # Reads the extracted features for the music class
+    featuresFiles = [file for file in listdir(musicJSONsPath) if isfile(join(musicJSONsPath, file))]
+    for file in featuresFiles:
+        if dataset.size == 0:
+            # Gets feature arrays
+            featureKeys, musicFeatures = arrayFromJSONs(join(musicJSONsPath, file))
+            # Appends the class to the arrays (0 for music, 1 for speech)
+            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
+            musicFeatures = np.c_[musicFeatures, musicClass]
+            dataset = np.copy(musicFeatures)
+        else:
+            # Gets feature arrays
+            musicFeatures = arrayFromJSONs(join(musicJSONsPath, file))[1]
+            # Appends the class to the arrays (0 for music, 1 for speech)
+            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
+            musicFeatures = np.c_[musicFeatures, musicClass]
+            dataset = np.vstack((dataset, musicFeatures))
+
+    # Reads the extracted features for the speech class
+    featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
+    for file in featuresFiles:
+        # Gets feature arrays
+        speechFeatures = arrayFromJSONs(join(speechJSONsPath, file))[1]
+        # Appends the class to the arrays (0 for music, 1 for speech)
+        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
+        speechFeatures = np.c_[speechFeatures, speechClass]
+        dataset = np.vstack((dataset, speechFeatures))
+
+    return dataset, featureKeys
+
+# Details about this part can be found in the link below:
+# https://scikit-learn.org/stable/modules/feature_selection.html
+def featureSelection(dataset, featureKeys):
+    # Selects features based on a variance threshold (the class column is excluded)
+    from sklearn.feature_selection import VarianceThreshold
+
+    varianceThreshold = 0.72
+    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
+    varReducedDataset = selector.fit_transform(dataset[:, :-1])
+    isRetained = selector.get_support()
+
+    print('Retaining features:')
+    for index, retain in enumerate(isRetained):
+        if retain:
+            print(featureKeys[index], end='\t', flush=True)
+
+    print('\n\nRemoving features:')
+    for index, retain in enumerate(isRetained):
+        if not retain:
+            print(featureKeys[index], end='\t', flush=True)
+    print('\n')
+
+    # Selects features based on univariate statistical tests against the class labels
+    from sklearn.feature_selection import SelectPercentile, mutual_info_classif
+
+    perReducedDataset = SelectPercentile(mutual_info_classif,
+        percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])
+
+    # TODO: change the return value after the values of the parameters are decided
+    # and the feature selection is complete
+    return dataset
+
+# Details about this part can be found in the link below:
+# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
+def standardization(dataset):
+    from sklearn import preprocessing
+
+    # Standardization
+    scaledDataset = preprocessing.scale(dataset[:, :-1])
+    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
+
+    # Normalization (applied to the standardized features, not the raw dataset)
+    scaledDataset = preprocessing.normalize(scaledDataset[:, :-1], norm='l2')
+    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
+
+    # TODO: change the return value after the values of the parameters are decided
+    # and the standardization is complete
+    return dataset
+
+# Details about this part can be found in the link below:
+# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
+def PCA(dataset):
+    from sklearn.decomposition import PCA as sklearnPCA
+
+    pca = sklearnPCA(n_components=10, svd_solver='full')
+    transformedDataset = pca.fit_transform(dataset[:, :-1])
+    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]
+
+    # TODO: change the return value after the values of the parameters are decided
+    # and the dimensionality reduction is complete
+    return dataset
+
+# Prints a nice message to let the user know the module was imported
+print('data_preprocessing loaded')
+
+# Enables executing the module as a standalone script
+if __name__ == "__main__":
+    import sys
+    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
+    PCA(standardization(featureSelection(dataset, featureKeys)))
\ No newline at end of file
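
For reference, `arrayFromJSONs` expects each JSON file to map a feature name to that feature's list of per-frame values, and returns the names alongside a frames-by-features matrix. A minimal sketch of that contract; the feature names and values here are made up for illustration:

```python
import json
from data_preprocessing import arrayFromJSONs

# Hypothetical input file: two features, three frames each
with open('example.json', 'w') as jsonFile:
    json.dump({'zcr': [0.10, 0.20, 0.30], 'energy': [0.50, 0.40, 0.60]}, jsonFile)

keys, values = arrayFromJSONs('example.json')
print(keys)          # ['zcr' 'energy'] -- the JSON keys, in insertion order
print(values.shape)  # (3, 2) -- one row per frame, one column per feature
```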
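A side note on `featureSelection`: the expression `0.72 * (1 - 0.72)` evaluates to a variance cut-off of about 0.2016 and mirrors the `p * (1 - p)` Bernoulli-variance form used in the scikit-learn feature-selection guide for boolean features; whether that form suits these continuous features is what the TODO leaves open. A tiny sketch of the behaviour on made-up data:

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Made-up data: the first column barely varies, the second varies a lot
X = np.array([[0.0, 10.0], [0.1, -10.0], [0.0, 20.0], [0.1, -20.0]])

selector = VarianceThreshold(threshold=0.72 * (1 - 0.72))  # ~0.2016
print(selector.fit_transform(X))  # only the second column survives
print(selector.get_support())     # [False  True]
```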
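Likewise, a usage sketch of the whole pipeline as wired up in the `__main__` block; the directory paths are hypothetical, and with the module's current TODO placeholders each stage still returns the dataset unchanged:

```python
from data_preprocessing import (createSingleFeaturesArray, featureSelection,
                                standardization, PCA)

# Hypothetical directories holding the extracted feature JSONs;
# the last column of `dataset` is the class label (0 music, 1 speech)
dataset, featureKeys = createSingleFeaturesArray('features/music', 'features/speech')

# Mirrors the script entry point: selection, scaling, then PCA
result = PCA(standardization(featureSelection(dataset, featureKeys)))
```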