Apostolos Fanakis
6 years ago
2 changed files with 140 additions and 0 deletions
@ -0,0 +1,14 @@ |
# Data preprocessing |
The file `data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several data preprocessing techniques to the previously extracted data.
The module can be imported, or executed as a script using one of the following commands:
`python data_preprocessing.py <music_data_directory> <speech_data_directory>` |
or |
`python3 data_preprocessing.py <music_data_directory> <speech_data_directory>` |
**Dependencies:** |
- scikit-learn |
- numpy |
All dependencies are available for both Python 2 and Python 3 and can be installed using the command `pip install <package_name>` or `pip3 install <package_name>`, respectively.
@ -0,0 +1,126 @@ |
from os import listdir |
from os.path import isfile, join |
import numpy as np |
import json |
def arrayFromJSONs(JSONPath):
    """Load one extracted-features JSON file into key and value arrays.

    The JSON document is read as an object that maps each feature name to
    that feature's values.

    Args:
        JSONPath: Path of the JSON file to read.

    Returns:
        A tuple ``(keys, values)``. ``keys`` is a 1-D array with the feature
        names in file order. ``values`` is the stacked feature values,
        transposed so that each column corresponds to one feature (this
        holds for a single feature too, which yields a single column).
        Both are empty arrays when the JSON object has no entries.

    Raises:
        ValueError: If the features cannot be stacked because they do not
            all have the same number of values.
    """
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)

    if not rawJSON:
        return np.array([]), np.array([])

    # Always a 1-D array, even for a single feature, so callers can index it.
    keys = np.array(list(rawJSON.keys()))
    # Stack every feature as one row in a single call instead of growing the
    # array feature by feature, then transpose so each feature is a column.
    values = np.transpose(
        np.vstack([np.asarray(featureValues)
                   for featureValues in rawJSON.values()]))
    return keys, values
def _loadClassFeatures(JSONsPath, classLabel):
    """Read every features file in one directory and label its rows.

    Returns a tuple ``(featureKeys, labelledBlocks)``: the feature keys of the
    first non-empty file (``None`` if there is none) and a list with one array
    per file, each being that file's features with ``classLabel`` appended as
    the last column.
    """
    featureKeys = None
    labelledBlocks = []
    for fileName in listdir(JSONsPath):
        # join() inserts the separator when the directory argument was given
        # without a trailing slash.
        filePath = join(JSONsPath, fileName)
        if not isfile(filePath):
            continue
        keys, features = arrayFromJSONs(filePath)
        if features.size == 0:
            continue
        if featureKeys is None:
            featureKeys = keys
        # The label column is sized per file: files may differ in row count.
        labels = np.full(features.shape[0], classLabel, dtype=int)
        labelledBlocks.append(np.c_[features, labels])
    return featureKeys, labelledBlocks


def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
    """Merge the extracted features of both classes into one labelled array.

    Every regular file in the two directories is loaded with
    ``arrayFromJSONs``. The rows of the music files come first, followed by
    the rows of the speech files.

    Args:
        musicJSONsPath: Directory holding the music features JSON files.
        speechJSONsPath: Directory holding the speech features JSON files.

    Returns:
        A tuple ``(dataset, featureKeys)``. ``dataset`` holds the feature
        columns followed by a class column (0 for music, 1 for speech).
        ``featureKeys`` holds the feature keys of the first file that was
        loaded. Both are empty arrays when no features were found.

    Raises:
        ValueError: If the files do not all have the same number of feature
            columns.
    """
    musicKeys, musicBlocks = _loadClassFeatures(musicJSONsPath, 0)
    speechKeys, speechBlocks = _loadClassFeatures(speechJSONsPath, 1)

    labelledBlocks = musicBlocks + speechBlocks
    if not labelledBlocks:
        return np.array([]), np.array([])

    # Fall back to the speech keys when the music directory had no features.
    featureKeys = musicKeys if musicKeys is not None else speechKeys
    # A single vstack avoids re-copying the growing dataset for every file.
    return np.vstack(labelledBlocks), featureKeys
# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html
def featureSelection(dataset, featureKeys):
    """Run the feature selection experiments and report the outcome.

    Two selections are computed: one based on a variance threshold and one
    based on a univariate statistical test. The names of the features
    retained and removed by the variance threshold are printed. The reduced
    datasets themselves are currently discarded (see the TODO below).

    Args:
        dataset: 2-D array whose last column is the class of each row and
            whose other columns are the features.
        featureKeys: Array with the names of the feature columns.

    Returns:
        The ``dataset`` argument, unchanged.
    """
    from sklearn.feature_selection import (
        SelectPercentile,
        VarianceThreshold,
        mutual_info_classif,
    )

    # Selects features based on a variance threshold
    varianceThreshold = 0.72
    selector = VarianceThreshold(
        threshold=varianceThreshold * (1 - varianceThreshold))
    varReducedDataset = selector.fit_transform(dataset)
    isRetained = selector.get_support()

    # NOTE: the selector is fitted on every column of `dataset`, including the
    # trailing class column, so the support mask can be longer than
    # `featureKeys`; positions that have no key are skipped when printing.
    print('Retaining features:')
    for index, retain in enumerate(isRetained):
        if retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)

    print('\n\nRemoving features:')
    for index, retain in enumerate(isRetained):
        if not retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)
    print('\n')

    # Selects features based on univariate statistical tests. The target is
    # the discrete class column, so the classification variant of the mutual
    # information estimator is used.
    perReducedDataset = SelectPercentile(
        mutual_info_classif,
        percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset
# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
    """Compute scaled variants of the feature columns of ``dataset``.

    The last column of ``dataset`` is treated as the class column and is
    re-attached, untouched, to each scaled result. The scaled results are
    currently discarded (see the TODO below).

    Args:
        dataset: 2-D array whose last column is the class of each row.

    Returns:
        The ``dataset`` argument, unchanged.
    """
    from sklearn import preprocessing

    featureColumns = dataset[:, :-1]
    classColumn = dataset[:, -1]

    # Standardization
    scaledDataset = np.c_[preprocessing.scale(featureColumns), classColumn]

    # Normalization
    # NOTE(review): this is computed from the raw features and replaces the
    # standardized result above -- confirm whether chaining was intended.
    scaledDataset = np.c_[
        preprocessing.normalize(featureColumns, norm='l2'), classColumn]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset
# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset, nComponents=10):
    """Project the feature columns of ``dataset`` onto principal components.

    The last column of ``dataset`` is treated as the class column and is
    re-attached, untouched, to the projected features. The transformed
    dataset is currently discarded (see the TODO below).

    Args:
        dataset: 2-D array whose last column is the class of each row.
        nComponents: Number of principal components to keep. It is lowered
            to the number of samples or features when the dataset is smaller
            than the requested value.

    Returns:
        The ``dataset`` argument, unchanged.
    """
    # Imported under an alias so the estimator does not shadow this function.
    from sklearn.decomposition import PCA as SklearnPCA

    features = dataset[:, :-1]
    # The full SVD solver rejects more components than
    # min(n_samples, n_features), so the request is clamped to that bound.
    componentsCount = min(nComponents, min(features.shape))

    pca = SklearnPCA(n_components=componentsCount, svd_solver='full')
    transformedDataset = np.c_[pca.fit_transform(features), dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset
# Prints a nice message to let the user know the module was imported
print('feature_preprocessing loaded')

# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys

    # Both data directories are required; exit with a usage message instead
    # of an IndexError when one of them is missing.
    if len(sys.argv) < 3:
        sys.exit('Usage: python data_preprocessing.py '
                 '<music_data_directory> <speech_data_directory>')

    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
    PCA(standardization(featureSelection(dataset, featureKeys)))
Reference in new issue