Apostolos Fanakis
6 years ago
2 changed files with 140 additions and 0 deletions
@@ -0,0 +1,14 @@

# Data preprocessing

The file `data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several data preprocessing techniques (feature selection, standardization/normalization, and PCA) to the previously extracted data.

The module can either be imported, or executed as a script using one of the following commands:

`python data_preprocessing.py <music_data_directory> <speech_data_directory>`

or

`python3 data_preprocessing.py <music_data_directory> <speech_data_directory>`
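
For example, with the extracted feature JSONs in two local folders (hypothetical names), the script could be invoked as:

`python3 data_preprocessing.py music_features/ speech_features/`

Note that the directory arguments need a trailing slash, since the module builds file paths by plain string concatenation (`musicJSONsPath + file`).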

**Dependencies:**

- scikit-learn
- numpy

All dependencies are available for both Python 2 and Python 3, and can be installed using `pip install <package_name>` or `pip3 install <package_name>` respectively.
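
When imported as a module, the same pipeline can also be driven manually. A minimal sketch, using the module's own function names with hypothetical placeholder directories:

```python
import data_preprocessing as dp

# Build one labelled dataset from the two feature directories;
# the last column holds the class (0 = music, 1 = speech)
dataset, featureKeys = dp.createSingleFeaturesArray('music_features/', 'speech_features/')

# Apply the preprocessing steps in the same order as the standalone script
dataset = dp.featureSelection(dataset, featureKeys)
dataset = dp.standardization(dataset)
dataset = dp.PCA(dataset)
```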

@@ -0,0 +1,126 @@

from os import listdir
from os.path import isfile, join
import numpy as np
import json


def arrayFromJSONs(JSONPath):
    # Reads a single JSON file of extracted features and returns the
    # feature names along with a 2D array of the feature values
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)

    keys = np.array([])
    values = np.array([])
    for featureKey, featureValues in rawJSON.items():
        if keys.size == 0 or values.size == 0:
            keys = np.array(featureKey)
            values = np.array(featureValues)
        else:
            keys = np.append(keys, np.array(featureKey))
            values = np.vstack((values, np.array(featureValues)))

    # Transposes so that rows correspond to samples and columns to features
    values = np.transpose(values)
    return keys, values


def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
    dataset = np.array([])
    featureKeys = np.array([])

    # Reads the extracted features for the music class
    featuresFiles = [file for file in listdir(musicJSONsPath)
                     if isfile(join(musicJSONsPath, file))]
    for file in featuresFiles:
        if dataset.size == 0:
            # Gets feature arrays
            featureKeys, musicFeatures = arrayFromJSONs(musicJSONsPath + file)
            # Appends the class to the arrays (0 for music, 1 for speech)
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.copy(musicFeatures)
        else:
            # Gets feature arrays
            musicFeatures = arrayFromJSONs(musicJSONsPath + file)[1]
            # Appends the class to the arrays (0 for music, 1 for speech);
            # the class column is rebuilt here because each file may hold
            # a different number of samples
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.vstack((dataset, musicFeatures))

    # Reads the extracted features for the speech class
    featuresFiles = [file for file in listdir(speechJSONsPath)
                     if isfile(join(speechJSONsPath, file))]
    for file in featuresFiles:
        # Gets feature arrays
        speechFeatures = arrayFromJSONs(speechJSONsPath + file)[1]
        # Appends the class to the arrays (0 for music, 1 for speech)
        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
        speechFeatures = np.c_[speechFeatures, speechClass]
        dataset = np.vstack((dataset, speechFeatures))

    # Rows are samples, columns are features, with the class label
    # appended as the last column
    return dataset, featureKeys


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html
def featureSelection(dataset, featureKeys):
    # Selects features based on a variance threshold
    from sklearn.feature_selection import VarianceThreshold

    # p * (1 - p) is the variance of a Bernoulli(p) variable, the
    # heuristic the scikit-learn guide uses for removing near-constant
    # features
    varianceThreshold = 0.72
    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
    # The class column is excluded so only the actual features are tested
    varReducedDataset = selector.fit_transform(dataset[:, :-1])
    isRetained = selector.get_support()

    print('Retaining features:')
    for index, retain in enumerate(isRetained):
        if retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)

    print('\n\nRemoving features:')
    for index, retain in enumerate(isRetained):
        if not retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)
    print('\n')

    # Selects features based on univariate statistical tests; note that
    # mutual_info_regression treats the labels as a continuous target,
    # mutual_info_classif would be its counterpart for the binary classes
    from sklearn.feature_selection import SelectPercentile, mutual_info_regression

    perReducedDataset = SelectPercentile(mutual_info_regression,
                                         percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
    from sklearn import preprocessing

    # Standardization (zero mean, unit variance per feature)
    scaledDataset = preprocessing.scale(dataset[:, :-1])
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # Normalization (scales each sample to unit L2 norm); note that this
    # starts from the raw features again, overwriting the standardized result
    scaledDataset = preprocessing.normalize(dataset[:, :-1], norm='l2')
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset):
    from sklearn.decomposition import PCA

    # Projects the features (class column excluded) onto the first
    # 10 principal components
    pca = PCA(n_components=10, svd_solver='full')
    transformedDataset = pca.fit(dataset[:, :-1]).transform(dataset[:, :-1])
    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Prints a nice message to let the user know the module was imported
print('data_preprocessing loaded')


# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys
    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
    PCA(standardization(featureSelection(dataset, featureKeys)))