Apostolos Fanakis
6 years ago
2 changed files with 140 additions and 0 deletions
@@ -0,0 +1,14 @@
# Data preprocessing

The file `data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several data preprocessing techniques to the previously extracted data.

The module can be imported, or executed as a script using one of the following commands:

`python data_preprocessing.py <music_data_directory> <speech_data_directory>`

or

`python3 data_preprocessing.py <music_data_directory> <speech_data_directory>`

**Dependencies:**

- scikit-learn
- numpy

All dependencies are available for both Python 2 and Python 3 and can be installed with `pip install <package_name>` or `pip3 install <package_name>` respectively.
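When imported, the module's functions can also be called individually. A minimal usage sketch, following the same order as the script's entry point (the two directory paths are placeholders):

```python
import data_preprocessing as dp

# Builds the labelled dataset (the two paths below are placeholders)
dataset, featureKeys = dp.createSingleFeaturesArray('music_data/', 'speech_data/')

# Applies the preprocessing steps in the same order as the script entry point
reduced = dp.featureSelection(dataset, featureKeys)
scaled = dp.standardization(reduced)
transformed = dp.PCA(scaled)
```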
@@ -0,0 +1,126 @@
from os import listdir
from os.path import isfile, join
import numpy as np
import json


def arrayFromJSONs(JSONPath):
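    """Parses one JSON file of extracted features.

    Each top-level key is a feature name mapped to a list of per-sample
    values. Returns the feature names and a value matrix with one row per
    sample and one column per feature.
    """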
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)

    keys = np.array([])
    values = np.array([])
    for featureKey, featureValues in rawJSON.items():
        if keys.size == 0 or values.size == 0:
            keys = np.array(featureKey)
            values = np.array(featureValues)
        else:
            keys = np.append(keys, np.array(featureKey))
            values = np.vstack((values, np.array(featureValues)))

    values = np.transpose(values)
    return keys, values


def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
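    """Builds one labelled dataset from the music and speech directories.

    Reads every JSON file in each directory, stacks the samples and appends
    a class column (0 for music, 1 for speech). Returns the dataset and the
    array of feature names.
    """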
    dataset = np.array([])
    featureKeys = np.array([])

    # Reads the extracted features for the music class
    featuresFiles = [file for file in listdir(musicJSONsPath) if isfile(join(musicJSONsPath, file))]
    for file in featuresFiles:
        if dataset.size == 0:
            # Gets feature arrays
            featureKeys, musicFeatures = arrayFromJSONs(join(musicJSONsPath, file))
            # Appends the class to the arrays (0 for music, 1 for speech)
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.copy(musicFeatures)
        else:
            # Gets feature arrays
            musicFeatures = arrayFromJSONs(join(musicJSONsPath, file))[1]
            # Appends the class to the arrays (0 for music, 1 for speech);
            # the class column is rebuilt here because each file may hold a
            # different number of samples
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.vstack((dataset, musicFeatures))

    # Reads the extracted features for the speech class
    featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
    for file in featuresFiles:
        # Gets feature arrays
        speechFeatures = arrayFromJSONs(join(speechJSONsPath, file))[1]
        # Appends the class to the arrays (0 for music, 1 for speech)
        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
        speechFeatures = np.c_[speechFeatures, speechClass]
        dataset = np.vstack((dataset, speechFeatures))

    return dataset, featureKeys


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html
def featureSelection(dataset, featureKeys):
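    """Tries two selection strategies from the scikit-learn guide linked
    above: a variance threshold and a univariate percentile selector.
    Prints which features the variance threshold retains and removes.
    """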
    # Selects features based on a variance threshold
    from sklearn.feature_selection import VarianceThreshold

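    # The threshold p * (1 - p) mirrors the Boolean-feature example in the
    # scikit-learn guide linked above, where it is the variance of a
    # Bernoulli variable that is 1 with probability p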
    varianceThreshold = 0.72
    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
    varReducedDataset = selector.fit_transform(dataset)
    isRetained = selector.get_support()

    print('Retaining features:')
    for index, retain in enumerate(isRetained):
        if retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)

    print('\n\nRemoving features:')
    for index, retain in enumerate(isRetained):
        if not retain and index < featureKeys.size:
            print(featureKeys[index], end='\t', flush=True)
    print('\n')

    # Selects features based on univariate statistical tests
    from sklearn.feature_selection import SelectPercentile, mutual_info_regression

    perReducedDataset = SelectPercentile(mutual_info_regression,
                                         percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
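    """Scales the feature columns, leaving the class column (the last one)
    untouched: standardizes to zero mean and unit variance, and computes an
    l2-normalized variant of the raw features.
    """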
    from sklearn import preprocessing

    # Standardization (zero mean, unit variance)
    scaledDataset = preprocessing.scale(dataset[:, :-1])
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # Normalization (l2, applied to the raw features, not the scaled ones)
    normalizedDataset = preprocessing.normalize(dataset[:, :-1], norm='l2')
    normalizedDataset = np.c_[normalizedDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset):
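    """Projects the feature columns onto their first 10 principal
    components and re-appends the class column.
    """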
    from sklearn.decomposition import PCA

    pca = PCA(n_components=10, svd_solver='full')
    transformedDataset = pca.fit(dataset[:, :-1]).transform(dataset[:, :-1])
    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset


# Prints a message to let the user know the module was imported
print('feature_preprocessing loaded')


# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys
    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
    PCA(standardization(featureSelection(dataset, featureKeys)))