
Init data preprocessing

Branch: master
Apostolos Fanakis committed 6 years ago
Parent commit: f2d288d0a4
2 changed files with 140 additions and 0 deletions:
  1. classifier/preprocessing/README.md (+14)
  2. classifier/preprocessing/data_preprocessing.py (+126)

classifier/preprocessing/README.md (+14)

@@ -0,0 +1,14 @@
# Data preprocessing
`data_preprocessing.py` is a Python module that uses the open-source library [scikit-learn](https://scikit-learn.org/stable/) to apply several preprocessing techniques to the previously extracted data.
The module can be imported, or executed as a script using one of the following commands:
`python data_preprocessing.py <music_data_directory> <speech_data_directory>`
or
`python3 data_preprocessing.py <music_data_directory> <speech_data_directory>`
**Dependencies:**
- scikit-learn
- numpy
Both dependencies are available for Python 2 and Python 3 and can be installed with `pip install <package_name>` or `pip3 install <package_name>` respectively.
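
When imported instead of run as a script, the same steps can be driven manually. A minimal sketch, assuming the module is on the import path; the directory paths (which must end in a slash) are placeholders:

```python
import data_preprocessing as dp

# Build one labeled dataset from the two feature directories
dataset, featureKeys = dp.createSingleFeaturesArray('features/music/', 'features/speech/')

# Apply the same chain the script runs
dataset = dp.featureSelection(dataset, featureKeys)
dataset = dp.standardization(dataset)
dataset = dp.PCA(dataset)
```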

classifier/preprocessing/data_preprocessing.py (+126)

@@ -0,0 +1,126 @@
# print_function lets the module run under Python 2 as well as Python 3,
# as the README suggests
from __future__ import print_function

from os import listdir
from os.path import isfile, join
import numpy as np
import json


def arrayFromJSONs(JSONPath):
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)

    keys = np.array([])
    values = np.array([])
    for featureKey, featureValues in rawJSON.items():
        if keys.size == 0 or values.size == 0:
            keys = np.array(featureKey)
            values = np.array(featureValues)
        else:
            keys = np.append(keys, np.array(featureKey))
            values = np.vstack((values, np.array(featureValues)))

    # Transposes so that rows correspond to frames and columns to features
    values = np.transpose(values)
    return keys, values

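# For reference, a hypothetical example of the JSON layout this parser
# expects: one key per feature, one list entry per analysis frame (the
# feature names and numbers are illustrative only, not from the
# extraction step itself):
#
#   {
#       "zcr": [0.11, 0.09, 0.13],
#       "energy": [0.52, 0.47, 0.49]
#   }
#
# arrayFromJSONs then returns keys ['zcr', 'energy'] and a 3x2 values
# matrix with rows as frames and columns as features.
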
def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
    dataset = np.array([])
    featureKeys = np.array([])

    # Reads the extracted features for the music class
    featuresFiles = [file for file in listdir(musicJSONsPath) if isfile(join(musicJSONsPath, file))]
    for file in featuresFiles:
        if dataset.size == 0:
            # Gets feature arrays
            featureKeys, musicFeatures = arrayFromJSONs(musicJSONsPath + file)
            # Appends the class to the arrays (0 for music, 1 for speech)
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.copy(musicFeatures)
        else:
            # Gets feature arrays
            musicFeatures = arrayFromJSONs(musicJSONsPath + file)[1]
            # Appends the class to the arrays (0 for music, 1 for speech);
            # the class column is rebuilt here because each file may hold
            # a different number of frames
            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
            musicFeatures = np.c_[musicFeatures, musicClass]
            dataset = np.vstack((dataset, musicFeatures))

    # Reads the extracted features for the speech class
    featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
    for file in featuresFiles:
        # Gets feature arrays
        speechFeatures = arrayFromJSONs(speechJSONsPath + file)[1]
        # Appends the class to the arrays (0 for music, 1 for speech)
        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
        speechFeatures = np.c_[speechFeatures, speechClass]
        dataset = np.vstack((dataset, speechFeatures))

    return dataset, featureKeys

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html
def featureSelection(dataset, featureKeys):
    # Selects features based on a variance threshold
    from sklearn.feature_selection import VarianceThreshold

    varianceThreshold = 0.72
    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
    varReducedDataset = selector.fit_transform(dataset)

    # Reports which features survived the threshold; the guard on
    # featureKeys.size skips the class column at the end of the dataset
    isRetained = selector.get_support()
    print('Retaining features:')
    for index, retain in enumerate(isRetained):
        if retain and index < featureKeys.size:
            print(featureKeys[index], end='\t')
    print('\n\nRemoving features:')
    for index, retain in enumerate(isRetained):
        if not retain and index < featureKeys.size:
            print(featureKeys[index], end='\t')
    print('\n')

    # Selects features based on univariate statistical tests
    from sklearn.feature_selection import SelectPercentile, mutual_info_regression
    perReducedDataset = SelectPercentile(mutual_info_regression,
                                         percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
    from sklearn import preprocessing

    # Standardization (zero mean, unit variance per feature)
    scaledDataset = preprocessing.scale(dataset[:, :-1])
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # Normalization (unit L2 norm per sample); note that this is computed
    # from the raw features and overwrites the standardized version above,
    # as the two candidates are still being evaluated
    scaledDataset = preprocessing.normalize(dataset[:, :-1], norm='l2')
    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset):
    from sklearn.decomposition import PCA

    pca = PCA(n_components=10, svd_solver='full')
    transformedDataset = pca.fit(dataset[:, :-1]).transform(dataset[:, :-1])
    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset

# Prints a message to let the user know the module was imported
print('data_preprocessing loaded')

# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys

    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
    PCA(standardization(featureSelection(dataset, featureKeys)))
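
The TODOs above leave `n_components` and the other parameters open. One way to settle the PCA size is to inspect the cumulative explained variance; a minimal sketch, not part of the commit (`explainedVariance` is a hypothetical helper name):

```python
import numpy as np
from sklearn.decomposition import PCA

def explainedVariance(features):
    # Fit a full-rank PCA and report how much variance each added
    # component accumulates; pick n_components at the desired cutoff
    pca = PCA(svd_solver='full')
    pca.fit(features)
    return np.cumsum(pca.explained_variance_ratio_)
```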