Browse Source

Data preprocessing minor fixes

master
Apostolos Fanakis 6 years ago
parent
commit
bd99c5d734
No known key found for this signature in database. GPG Key ID: 56CE2DEDE9F1FB78
  1. 92
      classifier/preprocessing/data_preprocessing.py
  2. BIN
      classifier/preprocessing/dataset.npy
  3. BIN
      classifier/preprocessing/featureKeys.npy
  4. BIN
      classifier/preprocessing/target.npy

92
classifier/preprocessing/data_preprocessing.py

@ -3,7 +3,14 @@ from os.path import isfile, join
import numpy as np import numpy as np
import json import json
def arrayFromJSONs(JSONPath): class bcolors:
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
ENDC = '\033[0m'
def arrayFromJSON(JSONPath):
with open(JSONPath) as jsonFile: with open(JSONPath) as jsonFile:
rawJSON = json.load(jsonFile) rawJSON = json.load(jsonFile)
@ -21,6 +28,7 @@ def arrayFromJSONs(JSONPath):
return keys, values return keys, values
def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath): def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
print(bcolors.YELLOW + 'Creating single features array' + bcolors.ENDC)
dataset = np.array([]) dataset = np.array([])
featureKeys = np.array([]) featureKeys = np.array([])
@ -29,47 +37,47 @@ def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
for file in featuresFiles: for file in featuresFiles:
if dataset.size == 0: if dataset.size == 0:
# Gets feature arrays # Gets feature arrays
featureKeys, musicFeatures = arrayFromJSONs(musicJSONsPath + file) featureKeys, musicFeatures = arrayFromJSON(musicJSONsPath + file)
# Appends the class to the arrays (0 for music, 1 for speech) # Initializes dataset array
musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
musicFeatures = np.c_[musicFeatures, musicClass]
dataset = np.copy(musicFeatures) dataset = np.copy(musicFeatures)
else: else:
# Gets feature arrays # Gets feature arrays
musicFeatures = arrayFromJSONs(musicJSONsPath + file)[1] musicFeatures = arrayFromJSON(musicJSONsPath + file)[1]
# Appends the class to the arrays (0 for music, 1 for speech)
musicFeatures = np.c_[musicFeatures, musicClass]
dataset = np.vstack((dataset, musicFeatures)) dataset = np.vstack((dataset, musicFeatures))
# Initializes target array (0 for music)
target = np.zeros((dataset.shape[0]), dtype=int)
# Reads the extracted features for the speech class # Reads the extracted features for the speech class
featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))] featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
for file in featuresFiles: for file in featuresFiles:
# Gets feature arrays # Gets feature arrays
speechFeatures = arrayFromJSONs(speechJSONsPath + file)[1] speechFeatures = arrayFromJSON(speechJSONsPath + file)[1]
# Appends the class to the arrays (0 for music, 1 for speech)
speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
speechFeatures = np.c_[speechFeatures, speechClass]
dataset = np.vstack((dataset, speechFeatures)) dataset = np.vstack((dataset, speechFeatures))
return dataset, featureKeys # Appends the new class to the target array (1 for speech)
target = np.hstack((target, np.ones((dataset.shape[0] - target.size), dtype=int)))
return dataset, target, featureKeys
# Details about this part can be found in the link below: # Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html # https://scikit-learn.org/stable/modules/feature_selection.html
def featureSelection(dataset, featureKeys): def featureSelection(dataset, target, featureKeys):
# Selects features based on a variance threshold # Selects features based on a variance threshold
from sklearn.feature_selection import VarianceThreshold from sklearn.feature_selection import VarianceThreshold
varianceThreshold = 0.72 print(bcolors.YELLOW + 'Running variance threshold feature selection' + bcolors.ENDC)
varianceThreshold = 0.1
selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold))) selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold)))
varReducedDataset = selector.fit_transform(dataset) varReducedDataset = selector.fit_transform(dataset)
isRetained = selector.get_support() isRetained = selector.get_support()
print('Retaining features:') print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
for index, retain in enumerate(isRetained): for index, retain in enumerate(isRetained):
if retain and index < featureKeys.size: if retain and index < featureKeys.size:
print(featureKeys[index], end='\t', flush=True) print(featureKeys[index], end='\t', flush=True)
print('\n\nRemoving features:') print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
for index, retain in enumerate(isRetained): for index, retain in enumerate(isRetained):
if not retain and index < featureKeys.size: if not retain and index < featureKeys.size:
print(featureKeys[index], end='\t', flush=True) print(featureKeys[index], end='\t', flush=True)
@ -77,10 +85,30 @@ def featureSelection(dataset, featureKeys):
# Selects features based on univariate statistical tests # Selects features based on univariate statistical tests
from sklearn.datasets import load_digits from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, mutual_info_regression from sklearn.feature_selection import SelectPercentile, mutual_info_classif
print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
perReducedDataset = percentileSelector.fit_transform(dataset, target)
isRetained = percentileSelector.get_support()
print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
for index, score in enumerate(percentileSelector.scores_):
print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)
if index%2:
print('')
print('')
print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
for index, retain in enumerate(isRetained):
if retain and index < featureKeys.size:
print(featureKeys[index], end='\t', flush=True)
perReducedDataset = SelectPercentile(mutual_info_regression, print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1]) for index, retain in enumerate(isRetained):
if not retain and index < featureKeys.size:
print(featureKeys[index], end='\t', flush=True)
print('\n')
# TODO: change the return value after the values of the parameters are decided # TODO: change the return value after the values of the parameters are decided
# and the feature selection is complete # and the feature selection is complete
@ -91,13 +119,13 @@ def featureSelection(dataset, featureKeys):
def standardization(dataset): def standardization(dataset):
from sklearn import preprocessing from sklearn import preprocessing
print(bcolors.YELLOW + 'Running standardization' + bcolors.ENDC)
# Standardization # Standardization
scaledDataset = preprocessing.scale(dataset[:, :-1]) scaledDataset = preprocessing.scale(dataset)
scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
print(bcolors.YELLOW + 'Running normalization' + bcolors.ENDC)
# Normalization # Normalization
scaledDataset = preprocessing.normalize(dataset[:, :-1], norm='l2') normalizedDataset = preprocessing.normalize(dataset, norm='l2')
scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
# TODO: change the return value after the values of the parameters are decided # TODO: change the return value after the values of the parameters are decided
# and the feature selection is complete # and the feature selection is complete
@ -108,19 +136,23 @@ def standardization(dataset):
def PCA(dataset): def PCA(dataset):
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
pca = PCA(n_components=10,svd_solver='full') print(bcolors.YELLOW + 'Running PCA' + bcolors.ENDC)
transformedDataset = pca.fit(dataset[:, :-1]).transform(dataset[:, :-1]) pca = PCA(n_components=10, svd_solver='full')
transformedDataset = np.c_[transformedDataset, dataset[:, -1]] transformedDataset = pca.fit(dataset).transform(dataset)
# TODO: change the return value after the values of the parameters are decided # TODO: change the return value after the values of the parameters are decided
# and the feature selection is complete # and the feature selection is complete
return dataset return dataset
# Prints a nice message to let the user know the module was imported # Prints a nice message to let the user know the module was imported
print('feature_preprocessing loaded') print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)
# Enables executing the module as a standalone script # Enables executing the module as a standalone script
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2]) dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
PCA(standardization(featureSelection(dataset, featureKeys))) PCA(standardization(featureSelection(dataset, target, featureKeys)))
print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)
np.save('dataset.npy', dataset)
np.save('target.npy', target)
np.save('featureKeys.npy', featureKeys)

BIN
classifier/preprocessing/dataset.npy

Binary file not shown.

BIN
classifier/preprocessing/featureKeys.npy

Binary file not shown.

BIN
classifier/preprocessing/target.npy

Binary file not shown.
Loading…
Cancel
Save