You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
158 lines
5.8 KiB
158 lines
5.8 KiB
from os import listdir
|
|
from os.path import isfile, join
|
|
import numpy as np
|
|
import json
|
|
|
|
class bcolors:
    """ANSI escape sequences used to colour this module's console output."""

    # Foreground colours
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Appended after a coloured message to reset the terminal attributes
    ENDC = '\033[0m'
|
|
|
|
def arrayFromJSON(JSONPath):
    """Load a features JSON file into a key array and a value array.

    The file must contain a JSON object mapping each feature key to that
    feature's values.

    Args:
        JSONPath: Path of the JSON file to read.

    Returns:
        A ``(keys, values)`` tuple. ``keys`` is a 1-D array holding the
        feature keys in file order. ``values`` is 2-D with one column per
        feature (the transpose of the row-per-feature layout in the file).
        Both are empty 1-D arrays when the JSON object has no entries.

    Raises:
        ValueError: If the features cannot be stacked because their value
            counts differ (raised by ``np.vstack``).
    """
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)

    # Nothing to stack: keep returning the empty arrays callers test with .size
    if not rawJSON:
        return np.array([]), np.array([])

    # Building each array in a single call, rather than growing it feature by
    # feature, keeps `keys` 1-D and `values` 2-D even for a one-feature file
    # and avoids re-copying the accumulated data on every iteration.
    keys = np.array(list(rawJSON.keys()))
    values = np.vstack([np.array(featureValues) for featureValues in rawJSON.values()])

    return keys, np.transpose(values)
|
|
|
|
def _readFeaturesDirectory(JSONsPath):
    """Read every features JSON file directly inside JSONsPath.

    Returns a ``(keys, blocks)`` tuple: the feature keys of the first file
    that yielded data and a list with one 2-D feature array per such file.
    """
    keys = np.array([])
    blocks = []
    for fileName in listdir(JSONsPath):
        # The same joined path is used for the check and for reading, so the
        # directory does not need to end with a path separator.
        filePath = join(JSONsPath, fileName)
        if not isfile(filePath):
            continue
        fileKeys, fileFeatures = arrayFromJSON(filePath)
        # A file without any feature values contributes no rows
        if fileFeatures.size == 0:
            continue
        if not blocks:
            keys = fileKeys
        blocks.append(np.atleast_2d(fileFeatures))
    return keys, blocks


def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
    """Merge the per-file features of both classes into one labelled dataset.

    Args:
        musicJSONsPath: Directory holding the features JSON files of the
            music class.
        speechJSONsPath: Directory holding the features JSON files of the
            speech class.

    Returns:
        A ``(dataset, target, featureKeys)`` tuple. ``dataset`` stacks the
        rows of every music file followed by the rows of every speech file.
        ``target`` is an int array with one label per dataset row (0 for
        music, 1 for speech). ``featureKeys`` holds the feature keys of the
        first file that was read. All three are empty arrays when neither
        directory yields any features.

    Raises:
        ValueError: If the files do not all have the same number of feature
            columns (raised by ``np.vstack``).
    """
    print(bcolors.YELLOW + 'Creating single features array' + bcolors.ENDC)

    # Reads the extracted features for the music class
    featureKeys, musicBlocks = _readFeaturesDirectory(musicJSONsPath)
    # Reads the extracted features for the speech class
    speechKeys, speechBlocks = _readFeaturesDirectory(speechJSONsPath)

    # Without any music data the keys have to come from the speech files
    if not musicBlocks:
        featureKeys = speechKeys

    allBlocks = musicBlocks + speechBlocks
    if not allBlocks:
        return np.array([]), np.array([], dtype=int), featureKeys

    # Stacks everything once instead of re-copying the dataset per file
    dataset = np.vstack(allBlocks)

    # Builds the target array (0 for music, 1 for speech)
    musicRows = sum(block.shape[0] for block in musicBlocks)
    target = np.hstack((
        np.zeros(musicRows, dtype=int),
        np.ones(dataset.shape[0] - musicRows, dtype=int),
    ))

    return dataset, target, featureKeys
|
|
|
|
# Details about this part can be found in the link below:
|
|
# https://scikit-learn.org/stable/modules/feature_selection.html
|
|
def _printFeatureSupport(featureKeys, isRetained):
    """Print the keys of the retained features, then those of the removed ones."""
    # Indices without a matching key are left out of both lists
    namedSupport = [
        (featureKeys[index], retain)
        for index, retain in enumerate(isRetained)
        if index < featureKeys.size
    ]

    print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
    print(''.join(str(key) + '\t' for key, retain in namedSupport if retain), end='')

    print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
    print(''.join(str(key) + '\t' for key, retain in namedSupport if not retain), end='')
    print('\n')


def featureSelection(dataset, target, featureKeys):
    """Run two feature-selection passes and report what each would keep.

    First a variance-threshold selector, then a percentile selector scored
    with mutual information, are fitted on the full dataset. For each pass
    the retained and removed feature keys are printed; for the second pass
    the per-feature scores are printed as well.

    Args:
        dataset: 2-D array with one column per feature.
        target: Class label of every dataset row.
        featureKeys: Feature keys, in dataset column order.

    Returns:
        The input ``dataset``, unchanged (see the TODO at the end).
    """
    from sklearn.feature_selection import (
        SelectPercentile,
        VarianceThreshold,
        mutual_info_classif,
    )

    # Makes a 0-d array or a plain sequence of keys safely indexable
    featureKeys = np.atleast_1d(featureKeys)

    # Selects features based on a variance threshold
    print(bcolors.YELLOW + 'Running variance threshold feature selection' + bcolors.ENDC)
    varianceThreshold = 0.1
    selector = VarianceThreshold(threshold=(varianceThreshold * (1 - varianceThreshold)))
    selector.fit(dataset)
    _printFeatureSupport(featureKeys, selector.get_support())

    # Selects features based on univariate statistical tests
    print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
    percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
    percentileSelector.fit(dataset, target)

    print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
    for index, score in enumerate(percentileSelector.scores_):
        # Same bounds guard as the retained/removed listings
        if index < featureKeys.size:
            print(str(featureKeys[index]) + ' => ' + str(score), end='\t\t')
        # Two scores per output line
        if index % 2:
            print('')
    print('')

    _printFeatureSupport(featureKeys, percentileSelector.get_support())

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset
|
|
|
|
# Details about this part can be found in the link below:
|
|
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
|
|
def standardization(dataset):
    """Run standardization and L2 normalization on the dataset.

    Both transformations are computed from the original ``dataset`` and
    their results are currently discarded.

    Args:
        dataset: Array of feature values to transform.

    Returns:
        The input ``dataset``, unchanged (see the TODO at the end).
    """
    from sklearn.preprocessing import normalize, scale

    # Standardization
    print(bcolors.YELLOW + 'Running standardization' + bcolors.ENDC)
    standardizedData = scale(dataset)

    # Normalization (applied to the original data, not the standardized one)
    print(bcolors.YELLOW + 'Running normalization' + bcolors.ENDC)
    unitNormData = normalize(dataset, norm='l2')

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset
|
|
|
|
# Details about this part can be found in the link below:
|
|
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
|
|
def PCA(dataset):
    """Fit a 10-component PCA on the dataset and project the dataset with it.

    The projected data is currently discarded.

    Args:
        dataset: Array of feature values to decompose.

    Returns:
        The input ``dataset``, unchanged (see the TODO at the end).
    """
    # Aliased so the estimator class is not confused with this function
    from sklearn.decomposition import PCA as SklearnPCA

    print(bcolors.YELLOW + 'Running PCA' + bcolors.ENDC)
    decomposer = SklearnPCA(n_components=10, svd_solver='full')
    decomposer.fit(dataset)
    projectedData = decomposer.transform(dataset)

    # TODO: change the return value after the values of the parameters are decided
    # and the feature selection is complete
    return dataset
|
|
|
|
# Prints a nice message to let the user know the module was imported
print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)

# Enables executing the module as a standalone script.
# Usage: <script> <musicJSONsPath> <speechJSONsPath>
if __name__ == "__main__":
    import sys

    # Exits with a usage hint instead of an IndexError when a path is missing
    if len(sys.argv) < 3:
        sys.exit('Usage: ' + sys.argv[0] + ' <musicJSONsPath> <speechJSONsPath>')

    dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])

    # NOTE(review): the result of the preprocessing chain is discarded and the
    # arrays saved below are the ones built above. Each step currently returns
    # its input unchanged (see the TODOs in those functions), so confirm which
    # dataset should be saved once those return values change.
    PCA(standardization(featureSelection(dataset, target, featureKeys)))

    print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)
    # Written to the current working directory
    np.save('dataset.npy', dataset)
    np.save('target.npy', target)
    np.save('featureKeys.npy', featureKeys)
|