|
@@ -3,7 +3,14 @@ from os.path import isfile, join
 import numpy as np
 import json
 
 
-def arrayFromJSONs(JSONPath):
+class bcolors:
+    BLUE = '\033[94m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    RED = '\033[91m'
+    ENDC = '\033[0m'
+
+
+def arrayFromJSON(JSONPath):
     with open(JSONPath) as jsonFile:
         rawJSON = json.load(jsonFile)
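Reviewer note (not part of the patch): the bcolors constants are plain ANSI escape
sequences; each coloured message in this patch wraps its text between a colour code
and ENDC, which resets the terminal colour, e.g.:

    print(bcolors.GREEN + 'done' + bcolors.ENDC)  # prints 'done' in green, then resets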
@@ -21,6 +28,7 @@ def arrayFromJSONs(JSONPath):
     return keys, values
 
 
 def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
+    print(bcolors.YELLOW + 'Creating single features array' + bcolors.ENDC)
     dataset = np.array([])
     featureKeys = np.array([])
@@ -29,47 +37,47 @@ def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
     for file in featuresFiles:
         if dataset.size == 0:
             # Gets feature arrays
-            featureKeys, musicFeatures = arrayFromJSONs(musicJSONsPath + file)
-            # Appends the class to the arrays (0 for music, 1 for speech)
-            musicClass = np.zeros((musicFeatures.shape[0]), dtype=int)
-            musicFeatures = np.c_[musicFeatures, musicClass]
+            featureKeys, musicFeatures = arrayFromJSON(musicJSONsPath + file)
+            # Initializes dataset array
             dataset = np.copy(musicFeatures)
         else:
             # Gets feature arrays
-            musicFeatures = arrayFromJSONs(musicJSONsPath + file)[1]
-            # Appends the class to the arrays (0 for music, 1 for speech)
-            musicFeatures = np.c_[musicFeatures, musicClass]
+            musicFeatures = arrayFromJSON(musicJSONsPath + file)[1]
             dataset = np.vstack((dataset, musicFeatures))
 
+    # Initializes target array (0 for music)
+    target = np.zeros((dataset.shape[0]), dtype=int)
+
     # Reads the extracted features for the speech class
     featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
     for file in featuresFiles:
         # Gets feature arrays
-        speechFeatures = arrayFromJSONs(speechJSONsPath + file)[1]
-        # Appends the class to the arrays (0 for music, 1 for speech)
-        speechClass = np.ones((speechFeatures.shape[0]), dtype=int)
-        speechFeatures = np.c_[speechFeatures, speechClass]
+        speechFeatures = arrayFromJSON(speechJSONsPath + file)[1]
         dataset = np.vstack((dataset, speechFeatures))
 
-    return dataset, featureKeys
+    # Appends the new class to the target array (1 for speech)
+    target = np.hstack((target, np.ones((dataset.shape[0] - target.size), dtype=int)))
+
+    return dataset, target, featureKeys
 
 
 # Details about this part can be found in the link below:
 # https://scikit-learn.org/stable/modules/feature_selection.html
-def featureSelection(dataset, featureKeys):
+def featureSelection(dataset, target, featureKeys):
     # Selects features based on a variance threshold
     from sklearn.feature_selection import VarianceThreshold
 
-    varianceThreshold = 0.72
+    print(bcolors.YELLOW + 'Running variance threshold feature selection' + bcolors.ENDC)
+
+    varianceThreshold = 0.1
     selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold)))
     varReducedDataset = selector.fit_transform(dataset)
     isRetained = selector.get_support()
 
-    print('Retaining features:')
+    print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
     for index, retain in enumerate(isRetained):
         if retain and index < featureKeys.size:
             print(featureKeys[index], end='\t', flush=True)
 
-    print('\n\nRemoving features:')
+    print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
     for index, retain in enumerate(isRetained):
         if not retain and index < featureKeys.size:
             print(featureKeys[index], end='\t', flush=True)
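Reviewer note (not part of the patch): VarianceThreshold with threshold = p * (1 - p)
uses the variance of a Bernoulli(p) variable as the cut-off, so dropping p from 0.72
(cut-off ~0.20) to 0.1 (cut-off 0.09) makes the filter retain more features. A minimal
self-contained sketch of the behaviour:

    import numpy as np
    from sklearn.feature_selection import VarianceThreshold

    # Toy matrix: column 0 is nearly constant, column 1 varies
    X = np.array([[0.00, 1.0],
                  [0.01, 2.0],
                  [0.02, 3.0],
                  [0.00, 4.0]])
    selector = VarianceThreshold(threshold=0.1 * (1 - 0.1))  # cut-off 0.09
    print(selector.fit_transform(X).shape)  # (4, 1): the near-constant column is dropped
    print(selector.get_support())           # [False  True]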
@@ -77,10 +85,30 @@ def featureSelection(dataset, featureKeys):
     # Selects features based on univariate statistical tests
     from sklearn.datasets import load_digits
-    from sklearn.feature_selection import SelectPercentile, mutual_info_regression
+    from sklearn.feature_selection import SelectPercentile, mutual_info_classif
 
-    perReducedDataset = SelectPercentile(mutual_info_regression,
-        percentile=33).fit_transform(dataset[:, :-1], dataset[:, -1])
+    print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
+
+    percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
+    perReducedDataset = percentileSelector.fit_transform(dataset, target)
+    isRetained = percentileSelector.get_support()
+
+    print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
+    for index, score in enumerate(percentileSelector.scores_):
+        print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)
+        if index%2:
+            print('')
+    print('')
+
+    print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
+    for index, retain in enumerate(isRetained):
+        if retain and index < featureKeys.size:
+            print(featureKeys[index], end='\t', flush=True)
+
+    print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
+    for index, retain in enumerate(isRetained):
+        if not retain and index < featureKeys.size:
+            print(featureKeys[index], end='\t', flush=True)
+    print('\n')
 
     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
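Reviewer note (not part of the patch): mutual_info_classif is the right scorer here,
since the music/speech target is a class label rather than a continuous value, and
SelectPercentile(..., percentile=33) keeps the highest-scoring third of the features.
(The load_digits import above looks unused and could probably be dropped.) A toy sketch
with a hypothetical dataset where only one column is informative:

    import numpy as np
    from sklearn.feature_selection import SelectPercentile, mutual_info_classif

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 6))
    y = (X[:, 0] > 0).astype(int)  # only column 0 carries class information

    selector = SelectPercentile(mutual_info_classif, percentile=33)
    Xreduced = selector.fit_transform(X, y)  # keeps the top third of the columns
    print(selector.get_support())            # column 0 should be among the survivors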
@@ -91,13 +119,13 @@ def featureSelection(dataset, featureKeys):
 def standardization(dataset):
     from sklearn import preprocessing
 
+    print(bcolors.YELLOW + 'Running standardization' + bcolors.ENDC)
     # Standardization
-    scaledDataset = preprocessing.scale(dataset[:, :-1])
-    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
+    scaledDataset = preprocessing.scale(dataset)
 
+    print(bcolors.YELLOW + 'Running normalization' + bcolors.ENDC)
     # Normalization
-    scaledDataset = preprocessing.normalize(dataset[:, :-1], norm='l2')
-    scaledDataset = np.c_[scaledDataset, dataset[:, -1]]
+    normalizedDataset = preprocessing.normalize(dataset, norm='l2')
 
     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
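Reviewer note (not part of the patch): these two calls do different things and are not
interchangeable: preprocessing.scale standardizes each column (feature) to zero mean and
unit variance, while preprocessing.normalize rescales each row (sample) to unit L2 norm.
A minimal illustration:

    import numpy as np
    from sklearn import preprocessing

    X = np.array([[1.0, 10.0],
                  [2.0, 20.0],
                  [3.0, 30.0]])
    print(preprocessing.scale(X).mean(axis=0))   # ~[0. 0.]: per-column standardization
    rows = preprocessing.normalize(X, norm='l2')
    print(np.linalg.norm(rows, axis=1))          # [1. 1. 1.]: per-row unit norm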
@@ -108,19 +136,23 @@ def standardization(dataset):
 def PCA(dataset):
     from sklearn.decomposition import PCA
 
-    pca = PCA(n_components=10,svd_solver='full')
-    transformedDataset = pca.fit(dataset[:, :-1]).transform(dataset[:, :-1])
-    transformedDataset = np.c_[transformedDataset, dataset[:, -1]]
+    print(bcolors.YELLOW + 'Running PCA' + bcolors.ENDC)
+    pca = PCA(n_components=10, svd_solver='full')
+    transformedDataset = pca.fit(dataset).transform(dataset)
 
     # TODO: change the return value after the values of the parameters are decided
     # and the feature selection is complete
     return dataset
 
 
 # Prints a nice message to let the user know the module was imported
-print('feature_preprocessing loaded')
+print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)
 
 
 # Enables executing the module as a standalone script
 if __name__ == "__main__":
     import sys
-    dataset, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
-    PCA(standardization(featureSelection(dataset, featureKeys)))
+    dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
+    PCA(standardization(featureSelection(dataset, target, featureKeys)))
+
+    print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)
+    np.save('dataset.npy', dataset)
+    np.save('target.npy', target)
+    np.save('featureKeys.npy', featureKeys)
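
Reviewer note (not part of the patch): run as a script (the module is presumably saved
as feature_preprocessing.py, matching its loaded message), the two positional arguments
are the directories of extracted-feature JSON files; since they are concatenated directly
with the file names (musicJSONsPath + file), they need a trailing slash:

    python feature_preprocessing.py musicJSONs/ speechJSONs/

The three saved arrays can then be reloaded without re-parsing the JSON files:

    import numpy as np

    dataset = np.load('dataset.npy')          # shape (n_samples, n_features)
    target = np.load('target.npy')            # 0 = music, 1 = speech
    featureKeys = np.load('featureKeys.npy')  # feature names, aligned with the columns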