Speech/Music classification of audio files using machine learning techniques.
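The module below can also be run as a standalone script (a sketch: the file name feature_preprocessing.py is inferred from the module's own load message, and the directory paths are placeholders for the music and speech feature JSON folders):

    python feature_preprocessing.py features/music/ features/speech/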
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import json

# ANSI escape codes used to colour the module's status messages
class bcolors:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'

# Loads one features JSON and returns the feature names plus a 2-D array
# with one row per analysis frame and one column per feature
def arrayFromJSON(JSONPath):
    with open(JSONPath) as jsonFile:
        rawJSON = json.load(jsonFile)
    keys = np.array([])
    values = np.array([])
    for featureKey, featureValues in rawJSON.items():
        if keys.size == 0 or values.size == 0:
            keys = np.array([featureKey])
            values = np.array(featureValues)
        else:
            keys = np.append(keys, featureKey)
            values = np.vstack((values, np.array(featureValues)))
    # Features arrive as one row per feature; transpose to one row per frame
    values = np.transpose(values)
    return keys, values
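
# A minimal sketch of the JSON layout arrayFromJSON expects: one entry per
# feature, each holding one value per analysis frame. The feature names
# below are hypothetical, not taken from the actual extractor:
#
#   {"zcr": [0.10, 0.12, 0.08], "centroid": [1510.2, 1483.7, 1502.9]}
#
# For that file the function returns keys == ['zcr', 'centroid'] and a
# values array of shape (3, 2): one row per frame, one column per feature.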

def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
    print(bcolors.YELLOW + 'Creating single features array' + bcolors.ENDC)
    dataset = np.array([])
    featureKeys = np.array([])
    # Reads the extracted features for the music class
    featuresFiles = [file for file in listdir(musicJSONsPath) if isfile(join(musicJSONsPath, file))]
    for file in featuresFiles:
        if dataset.size == 0:
            # Gets the feature names and values from the first file
            featureKeys, musicFeatures = arrayFromJSON(join(musicJSONsPath, file))
            # Initializes dataset array
            dataset = np.copy(musicFeatures)
        else:
            # Gets only the feature values; the names are already known
            musicFeatures = arrayFromJSON(join(musicJSONsPath, file))[1]
            dataset = np.vstack((dataset, musicFeatures))
    # Initializes target array (0 for music)
    target = np.zeros((dataset.shape[0]), dtype=int)
    # Reads the extracted features for the speech class
    featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]
    for file in featuresFiles:
        # Gets feature arrays
        speechFeatures = arrayFromJSON(join(speechJSONsPath, file))[1]
        dataset = np.vstack((dataset, speechFeatures))
        # Appends the new class to the target array (1 for speech)
        target = np.hstack((target, np.ones((dataset.shape[0] - target.size), dtype=int)))
    return dataset, target, featureKeys
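
# For example (hypothetical sizes): with three music files of 100 frames each
# and two speech files of 50 frames each, dataset ends up with 400 rows and
# target is 300 zeros followed by 100 ones.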

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
    from sklearn import preprocessing
    print(bcolors.YELLOW + 'Running standardization' + bcolors.ENDC)
    # Standardization: rescales every feature column to zero mean and unit variance
    scaledDataset = preprocessing.scale(np.float64(dataset))
    return scaledDataset
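
# A worked example of the scaling above (assuming the default parameters of
# preprocessing.scale): each column is centred and divided by its std, so
#   preprocessing.scale(np.float64([[1, 2], [3, 4]]))
# yields [[-1, -1], [1, 1]].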

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset):
    from sklearn.decomposition import PCA
    print(bcolors.YELLOW + 'Running PCA' + bcolors.ENDC)
    # Projects the features onto the 10 principal components; whitening gives
    # each component unit variance
    pca = PCA(n_components=10, svd_solver='full', whiten=True)
    transformedDataset = pca.fit(dataset).transform(dataset)
    return pca, transformedDataset
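
# A quick sanity check, not part of the original pipeline: the fitted object
# exposes how much variance the 10 components retain, e.g.
#   pca, reduced = PCA(scaledDataset)
#   print(pca.explained_variance_ratio_.sum())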
# Prints a nice message to let the user know the module was imported
print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)

# Enables executing the module as a standalone script
if __name__ == "__main__":
    import sys
    dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])
    scaledDataset = standardization(dataset)
    print(bcolors.GREEN + 'Saving scaled results to file' + bcolors.ENDC)
    datasetFrame = pd.DataFrame(scaledDataset, columns=featureKeys)
    datasetFrame = datasetFrame.assign(target=target)
    datasetFrame.to_pickle("./dataset.pkl")
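
# A minimal sketch of consuming the saved pickle downstream; only the
# "target" column name and the file path come from the code above:
#   frame = pd.read_pickle("./dataset.pkl")
#   X = frame.drop(columns="target").values
#   y = frame["target"].values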