# the-assignment/classifier/preprocessing/data_preprocessing.py

from os import listdir
from os.path import isfile, join
import numpy as np
import json

class bcolors:
	"""ANSI escape sequences used to color the messages printed by this module."""

	# Reset sequence, appended after every colored message
	ENDC = '\033[0m'

	# Color sequences (SGR codes 91 to 94)
	RED = '\033[91m'
	GREEN = '\033[92m'
	YELLOW = '\033[93m'
	BLUE = '\033[94m'

def arrayFromJSON(JSONPath):
	"""Loads a features JSON file into an array of keys and a 2-D array of values.

	The JSON root is expected to be an object mapping each feature name to the
	list of its values (presumably one value per analysed frame -- TODO confirm
	against the feature extractor).

	Args:
		JSONPath: path of the JSON file to read.

	Returns:
		A tuple (keys, values). keys is a 1-D array holding the feature names
		in file order. values is a 2-D array with one column per feature: the
		per-feature value lists are stacked as rows and then transposed. Both
		are empty arrays when the JSON object has no entries.

	Raises:
		ValueError: if the features do not all have the same number of values
			(raised by np.vstack).
	"""
	with open(JSONPath, encoding='utf-8') as jsonFile:
		rawJSON = json.load(jsonFile)

	if not rawJSON:
		# Nothing to stack; np.vstack rejects an empty sequence
		return np.array([]), np.array([])

	# Builds both arrays in a single step instead of growing them one feature
	# at a time. This keeps keys 1-D and values 2-D even when the file holds a
	# single feature, so callers can always index keys and stack values.
	keys = np.array(list(rawJSON))
	values = np.vstack([np.array(featureValues) for featureValues in rawJSON.values()])

	return keys, np.transpose(values)

def _loadFeaturesFromDirectory(JSONsPath):
	"""Reads every features JSON file located directly inside JSONsPath.

	Returns a tuple (featureKeys, featureArrays): the keys of the first file
	read (None when the directory holds no files) and a list with the values
	array of each file, in sorted file name order.
	"""
	featureKeys = None
	featureArrays = []
	# listdir gives no ordering guarantee; sorting keeps the row order of the
	# dataset reproducible between runs and machines
	for fileName in sorted(listdir(JSONsPath)):
		# join is used both for the check and for the read, so the directory
		# path works with or without a trailing separator
		filePath = join(JSONsPath, fileName)
		if not isfile(filePath):
			continue
		keys, values = arrayFromJSON(filePath)
		if featureKeys is None:
			featureKeys = keys
		featureArrays.append(values)
	return featureKeys, featureArrays

def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):
	"""Merges the extracted features of both classes into a single dataset.

	Args:
		musicJSONsPath: directory holding the features JSON files of the music class.
		speechJSONsPath: directory holding the features JSON files of the speech class.

	Returns:
		A tuple (dataset, target, featureKeys). dataset stacks the rows of all
		the music files followed by the rows of all the speech files. target
		holds one integer label per dataset row (0 for music, 1 for speech).
		featureKeys holds the keys of the first file read (music first). All
		three are empty arrays when neither directory contains a file.

	Raises:
		ValueError: if the files do not all have the same number of columns
			(raised by np.vstack).
	"""
	print(bcolors.YELLOW + 'Creating single features array' + bcolors.ENDC)

	musicKeys, musicArrays = _loadFeaturesFromDirectory(musicJSONsPath)
	speechKeys, speechArrays = _loadFeaturesFromDirectory(speechJSONsPath)

	allArrays = musicArrays + speechArrays
	if not allArrays:
		return np.array([]), np.array([], dtype=int), np.array([])

	# Stacks everything once; music rows come first so the labels line up
	dataset = np.vstack(allArrays)

	# np.vstack treats a 1-D array as a single row, hence atleast_2d here
	musicRows = sum(np.atleast_2d(features).shape[0] for features in musicArrays)
	speechRows = dataset.shape[0] - musicRows
	target = np.hstack((np.zeros(musicRows, dtype=int), np.ones(speechRows, dtype=int)))

	featureKeys = musicKeys if musicKeys is not None else speechKeys
	return dataset, target, featureKeys

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/feature_selection.html
def _printFeaturesByMask(featureKeys, isRetained, wanted):
	"""Prints, tab separated, the keys whose flag in isRetained equals wanted."""
	for index, retain in enumerate(isRetained):
		# The bounds check skips columns that have no matching key
		if bool(retain) == wanted and index < featureKeys.size:
			print(featureKeys[index], end='\t', flush=True)

def featureSelection(dataset, target, featureKeys):
	"""Runs two feature selection methods and reports what each one would keep.

	First a variance threshold selector, then a percentile selector scored with
	mutual information. Both are only used for reporting at the moment: the
	dataset is returned unchanged (see the TODO at the end).

	Args:
		dataset: 2-D array of samples (rows) by features (columns).
		target: class label of every row of dataset.
		featureKeys: array with the name of every column of dataset.

	Returns:
		The dataset argument, unmodified.
	"""
	from sklearn.feature_selection import SelectPercentile, VarianceThreshold, mutual_info_classif

	# Selects features based on a variance threshold
	print(bcolors.YELLOW + 'Running variance threshold feature selection' + bcolors.ENDC)
	# The threshold is p * (1 - p) with p = 0.1, the form the linked guide uses
	# for boolean features -- NOTE(review): confirm it suits these features
	varianceThreshold = 0.1
	selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold)))
	# Kept for the pending TODO; only the support mask is used for now
	varReducedDataset = selector.fit_transform(dataset)
	isRetained = selector.get_support()

	print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
	_printFeaturesByMask(featureKeys, isRetained, True)

	print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
	_printFeaturesByMask(featureKeys, isRetained, False)
	print('\n')

	# Selects features based on univariate statistical tests
	print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)
	percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)
	# Kept for the pending TODO; only the scores and the mask are used for now
	perReducedDataset = percentileSelector.fit_transform(dataset, target)
	isRetained = percentileSelector.get_support()

	print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)
	for index, score in enumerate(percentileSelector.scores_):
		# Falls back to the column index when there is no key for the column,
		# instead of raising IndexError like an unguarded lookup would
		label = featureKeys[index] if index < featureKeys.size else index
		print(str(label) + ' => ' + str(score), end='\t\t', flush=True)
		# Two scores per output line
		if index%2:
			print('')
	print('')

	print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)
	_printFeaturesByMask(featureKeys, isRetained, True)

	print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)
	_printFeaturesByMask(featureKeys, isRetained, False)
	print('\n')

	# TODO: change the return value after the values of the parameters are decided
	# and the feature selection is complete
	return dataset

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
def standardization(dataset):
	"""Runs scikit-learn scaling and L2 normalization on dataset.

	Both results are currently discarded and the input is returned as is; the
	calls are placeholders until the preprocessing parameters are decided.

	Args:
		dataset: the array passed to sklearn.preprocessing.scale and normalize.

	Returns:
		The dataset argument, unmodified.
	"""
	from sklearn.preprocessing import normalize, scale

	# Standardization
	print(''.join((bcolors.YELLOW, 'Running standardization', bcolors.ENDC)))
	standardized = scale(dataset)

	# Normalization (applied to the raw dataset, not to the scaled one)
	print(''.join((bcolors.YELLOW, 'Running normalization', bcolors.ENDC)))
	unitNormRows = normalize(dataset, norm='l2')

	# TODO: change the return value after the values of the parameters are decided
	# and the feature selection is complete
	return dataset

# Details about this part can be found in the link below:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
def PCA(dataset):
	"""Fits a 10-component scikit-learn PCA on dataset and projects it.

	The projection is currently discarded and the input is returned as is; the
	call is a placeholder until the preprocessing parameters are decided.

	Args:
		dataset: the array the PCA model is fitted on and applied to.

	Returns:
		The dataset argument, unmodified.
	"""
	# Module import, so the scikit-learn class does not shadow this function's name
	from sklearn import decomposition

	print(''.join((bcolors.YELLOW, 'Running PCA', bcolors.ENDC)))
	model = decomposition.PCA(n_components=10, svd_solver='full')
	model.fit(dataset)
	projected = model.transform(dataset)

	# TODO: change the return value after the values of the parameters are decided
	# and the feature selection is complete
	return dataset

# Prints a nice message to let the user know the module was imported
print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)

# Enables executing the module as a standalone script:
#   <script> <musicJSONsPath> <speechJSONsPath>
if __name__ == "__main__":
	import sys

	# Exits with a usage message instead of a bare IndexError when the two
	# directory arguments are missing
	if len(sys.argv) < 3:
		sys.exit('Usage: ' + sys.argv[0] + ' <musicJSONsPath> <speechJSONsPath>')

	dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])

	# NOTE(review): the result of the preprocessing chain is discarded, so the
	# files below hold the unprocessed dataset. The three steps return their
	# input unchanged for now; revisit when their TODOs are resolved.
	PCA(standardization(featureSelection(dataset, target, featureKeys)))

	# Written to the current working directory
	print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)
	np.save('dataset.npy', dataset)
	np.save('target.npy', target)
	np.save('featureKeys.npy', featureKeys)
Init data preprocessing 6 years ago			`from os import listdir`
			`from os.path import isfile, join`
			`import numpy as np`
			`import json`

Data preprocessing minor fixes 6 years ago			`class bcolors:`
			`BLUE = '\033[94m'`
			`GREEN = '\033[92m'`
			`YELLOW = '\033[93m'`
			`RED = '\033[91m'`
			`ENDC = '\033[0m'`

			`def arrayFromJSON(JSONPath):`
Init data preprocessing 6 years ago			`with open(JSONPath) as jsonFile:`
			`rawJSON = json.load(jsonFile)`

			`keys = np.array([])`
			`values = np.array([])`
			`for featureKey, featureValues in rawJSON.items():`
			`if keys.size == 0 or values.size == 0:`
			`keys = np.array(featureKey)`
			`values = np.array(featureValues)`
			`else:`
			`keys = np.append(keys, (np.array(featureKey)))`
			`values = np.vstack((values, np.array(featureValues)))`

			`values = np.transpose(values)`
			`return keys, values`

			`def createSingleFeaturesArray(musicJSONsPath, speechJSONsPath):`
Data preprocessing minor fixes 6 years ago			`print(bcolors.YELLOW + 'Creating single features array' + bcolors.ENDC)`
Init data preprocessing 6 years ago			`dataset = np.array([])`
			`featureKeys = np.array([])`

			`# Reads the extracted features for the music class`
			`featuresFiles = [file for file in listdir(musicJSONsPath) if isfile(join(musicJSONsPath, file))]`
			`for file in featuresFiles:`
			`if dataset.size == 0:`
			`# Gets feature arrays`
Data preprocessing minor fixes 6 years ago			`featureKeys, musicFeatures = arrayFromJSON(musicJSONsPath + file)`
			`# Initializes dataset array`
Init data preprocessing 6 years ago			`dataset = np.copy(musicFeatures)`
			`else:`
			`# Gets feature arrays`
Data preprocessing minor fixes 6 years ago			`musicFeatures = arrayFromJSON(musicJSONsPath + file)[1]`
Init data preprocessing 6 years ago			`dataset = np.vstack((dataset, musicFeatures))`

Data preprocessing minor fixes 6 years ago			`# Initializes target array (0 for music)`
			`target = np.zeros((dataset.shape[0]), dtype=int)`

Init data preprocessing 6 years ago			`# Reads the extracted features for the speech class`
			`featuresFiles = [file for file in listdir(speechJSONsPath) if isfile(join(speechJSONsPath, file))]`
			`for file in featuresFiles:`
			`# Gets feature arrays`
Data preprocessing minor fixes 6 years ago			`speechFeatures = arrayFromJSON(speechJSONsPath + file)[1]`
Init data preprocessing 6 years ago			`dataset = np.vstack((dataset, speechFeatures))`

Data preprocessing minor fixes 6 years ago			`# Appends the new class to the target array (1 for speech)`
			`target = np.hstack((target, np.ones((dataset.shape[0] - target.size), dtype=int)))`

			`return dataset, target, featureKeys`
Init data preprocessing 6 years ago
			`# Details about this part can be found in the link bellow:`
			`# https://scikit-learn.org/stable/modules/feature_selection.html`
Data preprocessing minor fixes 6 years ago			`def featureSelection(dataset, target, featureKeys):`
Init data preprocessing 6 years ago			`# Selects features based on a variance threshold`
			`from sklearn.feature_selection import VarianceThreshold`

Data preprocessing minor fixes 6 years ago			`print(bcolors.YELLOW + 'Running variance threshold feature selection' + bcolors.ENDC)`
			`varianceThreshold = 0.1`
Init data preprocessing 6 years ago			`selector = VarianceThreshold(threshold = (varianceThreshold * (1 - varianceThreshold)))`
			`varReducedDataset = selector.fit_transform(dataset)`
			`isRetained = selector.get_support()`

Data preprocessing minor fixes 6 years ago			`print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)`
Init data preprocessing 6 years ago			`for index, retain in enumerate(isRetained):`
			`if retain and index < featureKeys.size:`
			`print(featureKeys[index], end='\t', flush=True)`

Data preprocessing minor fixes 6 years ago			`print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)`
Init data preprocessing 6 years ago			`for index, retain in enumerate(isRetained):`
			`if not retain and index < featureKeys.size:`
			`print(featureKeys[index], end='\t', flush=True)`
			`print('\n')`

			`# Selects features based on univariate statistical tests`
			`from sklearn.datasets import load_digits`
Data preprocessing minor fixes 6 years ago			`from sklearn.feature_selection import SelectPercentile, mutual_info_classif`

			`print(bcolors.YELLOW + 'Running feature selection based on mutual information' + bcolors.ENDC)`
			`percentileSelector = SelectPercentile(mutual_info_classif, percentile=33)`
			`perReducedDataset = percentileSelector.fit_transform(dataset, target)`
			`isRetained = percentileSelector.get_support()`

			`print(bcolors.BLUE + 'Scores of features:' + bcolors.ENDC)`
			`for index, score in enumerate(percentileSelector.scores_):`
			`print(featureKeys[index] + ' => ' + str(score), end='\t\t', flush=True)`
			`if index%2:`
			`print('')`
			`print('')`

			`print(bcolors.GREEN + 'Retaining features:' + bcolors.ENDC)`
			`for index, retain in enumerate(isRetained):`
			`if retain and index < featureKeys.size:`
			`print(featureKeys[index], end='\t', flush=True)`
Init data preprocessing 6 years ago
Data preprocessing minor fixes 6 years ago			`print(bcolors.RED + '\n\nRemoving features:' + bcolors.ENDC)`
			`for index, retain in enumerate(isRetained):`
			`if not retain and index < featureKeys.size:`
			`print(featureKeys[index], end='\t', flush=True)`
			`print('\n')`
Init data preprocessing 6 years ago
			`# TODO: change the return value after the values of the parameters are decided`
			`# and the feature selection is complete`
			`return dataset`

			`# Details about this part can be found in the link bellow:`
			`# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing`
			`def standardization(dataset):`
			`from sklearn import preprocessing`

Data preprocessing minor fixes 6 years ago			`print(bcolors.YELLOW + 'Running standardization' + bcolors.ENDC)`
Init data preprocessing 6 years ago			`# Standardization`
Data preprocessing minor fixes 6 years ago			`scaledDataset = preprocessing.scale(dataset)`
Init data preprocessing 6 years ago
Data preprocessing minor fixes 6 years ago			`print(bcolors.YELLOW + 'Running normalization' + bcolors.ENDC)`
Init data preprocessing 6 years ago			`# Normalization`
Data preprocessing minor fixes 6 years ago			`normalizedDataset = preprocessing.normalize(dataset, norm='l2')`
Init data preprocessing 6 years ago
			`# TODO: change the return value after the values of the parameters are decided`
			`# and the feature selection is complete`
			`return dataset`

			`# Details about this part can be found in the link bellow:`
			`# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA`
			`def PCA(dataset):`
			`from sklearn.decomposition import PCA`

Data preprocessing minor fixes 6 years ago			`print(bcolors.YELLOW + 'Running PCA' + bcolors.ENDC)`
			`pca = PCA(n_components=10, svd_solver='full')`
			`transformedDataset = pca.fit(dataset).transform(dataset)`
Init data preprocessing 6 years ago
			`# TODO: change the return value after the values of the parameters are decided`
			`# and the feature selection is complete`
			`return dataset`

			`# Prints a nice message to let the user know the module was imported`
Data preprocessing minor fixes 6 years ago			`print(bcolors.BLUE + 'feature_preprocessing loaded' + bcolors.ENDC)`
Init data preprocessing 6 years ago
			`# Enables executing the module as a standalone script`
			`if __name__ == "__main__":`
			`import sys`
Data preprocessing minor fixes 6 years ago			`dataset, target, featureKeys = createSingleFeaturesArray(sys.argv[1], sys.argv[2])`
			`PCA(standardization(featureSelection(dataset, target, featureKeys)))`
			`print(bcolors.GREEN + 'Saving results to files' + bcolors.ENDC)`
			`np.save('dataset.npy', dataset)`
			`np.save('target.npy', target)`
			`np.save('featureKeys.npy', featureKeys)`