"""Helper routines for training and evaluating scikit-learn classifiers
(SVM, decision tree, multi-layer perceptron, naive Bayes and tree ensembles)
on a labelled dataset."""

import numpy as np
import pandas as pd


class bcolors:
    """ANSI colour escape codes for terminal output."""

    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'


def simpleTrain(dataset, target, model='all'):
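    """Train the chosen classifier(s) on a single train/test split and return
    the test-set accuracy (the best accuracy across all models when model='all')."""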
    from sklearn.model_selection import train_test_split

    trainingSet, testSet, trainingTarget, testTarget = train_test_split(
        dataset, target, test_size=0.4, random_state=0)

    if model == 'svm' or model == 'all':
        # SVM training
        from sklearn.svm import SVC
        clf = SVC(gamma='scale')
        clf.fit(trainingSet, trainingTarget)
        svmAccuracy = clf.score(testSet, testTarget)

    if model == 'dtree' or model == 'all':
        # Decision tree
        from sklearn import tree
        clf = tree.DecisionTreeClassifier()
        clf.fit(trainingSet, trainingTarget)
        dtreeAccuracy = clf.score(testSet, testTarget)

    if model == 'nn' or model == 'all':
        # Multi-layer Perceptron
        from sklearn.neural_network import MLPClassifier
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=2)
        clf.fit(trainingSet, trainingTarget)
        nnAccuracy = clf.score(testSet, testTarget)

    if model == 'bayes' or model == 'all':
        # Naive Bayes
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
        clf.fit(trainingSet, trainingTarget)
        bayesAccuracy = clf.score(testSet, testTarget)

    if model == 'all':
        return max([svmAccuracy, dtreeAccuracy, nnAccuracy, bayesAccuracy])
    elif model == 'svm':
        return svmAccuracy
    elif model == 'dtree':
        return dtreeAccuracy
    elif model == 'nn':
        return nnAccuracy
    elif model == 'bayes':
        return bayesAccuracy


def randomForest(dataset, target):
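    """Train a random forest on a single train/test split and print its test-set accuracy."""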
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    trainingSet, testSet, trainingTarget, testTarget = train_test_split(
        dataset, target, test_size=0.4, random_state=0)

    clf = RandomForestClassifier(n_estimators=500, criterion='entropy',
                                 n_jobs=-1, random_state=4)
    clf = clf.fit(trainingSet, trainingTarget)
    print("Random forest accuracy: {0:.2f}".format(100 * clf.score(testSet, testTarget)))


def kFCrossValid(dataset, target, model='svm'):
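    """Evaluate the chosen model with 5-fold cross-validation and return a copy
    of the classifier fitted on the fold where it scored the highest accuracy."""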
    from sklearn.model_selection import cross_val_score
    from sklearn import metrics
    from copy import deepcopy

    clf = None

    if model == 'svm':
        # SVM training
        from sklearn.svm import SVC
        clf = SVC(gamma='scale')
    elif model == 'dtree':
        # Decision tree
        from sklearn import tree
        clf = tree.DecisionTreeClassifier()
    elif model == 'nn':
        # Multi-layer Perceptron
        from sklearn.neural_network import MLPClassifier
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=2)
    elif model == 'bayes':
        # Naive Bayes
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
    elif model == 'rndForest':
        # Ensemble of extremely randomised trees
        from sklearn.ensemble import ExtraTreesClassifier
        clf = ExtraTreesClassifier(n_estimators=1500, criterion='entropy',
                                   n_jobs=-1, random_state=4)
    else:
        print('Error: the specified model is not supported')
        return None

    from sklearn.model_selection import KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=2)

    maxAccuracy = 0
    bestClf = None

    for k, (train_index, test_index) in enumerate(kf.split(dataset)):
        kTrainSet, kTestSet = dataset[train_index], dataset[test_index]
        kTrainTarget, kTestTarget = target[train_index], target[test_index]

        clf.fit(kTrainSet, kTrainTarget)
        acc = clf.score(kTestSet, kTestTarget)
        print("[fold {0}], score: {1:.2f}".format(k, 100 * acc))

        # Keep a copy of the classifier from the best-scoring fold
        if acc > maxAccuracy:
            maxAccuracy = acc
            bestClf = deepcopy(clf)

    return bestClf


# Prints a nice message to let the user know the module was imported
print(bcolors.BLUE + 'model_training loaded' + bcolors.ENDC)


# Enables executing the module as a standalone script
if __name__ == "__main__":
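    # Expects: argv[1] = path to a pickled DataFrame with a 'target' column,
    # argv[2] = name of the model to cross-validate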
    import sys

    dataset = pd.read_pickle(sys.argv[1])
    target = dataset.pop('target')

    # Use plain numpy arrays so that KFold's positional indices always line up,
    # whatever index the pickled DataFrame carries
    kFCrossValid(dataset.values, target.values, sys.argv[2])