From 5a1aadb41093c1628f2bbb7b708312291bd75c39 Mon Sep 17 00:00:00 2001
From: Christina Theodoridou
Date: Fri, 30 Nov 2018 12:50:11 +0200
Subject: [PATCH] Some ML algorithms in Python and R

---
 ml algorithms/AlgorithmsInPython.py | 129 ++++++++++++++++++++++++++++
 ml algorithms/AlgorithmsInR.r       |  85 ++++++++++++++++++
 ml algorithms/DataPreprocessing.py  | 108 +++++++++++++++++++++++
 ml algorithms/DataPreprocessing.r   |  83 ++++++++++++++++++
 ml algorithms/LinearRegression.py   |  20 +++++
 tex/cites.bib                       |  16 ++--
 6 files changed, 432 insertions(+), 9 deletions(-)
 create mode 100644 ml algorithms/AlgorithmsInPython.py
 create mode 100644 ml algorithms/AlgorithmsInR.r
 create mode 100644 ml algorithms/DataPreprocessing.py
 create mode 100644 ml algorithms/DataPreprocessing.r
 create mode 100644 ml algorithms/LinearRegression.py

diff --git a/ml algorithms/AlgorithmsInPython.py b/ml algorithms/AlgorithmsInPython.py
new file mode 100644
index 0000000..073bcb7
--- /dev/null
+++ b/ml algorithms/AlgorithmsInPython.py
@@ -0,0 +1,129 @@
+# Linear Regression
+
+# Import Library
+# Import other necessary libraries like pandas, numpy...
+from sklearn import linear_model
+# Load train and test datasets
+# Identify feature and response variable(s); values must be numeric numpy arrays
+x_train = input_variables_values_training_datasets
+y_train = target_variables_values_training_datasets
+x_test = input_variables_values_test_datasets
+# Create linear regression object
+linear = linear_model.LinearRegression()
+# Train the model using the training sets and check the score
+linear.fit(x_train, y_train)
+linear.score(x_train, y_train)
+# Equation coefficients and intercept
+print('Coefficient: \n', linear.coef_)
+print('Intercept: \n', linear.intercept_)
+# Predict output
+predicted = linear.predict(x_test)
+
+
+# Logistic Regression
+
+# Import Library
+from sklearn.linear_model import LogisticRegression
+# Assumes you have X (predictors) and y (target) for the training set
+# and x_test (predictors) for the test set
+# Create logistic regression object
+model = LogisticRegression()
+# Train the model using the training sets and check the score
+model.fit(X, y)
+model.score(X, y)
+# Equation coefficients and intercept
+print('Coefficient: \n', model.coef_)
+print('Intercept: \n', model.intercept_)
+# Predict output
+predicted = model.predict(x_test)
+
+
+# Decision Tree
+
+# Import Library
+from sklearn import tree
+# Assumes you have X (predictors) and y (target) for the training set
+# and x_test (predictors) for the test set
+# Create tree object; the split criterion can be 'gini' (the default) or
+# 'entropy' (information gain)
+model = tree.DecisionTreeClassifier(criterion='gini')
+# model = tree.DecisionTreeRegressor()  # for regression
+# Train the model using the training sets and check the score
+model.fit(X, y)
+model.score(X, y)
+# Predict output
+predicted = model.predict(x_test)
+
+
+# SVM
+
+# Import Library
+from sklearn import svm
+# Assumes you have X (predictors) and y (target) for the training set
+# and x_test (predictors) for the test set
+# Create SVM classification object; SVC takes several options and this is
+# its simplest form for classification (see the scikit-learn docs for detail)
+model = svm.SVC()
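+# A hedged example of common options (the kernel and C values here are
+# illustrative assumptions, not settings from this cheat sheet):
+# model = svm.SVC(kernel='rbf', C=1.0)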
+# Train the model using the training sets and check the score
+model.fit(X, y)
+model.score(X, y)
+# Predict output
+predicted = model.predict(x_test)
+
+
+# Naive Bayes
+
+# Import Library
+from sklearn.naive_bayes import GaussianNB
+# Assumes you have X (predictors) and y (target) for the training set
+# and x_test (predictors) for the test set
+# Create Gaussian Naive Bayes object; there are other variants for other
+# distributions, e.g. MultinomialNB and BernoulliNB
+model = GaussianNB()
+# Train the model using the training sets
+model.fit(X, y)
+# Predict output
+predicted = model.predict(x_test)
+
+
+# kNN (k-Nearest Neighbors)
+
+# Import Library
+from sklearn.neighbors import KNeighborsClassifier
+# Assumes you have X (predictors) and y (target) for the training set
+# and x_test (predictors) for the test set
+# Create KNeighbors classifier object (the default n_neighbors is 5)
+model = KNeighborsClassifier(n_neighbors=6)
+# Train the model using the training sets
+model.fit(X, y)
+# Predict output
+predicted = model.predict(x_test)
+
+
+# K-Means
+
+# Import Library
+from sklearn.cluster import KMeans
+# Assumes you have X (attributes) for the training set and x_test
+# (attributes) for the test set
+# Create KMeans object
+k_means = KMeans(n_clusters=3, random_state=0)
+# Train the model using the training set
+k_means.fit(X)
+# Predict output
+predicted = k_means.predict(x_test)
+
+
+# Random Forest
+
+# Import Library
+from sklearn.ensemble import RandomForestClassifier
+# Assumes you have X (predictors) and y (target) for the training set
+# and x_test (predictors) for the test set
+# Create random forest object
+model = RandomForestClassifier()
+# Train the model using the training sets
+model.fit(X, y)
+# Predict output
+predicted = model.predict(x_test)
+
+
+# Dimensionality Reduction Algorithms
+
+# Import Library
+from sklearn import decomposition
+# Assumes you have training and test data sets as train and test
+# Create PCA object; the default n_components is min(n_samples, n_features)
+pca = decomposition.PCA(n_components=k)
+# For factor analysis:
+# fa = decomposition.FactorAnalysis()
+# Reduce the dimension of the training dataset using PCA
+train_reduced = pca.fit_transform(train)
+# Reduce the dimension of the test dataset
+test_reduced = pca.transform(test)
diff --git a/ml algorithms/AlgorithmsInR.r b/ml algorithms/AlgorithmsInR.r
new file mode 100644
index 0000000..7c39a35
--- /dev/null
+++ b/ml algorithms/AlgorithmsInR.r
@@ -0,0 +1,85 @@
+# Linear Regression
+
+# Load train and test datasets
+# Identify feature and response variable(s); values must be numeric
+x_train <- input_variables_values_training_datasets
+y_train <- target_variables_values_training_datasets
+x_test <- input_variables_values_test_datasets
+x <- cbind(x_train, y_train)
+# Train the model using the training sets and check the score
+linear <- lm(y_train ~ ., data = x)
+summary(linear)
+# Predict output
+predicted <- predict(linear, x_test)
+
+# Logistic Regression
+
+x <- cbind(x_train, y_train)
+# Train the model using the training sets and check the score
+logistic <- glm(y_train ~ ., data = x, family = "binomial")
+summary(logistic)
+# Predict output
+predicted <- predict(logistic, x_test)
+
+# Decision Tree
+
+library(rpart)
+x <- cbind(x_train, y_train)
+# Grow the tree
+fit <- rpart(y_train ~ ., data = x, method = "class")
+summary(fit)
+# Predict output
+predicted <- predict(fit, x_test)
+
+# SVM
+
+library(e1071)
+x <- cbind(x_train, y_train)
+# Fit the model
+fit <- svm(y_train ~ ., data = x)
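+# (Hedged note: e1071::svm defaults to a radial-basis kernel; an assumed,
+# illustrative alternative is svm(y_train ~ ., data = x, kernel = "linear").)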
+summary(fit)
+# Predict output
+predicted <- predict(fit, x_test)
+
+# Naive Bayes
+
+library(e1071)
+x <- cbind(x_train, y_train)
+# Fit the model
+fit <- naiveBayes(y_train ~ ., data = x)
+summary(fit)
+# Predict output
+predicted <- predict(fit, x_test)
+
+# kNN (k-Nearest Neighbors)
+
+library(class)
+# class::knn takes the training attributes, test attributes and training
+# labels directly (no formula interface) and returns the predicted labels
+predicted <- knn(train = x_train, test = x_test, cl = y_train, k = 5)
+summary(predicted)
+
+# K-Means
+
+library(cluster)
+fit <- kmeans(X, 3)  # 3-cluster solution
+
+# Random Forest
+
+library(randomForest)
+x <- cbind(x_train, y_train)
+# Fit the model
+fit <- randomForest(y_train ~ ., data = x, ntree = 500)
+summary(fit)
+# Predict output
+predicted <- predict(fit, x_test)
+
+# Dimensionality Reduction Algorithms
+
+library(stats)
+pca <- princomp(train, cor = TRUE)
+train_reduced <- predict(pca, train)
+test_reduced <- predict(pca, test)
diff --git a/ml algorithms/DataPreprocessing.py b/ml algorithms/DataPreprocessing.py
new file mode 100644
index 0000000..98e927e
--- /dev/null
+++ b/ml algorithms/DataPreprocessing.py
@@ -0,0 +1,108 @@
+'''
+Created on Apr 25, 2016
+test code
+@author: Wenqiang Feng
+'''
+import pandas as pd
+# import numpy as np
+import matplotlib.pyplot as plt
+from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas
+from scipy import stats
+import seaborn as sns  # conventional alias
+
+if __name__ == '__main__':
+    path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
+    rawdata = pd.read_csv(path)
+
+    print("data summary")
+    print(rawdata.describe())
+
+    # summary plot of the data
+    scatter_matrix(rawdata, figsize=[15, 15])
+    plt.show()
+
+    # histograms
+    rawdata.hist()
+    plt.show()
+
+    # boxplot
+    rawdata.boxplot()
+    plt.show()
+
+    print("Raw data size")
+    nrow, ncol = rawdata.shape
+    print(nrow, ncol)
+
+    path = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
+            'energy_efficiency.xlsx')
+    rawdataEnergy = pd.read_excel(path, sheet_name=0)
+
+    nrow = rawdata.shape[0]   # number of rows
+    ncol = rawdata.shape[1]   # number of columns
+    print(nrow, ncol)
+    col_names = rawdata.columns.tolist()
+    print("Column names:")
+    print(col_names)
+    print("Data Format:")
+    print(rawdata.dtypes)
+
+    print("\nSample data:")
+    print(rawdata.head(6))
+
+    print("\ncorrelation Matrix")
+    print(rawdata.corr())
+
+    print("\ncovariance Matrix")
+    print(rawdata.cov())
+
+    print(rawdata[['Age', 'Ca']].corr())
+
+    # define a colors list for plotting a binary target: red (=0), green (=1)
+    colors = ['red', 'green']
+
+    # make a scatter plot
+    # rawdata.info()
+
+    # plot the pair-wise correlations as a heatmap (seaborn's old corrplot
+    # was removed; a heatmap of the correlation matrix replaces it)
+    sns.heatmap(rawdata.corr())
+    # save to file, removing the big white borders
+    # plt.savefig('attribute_correlations.png', tight_layout=True)
+    plt.show()
+
+    attr = rawdata['Age']
+    sns.distplot(attr)
+    plt.show()
+
+    sns.distplot(attr, kde=False, fit=stats.gamma)
+    plt.show()
+
+    # Two subplots on one figure
+    plt.figure(1)
+    plt.title('Histogram of Age')
+    plt.subplot(211)  # 2 rows, 1 col, first panel
+    sns.distplot(attr)
+
+    plt.subplot(212)  # 2 rows, 1 col, second panel
+    sns.distplot(attr, kde=False, fit=stats.gamma)
+
+    plt.show()
diff --git a/ml algorithms/DataPreprocessing.r b/ml algorithms/DataPreprocessing.r
new file mode 100644
index 0000000..44a9a69
--- /dev/null
+++ b/ml algorithms/DataPreprocessing.r
@@ -0,0 +1,83 @@
+rm(list = ls())
+# set the environment
+path <- '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
+rawdata <- read.csv(path)
+
+# summary of the data
+summary(rawdata)
+# plot of the summary
+plot(rawdata)
+
+dim(rawdata)
+head(rawdata)
+tail(rawdata)
+
+colnames(rawdata)
+attach(rawdata)
+
+# get the numerical columns and remove NAs
+numdata <- na.omit(rawdata[, c(1:2, 4:12)])
+
+cor(numdata)
+cov(numdata)
+
+dev.off()
+# load the correlation matrix plot library
+library(corrplot)
+M <- cor(numdata)
+# par(mfrow = c(1, 2))
+# corrplot(M, method = "square")
+corrplot.mixed(M)
+
+nrows <- nrow(rawdata)
+ncols <- ncol(rawdata)
+c(nrows, ncols)
+
+Nvars <- ncol(numdata)
+# check the data format
+typeof(rawdata)
+# install.packages("mlbench")  # uncomment if mlbench is not installed
+library(mlbench)
+sapply(rawdata, class)
+
+dev.off()
+name <- colnames(numdata)
+Nvars <- ncol(numdata)
+# boxplots
+par(mfrow = c(4, 3))
+for (i in 1:Nvars)
+{
+  # boxplot(numdata[, i] ~ numdata[, Nvars], main = name[i])
+  boxplot(numdata[, i], main = name[i])
+}
+
+# histograms with a normal curve overlay
+dev.off()
+Nvars <- ncol(numdata)
+name <- colnames(numdata)
+par(mfrow = c(3, 5))
+for (i in 1:Nvars)
+{
+  x <- numdata[, i]
+  h <- hist(x, breaks = 10, freq = TRUE, col = "blue", xlab = name[i],
+            main = " ", font.lab = 1)
+  axis(1, tck = 1, col.ticks = "light gray")
+  axis(1, tck = -0.015, col.ticks = "black")
+  axis(2, tck = 1, col.ticks = "light gray", lwd.ticks = 1)
+  axis(2, tck = -0.015)
+  xfit <- seq(min(x), max(x), length = 40)
+  yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
+  yfit <- yfit * diff(h$mids[1:2]) * length(x)
+  lines(xfit, yfit, col = "blue", lwd = 2)
+}
+
+# faceted histograms with ggplot2 (the diamonds data set ships with ggplot2)
+library(reshape2)
+library(ggplot2)
+d <- melt(diamonds[, -c(2:4)])
+ggplot(d, aes(x = value)) +
+  facet_wrap(~ variable, scales = "free_x") +
+  geom_histogram()
diff --git a/ml algorithms/LinearRegression.py b/ml algorithms/LinearRegression.py
new file mode 100644
index 0000000..8697190
--- /dev/null
+++ b/ml algorithms/LinearRegression.py
@@ -0,0 +1,20 @@
+# Linear Regression
+
+# Import Library
+# Import other necessary libraries like pandas, numpy...
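+# A minimal hedged sketch of the placeholder datasets used below; these
+# arrays are illustrative assumptions, not data from the original recipe:
+# import numpy as np
+# input_variables_values_training_datasets = np.array([[1], [2], [3]])
+# target_variables_values_training_datasets = np.array([2.0, 4.1, 5.9])
+# input_variables_values_test_datasets = np.array([[4], [5]])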
+from sklearn import linear_model
+# Load train and test datasets
+# Identify feature and response variable(s); values must be numeric numpy arrays
+x_train = input_variables_values_training_datasets
+y_train = target_variables_values_training_datasets
+x_test = input_variables_values_test_datasets
+# Create linear regression object
+linear = linear_model.LinearRegression()
+# Train the model using the training sets and check the score
+linear.fit(x_train, y_train)
+linear.score(x_train, y_train)
+# Equation coefficients and intercept
+print('Coefficient: \n', linear.coef_)
+print('Intercept: \n', linear.intercept_)
+# Predict output
+predicted = linear.predict(x_test)
diff --git a/tex/cites.bib b/tex/cites.bib
index 6297121..63c9ac2 100644
--- a/tex/cites.bib
+++ b/tex/cites.bib
@@ -1,44 +1,42 @@
 @article { robust,
-autor = "Julien Pinquier, Jean-Luc Rouas and Régine André-Obrecht",
 title = "ROBUST SPEECH/MUSIC CLASSIFICATION IN AUDIO DOCUMENTS",
+author = "Julien Pinquier, Jean-Luc Rouas and Régine André-Obrecht",
 journal = "7th International Conference on Spoken Language Processing [ICSLP2002]",
 year = "2002"
 }

 @article { mirex,
-autor = "Nikolaos Tsipas, Lazaros Vrysis, Charalampos Dimoulas and George Papanikolaou",
 title = "MIREX 2015: METHODS FOR SPEECH/MUSIC DETECTION AND CLASSIFICATION",
+author = "Nikolaos Tsipas, Lazaros Vrysis, Charalampos Dimoulas and George Papanikolaou",
 journal = "MIREX 2015 Conference",
 year = "2015"
 }

 @article { speech,
-autor = "Baniriskhem K. Khonglah and S.R. Mahadeva Prasanna",
 title = "Speech / music classification using speech-specific features",
+author = "Baniriskhem K. Khonglah and S.R. Mahadeva Prasanna",
 journal = "Digital Signal Processing 48",
 year = "2016"
 }

 @article{ cuckoo,
+title = "Speech classification based on cuckoo algorithm and support vector machines",
 author = "Wenlei Shi and Xinhai Fan",
-title = "Speech classification based on cuckoo algorithm and support
- vector machines",
-journal = "2nd IEEE International Conference on Computational
- Intelligence and Applications",
+journal = "2nd IEEE International Conference on Computational Intelligence and Applications",
 year = "2017"
 }

 @article{ radio,
-author = "Stanisław Kacprzak, Błażej Chwiećko and Bartosz Ziółko",
 title = "Speech/music discrimination for analysis of radio stations",
+author = "Stanisław Kacprzak, Błażej Chwiećko and Bartosz Ziółko",
 journal = "2017 International Conference on Systems, Signals and Image Processing (IWSSIP)",
 year = "2017"
 }