
Some ML algorithms in Python and R

master
Christina Theodoridou 6 years ago
parent
commit
5a1aadb410
  1. ml algorithms/AlgorithmsInPython.py (129)
  2. ml algorithms/AlgorithmsInR.r (85)
  3. ml algorithms/DataPreprocessing.py (108)
  4. ml algorithms/DataPreprocessing.r (83)
  5. ml algorithms/LinearRegression.py (20)
  6. tex/cites.bib (16)

ml algorithms/AlgorithmsInPython.py (129)

@@ -0,0 +1,129 @@
#Linear Regression
#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import linear_model
#Load Train and Test datasets
#Identify the feature and response variable(s); values must be numeric NumPy arrays
x_train=input_variables_values_training_datasets
y_train=target_variables_values_training_datasets
x_test=input_variables_values_test_datasets
# Create linear regression object
linear = linear_model.LinearRegression()
# Train the model using the training sets and check score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)
#Equation coefficient and Intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
#Predict Output
predicted = linear.predict(x_test)
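# A minimal runnable sketch of the recipe above, using synthetic data from
# sklearn.datasets.make_regression (the sizes and noise level are illustrative
# assumptions, not part of the original snippet):
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=100, n_features=3, noise=10.0, random_state=0)
x_train, y_train = X[:80], y[:80]  # first 80 rows as training data
x_test = X[80:]                    # remaining rows as test data
linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)
print('R^2 on training data:', linear.score(x_train, y_train))
predicted = linear.predict(x_test)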
#Logistic Regression
#Import Library
from sklearn.linear_model import LogisticRegression
#Assume you have X (predictors) and y (target) for the training dataset and x_test (predictors) for the test dataset
# Create logistic regression object
model = LogisticRegression()
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
#Equation coefficient and Intercept
print('Coefficient: \n', model.coef_)
print('Intercept: \n', model.intercept_)
#Predict Output
predicted = model.predict(x_test)
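# A runnable variant, assuming the breast cancer dataset bundled with
# scikit-learn purely for illustration; scoring on a held-out split gives a
# more honest accuracy estimate than scoring on the training data:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression(max_iter=5000)  # raise the iteration cap so the solver converges
model.fit(X_train, y_train)
print('Held-out accuracy:', model.score(X_test, y_test))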
#Decision Tree
#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import tree
#Assume you have X (predictors) and y (target) for the training dataset and x_test (predictors) for the test dataset
# Create tree object
model = tree.DecisionTreeClassifier(criterion='gini') # for classification; criterion can be 'gini' (the default) or 'entropy' (information gain)
# model = tree.DecisionTreeRegressor() # for regression
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
#Predict Output
predicted = model.predict(x_test)
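# To see the effect of the criterion choice mentioned above, one illustrative
# comparison (iris dataset assumed purely as an example):
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
X, y = load_iris(return_X_y=True)
for criterion in ('gini', 'entropy'):
    clf = tree.DecisionTreeClassifier(criterion=criterion, random_state=0)
    print(criterion, cross_val_score(clf, X, y, cv=5).mean())  # mean 5-fold CV accuracy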
#SVM
#Import Library
from sklearn import svm
#Assume you have X (predictors) and y (target) for the training dataset and x_test (predictors) for the test dataset
# Create SVM classification object
model = svm.SVC() # note the capitalized class name SVC; there are various options (kernel, C, gamma); see the scikit-learn documentation for more detail
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
#Predict Output
predicted = model.predict(x_test)
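# The main knobs are the kernel and the regularization strength C; a small
# hedged example (iris dataset assumed for illustration only):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = svm.SVC(kernel='rbf', C=1.0)  # RBF kernel is the default; C trades margin width against errors
model.fit(X_train, y_train)
print('Held-out accuracy:', model.score(X_test, y_test))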
# Naive Bayes
#Import Library
from sklearn.naive_bayes import GaussianNB
#Assume you have X (predictors) and y (target) for the training dataset and x_test (predictors) for the test dataset
# Create Gaussian Naive Bayes object; other variants exist for other
# distributions, e.g. MultinomialNB and BernoulliNB
model = GaussianNB()
# Train the model using the training sets and check score
model.fit(X, y)
#Predict Output
predicted = model.predict(x_test)
#kNN (k- Nearest Neighbors)
#Import Library
from sklearn.neighbors import KNeighborsClassifier
#Assume you have X (predictors) and y (target) for the training dataset and x_test (predictors) for the test dataset
# Create KNeighbors classifier object
model = KNeighborsClassifier(n_neighbors=6) # default value for n_neighbors is 5
# Train the model using the training sets and check score
model.fit(X, y)
#Predict Output
predicted = model.predict(x_test)
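# n_neighbors is the key hyperparameter; a common way to pick it is
# cross-validation (iris dataset assumed purely for illustration):
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
X, y = load_iris(return_X_y=True)
for k in (1, 3, 5, 7, 9):
    knn = KNeighborsClassifier(n_neighbors=k)
    print(k, cross_val_score(knn, X, y, cv=5).mean())  # mean 5-fold CV accuracy per k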
#K-Means
#Import Library
from sklearn.cluster import KMeans
#Assume you have X (attributes) for the training dataset and x_test (attributes) for the test dataset
# Create KMeans clustering object
k_means = KMeans(n_clusters=3, random_state=0)
# Train the model using the training set
k_means.fit(X)
#Predict Output
predicted = k_means.predict(x_test)
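# A self-contained sketch on synthetic clusters (the make_blobs parameters are
# illustrative assumptions):
from sklearn.datasets import make_blobs
X_demo, _ = make_blobs(n_samples=150, centers=3, random_state=0)
k_means = KMeans(n_clusters=3, random_state=0, n_init=10)
k_means.fit(X_demo)
print('Cluster centers:\n', k_means.cluster_centers_)
print('First ten labels:', k_means.labels_[:10])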
#Random Forest
#Import Library
from sklearn.ensemble import RandomForestClassifier
#Assume you have X (predictors) and y (target) for the training dataset and x_test (predictors) for the test dataset
# Create Random Forest object
model= RandomForestClassifier()
# Train the model using the training sets and check score
model.fit(X, y)
#Predict Output
predicted = model.predict(x_test)
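# Random forests also expose per-feature importances, often the first thing
# inspected after fitting; a sketch on the iris data (assumed for illustration
# only):
from sklearn.datasets import load_iris
data = load_iris()
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(data.data, data.target)
for fname, imp in zip(data.feature_names, model.feature_importances_):
    print(fname, round(imp, 3))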
#Dimensionality Reduction Algorithms
#Import Library
from sklearn import decomposition
#Assume you have training and test datasets named train and test
# Create PCA object; by default n_components = min(n_samples, n_features)
pca = decomposition.PCA(n_components=k)
# For Factor analysis
#fa= decomposition.FactorAnalysis()
# Reduce the dimensionality of the training dataset using PCA
train_reduced = pca.fit_transform(train)
# Reduce the dimensionality of the test dataset
test_reduced = pca.transform(test)
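# explained_variance_ratio_ shows how much variance each retained component
# captures, which helps choose n_components; iris data assumed for
# illustration only:
from sklearn.datasets import load_iris
train_demo = load_iris().data
pca = decomposition.PCA(n_components=2)
train_reduced_demo = pca.fit_transform(train_demo)
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Reduced shape:', train_reduced_demo.shape)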

ml algorithms/AlgorithmsInR.r (85)

@@ -0,0 +1,85 @@
#Linear Regression
#Load Train and Test datasets
#Identify the feature and response variable(s); values must be numeric vectors or matrices
x_train <- input_variables_values_training_datasets
y_train <- target_variables_values_training_datasets
x_test <- input_variables_values_test_datasets
x <- cbind(x_train,y_train)
# Train the model using the training sets and check score
linear <- lm(y_train ~ ., data = x)
summary(linear)
#Predict Output
predicted <- predict(linear, x_test)
#Logistic Regression
x <- cbind(x_train,y_train)
# Train the model using the training sets and check score
logistic <- glm(y_train ~ ., data = x,family='binomial')
summary(logistic)
#Predict Output
predicted <- predict(logistic, x_test)
#Decision tree
library(rpart)
x <- cbind(x_train,y_train)
# grow tree
fit <- rpart(y_train ~ ., data = x,method="class")
summary(fit)
#Predict Output
predicted <- predict(fit, x_test)
#SVM
library(e1071)
x <- cbind(x_train,y_train)
# Fitting model
fit <-svm(y_train ~ ., data = x)
summary(fit)
#Predict Output
predicted <- predict(fit, x_test)
# Naive Bayes
library(e1071)
x <- cbind(x_train,y_train)
# Fitting model
fit <-naiveBayes(y_train ~ ., data = x)
summary(fit)
#Predict Output
predicted <- predict(fit, x_test)
#kNN (k- Nearest Neighbors)
library(class) # kNN lives in the 'class' package, not a 'knn' package
# class::knn() fits and predicts in one step: it takes the training
# attributes, the test attributes and the training labels
#Predict Output
predicted <- knn(train = x_train, test = x_test, cl = y_train, k = 5)
summary(predicted)
#K-Means
library(cluster)
fit <- kmeans(X, 3) # 3 cluster solution
#Random Forest
library(randomForest)
x <- cbind(x_train,y_train)
# Fitting model
fit <- randomForest(y_train ~ ., data = x, ntree = 500)
summary(fit)
#Predict Output
predicted <- predict(fit, x_test)
#Dimensionality Reduction Algorithms
library(stats)
pca <- princomp(train, cor = TRUE)
train_reduced <- predict(pca,train)
test_reduced <- predict(pca,test)

ml algorithms/DataPreprocessing.py (108)

@@ -0,0 +1,108 @@
'''
Created on Apr 25, 2016
test code
@author: Wenqiang Feng
'''
import pandas as pd
#import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix # moved out of pandas.tools.plotting in newer pandas

if __name__ == '__main__':
    path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
    rawdata = pd.read_csv(path)
    print("data summary")
    print(rawdata.describe())
    # summary plot of the data
    scatter_matrix(rawdata, figsize=[15, 15])
    plt.show()
    # Histogram
    rawdata.hist()
    plt.show()
    # boxplot
    rawdata.boxplot()
    plt.show()
    print("Raw data size")
    nrow, ncol = rawdata.shape
    print(nrow, ncol)
    path = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
            'energy_efficiency.xlsx')
    rawdataEnergy = pd.read_excel(path, sheet_name=0) # 'sheetname' was renamed to 'sheet_name'
    nrow = rawdata.shape[0] # number of rows
    ncol = rawdata.shape[1] # number of columns
    print(nrow, ncol)
    col_names = rawdata.columns.tolist()
    print("Column names:")
    print(col_names)
    print("Data Format:")
    print(rawdata.dtypes)
    print("\nSample data:")
    print(rawdata.head(6))
    print("\ncorrelation matrix")
    print(rawdata.corr())
    print("\ncovariance matrix")
    print(rawdata.cov())
    print(rawdata[['Age', 'Ca']].corr())
    # define colors list, to be used to plot survived either red (=0) or green (=1)
    colors = ['red', 'green']
    # make a scatter plot
    # rawdata.info()
    from scipy import stats
    import seaborn as sns # conventional alias
    # correlation matrix plot; seaborn's old corrplot was removed, so a heatmap
    # of DataFrame.corr() is the usual replacement
    sns.heatmap(rawdata.corr())
    # save to file, remove the big white borders
    #plt.savefig('attribute_correlations.png', tight_layout=True)
    plt.show()
    attr = rawdata['Age']
    sns.distplot(attr)
    plt.show()
    sns.distplot(attr, kde=False, fit=stats.gamma)
    plt.show()
    # Two subplots, the axes array is 1-d
    plt.figure(1)
    plt.title('Histogram of Age')
    plt.subplot(211) # first plot in a 2-row, 1-column grid
    sns.distplot(attr)
    plt.subplot(212) # second plot in a 2-row, 1-column grid
    sns.distplot(attr, kde=False, fit=stats.gamma)
    plt.show()
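    # The R version of this script drops incomplete rows with na.omit(); a
    # rough pandas equivalent is sketched below (purely illustrative, not part
    # of the original script)
    print(rawdata.isnull().sum()) # count missing values per column
    numdata = rawdata.dropna() # drop rows containing any missing value
    print("rows after dropping NAs:", numdata.shape[0])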

ml algorithms/DataPreprocessing.r (83)

@@ -0,0 +1,83 @@
rm(list = ls())
# set the environment
path ='~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
rawdata = read.csv(path)
# summary of the data
summary(rawdata)
# plot of the summary
plot(rawdata)
dim(rawdata)
head(rawdata)
tail(rawdata)
colnames(rawdata)
attach(rawdata)
# get numerical data and remove NAs
numdata=na.omit(rawdata[,c(1:2,4:12)])
cor(numdata)
cov(numdata)
dev.off()
# load correlation matrix plot library
library(corrplot)
M <- cor(numdata)
#par(mfrow =c (1,2))
#corrplot(M, method = "square")
corrplot.mixed(M)
nrow=nrow(rawdata)
ncol=ncol(rawdata)
c(nrow, ncol)
Nvars=ncol(numdata)
# checking data format
typeof(rawdata)
install.packages("mlbench")
library(mlbench)
sapply(rawdata, class)
dev.off()
name=colnames(numdata)
Nvars=ncol(numdata)
# boxplot
par(mfrow = c(4, 3))
for (i in 1:Nvars)
{
#boxplot(numdata[,i]~numdata[,Nvars],data=data,main=name[i])
boxplot(numdata[, i], main = name[i])
}
# Histogram with normal curve plot
dev.off()
Nvars=ncol(numdata)
name=colnames(numdata)
par(mfrow = c(3, 5))
for (i in 1:Nvars)
{
x<- numdata[,i]
h<-hist(x, breaks=10, freq=TRUE, col="blue", xlab=name[i],main=" ",
font.lab=1)
axis(1, tck=1, col.ticks="light gray")
axis(1, tck=-0.015, col.ticks="black")
axis(2, tck=1, col.ticks="light gray", lwd.ticks="1")
axis(2, tck=-0.015)
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
}
library(reshape2)
library(ggplot2)
d <- melt(diamonds[, -c(2:4)]) # numeric columns of ggplot2's diamonds dataset
ggplot(d,aes(x = value)) +
facet_wrap(~variable,scales = "free_x") +
geom_histogram()

ml algorithms/LinearRegression.py (20)

@@ -0,0 +1,20 @@
#Linear Regression
#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import linear_model
#Load Train and Test datasets
#Identify the feature and response variable(s); values must be numeric NumPy arrays
x_train=input_variables_values_training_datasets
y_train=target_variables_values_training_datasets
x_test=input_variables_values_test_datasets
# Create linear regression object
linear = linear_model.LinearRegression()
# Train the model using the training sets and check score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)
#Equation coefficient and Intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
#Predict Output
predicted = linear.predict(x_test)

tex/cites.bib (16)

@@ -1,44 +1,42 @@
@article{robust,
  title = "ROBUST SPEECH/MUSIC CLASSIFICATION IN AUDIO DOCUMENTS",
  author = "Julien Pinquier, Jean-Luc Rouas and Régine André-Obrecht",
  journal = "7th International Conference on Spoken Language Processing [ICSLP2002]",
  year = "2002"
}
@article{mirex,
  title = "MIREX 2015: METHODS FOR SPEECH/MUSIC DETECTION AND CLASSIFICATION",
  author = "Nikolaos Tsipas, Lazaros Vrysis, Charalampos Dimoulas and George Papanikolaou",
  journal = "MIREX 2015 Conference",
  year = "2015"
}
@article{speech,
  title = "Speech / music classification using speech-specific features",
  author = "Baniriskhem K. Khonglah and S.R. Mahadeva Prasanna",
  journal = "Digital Signal Processing 48",
  year = "2016"
}
@article{cuckoo,
  title = "Speech classification based on cuckoo algorithm and support vector machines",
  author = "Wenlei Shi and Xinhai Fan",
  journal = "2nd IEEE International Conference on Computational Intelligence and Applications",
  year = "2017"
}
@article{radio,
  title = "Speech/music discrimination for analysis of radio stations",
  author = "Stanisław Kacprzak, Błażej Chwiećko and Bartosz Ziółko",
  journal = "2017 International Conference on Systems, Signals and Image Processing (IWSSIP)",
  year = "2017"
}
