Christina Theodoridou
6 years ago
6 changed files with 432 additions and 9 deletions
@@ -0,0 +1,129 @@
#Linear Regression

#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import linear_model

#Load Train and Test datasets
#Identify feature and response variable(s); values must be numeric numpy arrays
x_train = input_variables_values_training_datasets
y_train = target_variables_values_training_datasets
x_test = input_variables_values_test_datasets

# Create linear regression object
linear = linear_model.LinearRegression()

# Train the model using the training sets and check score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)

#Equation coefficient and intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)

#Predict Output
predicted = linear.predict(x_test)

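# A minimal runnable sketch of the snippet above. The placeholder names
# (input_variables_values_training_datasets etc.) stand in for your own data;
# the arrays below are made up purely for illustration.
import numpy as np
from sklearn import linear_model

x_train = np.array([[1.0], [2.0], [3.0], [4.0]])   # y is roughly 2x + 1
y_train = np.array([3.1, 4.9, 7.2, 8.8])
x_test = np.array([[5.0], [6.0]])

linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)
print(linear.score(x_train, y_train))    # R^2 on the training data
print(linear.coef_, linear.intercept_)   # fitted slope and intercept
print(linear.predict(x_test))            # predictions near 11 and 13
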
#Logistic Regression

#Import Library
from sklearn.linear_model import LogisticRegression

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

#Equation coefficient and intercept
print('Coefficient: \n', model.coef_)
print('Intercept: \n', model.intercept_)

#Predict Output
predicted = model.predict(x_test)

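# A minimal runnable sketch of the logistic-regression snippet, with made-up
# X, y, and x_test arrays standing in for the assumed training/test data.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.5], [1.0], [1.5], [3.0], [3.5], [4.0]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[1.2], [3.8]])

model = LogisticRegression()
model.fit(X, y)
print(model.score(X, y))             # mean accuracy on the training data
print(model.predict(x_test))         # predicted class labels
print(model.predict_proba(x_test))   # per-class probabilities
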
#Decision Tree

#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import tree

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create tree object (for classification)
# The split criterion can be 'gini' (the default) or 'entropy' (information gain)
model = tree.DecisionTreeClassifier(criterion='gini')

# model = tree.DecisionTreeRegressor() for regression

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

#Predict Output
predicted = model.predict(x_test)

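# A minimal runnable sketch of the decision-tree snippet, here using the
# 'entropy' (information gain) criterion on made-up data.
import numpy as np
from sklearn import tree

X = np.array([[1.0], [1.5], [2.0], [2.5], [3.0], [3.5]])
y = np.array([0, 0, 0, 1, 1, 1])   # class 1 once the feature exceeds ~2
x_test = np.array([[1.8], [3.2]])

model = tree.DecisionTreeClassifier(criterion='entropy')
model.fit(X, y)
print(model.score(X, y))       # 1.0 on this trivially separable data
print(model.predict(x_test))   # [0 1]
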
#SVM

#Import Library
from sklearn import svm

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create SVM classification object
# There are various options associated with it; this is a simple classifier.
# See the scikit-learn documentation for more detail.
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

#Predict Output
predicted = model.predict(x_test)

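# A minimal runnable sketch of the SVM snippet on made-up data; kernel and C
# are written out explicitly here (these are the scikit-learn defaults).
import numpy as np
from sklearn import svm

X = np.array([[0, 0], [1, 1], [1, 0], [4, 4], [5, 5], [4, 5]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[0.5, 0.5], [4.5, 4.5]])

model = svm.SVC(kernel='rbf', C=1.0)   # note the class is SVC, not svc
model.fit(X, y)
print(model.score(X, y))
print(model.predict(x_test))   # [0 1]
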
# Naive Bayes

#Import Library
from sklearn.naive_bayes import GaussianNB

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create Gaussian Naive Bayes object
# There are other variants for other feature distributions, e.g. MultinomialNB and BernoulliNB
model = GaussianNB()

# Train the model using the training sets and check score
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

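# A minimal runnable sketch of the Gaussian Naive Bayes snippet on made-up
# data; swap in MultinomialNB or BernoulliNB for count or binary features.
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[1.0, 2.0], [1.2, 1.8], [3.0, 4.0], [3.2, 4.2]])
y = np.array([0, 0, 1, 1])
x_test = np.array([[1.1, 2.1], [3.1, 4.1]])

model = GaussianNB()
model.fit(X, y)
print(model.predict(x_test))         # predicted class labels
print(model.predict_proba(x_test))   # per-class probabilities
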
#kNN (k-Nearest Neighbors)

#Import Library
from sklearn.neighbors import KNeighborsClassifier

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create KNeighbors classifier object
model = KNeighborsClassifier(n_neighbors=6)  # default value for n_neighbors is 5

# Train the model using the training sets and check score
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

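# A minimal runnable sketch of the kNN snippet; n_neighbors=3 suits this
# tiny made-up dataset better than the default of 5.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0], [1], [2], [8], [9], [10]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[1.5], [8.5]])

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y)
print(model.predict(x_test))   # majority vote of the 3 nearest neighbors
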
#K-Means

#Import Library
from sklearn.cluster import KMeans

#Assumes you have X (attributes) for the training data set and x_test (attributes) for the test dataset

# Create K-Means clustering object
k_means = KMeans(n_clusters=3, random_state=0)

# Train the model using the training set
k_means.fit(X)

#Predict Output
predicted = k_means.predict(x_test)

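# A minimal runnable sketch of the K-Means snippet on made-up data with two
# obvious clusters (n_clusters=2 here instead of the 3 used above).
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 1], [1.5, 2], [1, 2], [8, 8], [8.5, 9], [9, 8]])

k_means = KMeans(n_clusters=2, random_state=0, n_init=10)
k_means.fit(X)
print(k_means.labels_)            # cluster assignment of each point
print(k_means.cluster_centers_)   # the two centroids
print(k_means.predict(np.array([[0.5, 1.5], [9.0, 9.0]])))   # nearest centroid
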
#Random Forest

#Import Library
from sklearn.ensemble import RandomForestClassifier

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create Random Forest object
model = RandomForestClassifier()

# Train the model using the training sets and check score
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

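# A minimal runnable sketch of the random-forest snippet on made-up data;
# n_estimators and random_state are set explicitly for reproducibility.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.array([[0, 0], [1, 1], [0, 1], [5, 5], [6, 6], [5, 6]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[0.5, 0.5], [5.5, 5.5]])

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X, y)
print(model.score(X, y))
print(model.predict(x_test))   # [0 1]
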
#Dimensionality Reduction Algorithms

#Import Library
from sklearn import decomposition

#Assumes you have training and test data sets as train and test

# Create PCA object
pca = decomposition.PCA(n_components=k)  # default value of k is min(n_samples, n_features)

# For factor analysis:
# fa = decomposition.FactorAnalysis()

# Reduce the dimension of the training dataset using PCA
train_reduced = pca.fit_transform(train)

# Reduce the dimension of the test dataset
test_reduced = pca.transform(test)

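# A minimal runnable sketch of the PCA snippet; train and test are made-up
# random matrices, and k is fixed at 2 components for illustration.
import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
train = rng.rand(20, 3)
test = rng.rand(5, 3)

pca = decomposition.PCA(n_components=2)
train_reduced = pca.fit_transform(train)
test_reduced = pca.transform(test)
print(pca.explained_variance_ratio_)             # variance per component
print(train_reduced.shape, test_reduced.shape)   # (20, 2) (5, 2)
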
@@ -0,0 +1,85 @@
#Linear Regression

#Load Train and Test datasets
#Identify feature and response variable(s); values must be numeric
x_train <- input_variables_values_training_datasets
y_train <- target_variables_values_training_datasets
x_test <- input_variables_values_test_datasets
x <- cbind(x_train, y_train)

# Train the model using the training sets and check score
linear <- lm(y_train ~ ., data = x)
summary(linear)

#Predict Output
predicted <- predict(linear, x_test)

#Logistic Regression

x <- cbind(x_train, y_train)

# Train the model using the training sets and check score
logistic <- glm(y_train ~ ., data = x, family = 'binomial')
summary(logistic)

#Predict Output
predicted <- predict(logistic, x_test, type = 'response')  # 'response' returns probabilities rather than log-odds

#Decision Tree

library(rpart)
x <- cbind(x_train, y_train)

# grow tree
fit <- rpart(y_train ~ ., data = x, method = "class")
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

#SVM

library(e1071)
x <- cbind(x_train, y_train)

# Fitting model
fit <- svm(y_train ~ ., data = x)
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

# Naive Bayes

library(e1071)
x <- cbind(x_train, y_train)

# Fitting model
fit <- naiveBayes(y_train ~ ., data = x)
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

#kNN (k-Nearest Neighbors)

library(class)  # knn() is provided by the class package; there is no package named 'knn'

# Fitting model and predicting in one step:
# class::knn takes the training features, test features, and training labels directly
predicted <- knn(train = x_train, test = x_test, cl = y_train, k = 5)
summary(predicted)

#K-Means

library(cluster)
fit <- kmeans(X, 3)  # 3 cluster solution

#Random Forest

library(randomForest)
x <- cbind(x_train, y_train)

# Fitting model
fit <- randomForest(y_train ~ ., data = x, ntree = 500)
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

#Dimensionality Reduction Algorithms

library(stats)
pca <- princomp(train, cor = TRUE)
train_reduced <- predict(pca, train)
test_reduced <- predict(pca, test)

@@ -0,0 +1,108 @@
'''
Created on Apr 25, 2016

test code

@author: Wenqiang Feng
'''
import pandas as pd
#import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas releases

if __name__ == '__main__':
    path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
    rawdata = pd.read_csv(path)

    print("data summary")
    print(rawdata.describe())

    # summary plot of the data
    scatter_matrix(rawdata, figsize=[15, 15])
    plt.show()

    # Histogram
    rawdata.hist()
    plt.show()

    # boxplot
    rawdata.boxplot()
    plt.show()

    print("Raw data size")
    nrow, ncol = rawdata.shape
    print(nrow, ncol)

    path = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
            'energy_efficiency.xlsx')
    print(path)

    rawdataEnergy = pd.read_excel(path, sheet_name=0)

    nrow = rawdata.shape[0]   # number of rows
    ncol = rawdata.shape[1]   # number of columns
    print(nrow, ncol)
    col_names = rawdata.columns.tolist()
    print("Column names:")
    print(col_names)
    print("Data Format:")
    print(rawdata.dtypes)

    print("\nSample data:")
    print(rawdata.head(6))

    print("\ncorrelation Matrix")
    print(rawdata.corr())

    # correlation Matrix plot
    plt.matshow(rawdata.corr())  # display the correlation matrix as an image
    plt.show()

    print("\ncovariance Matrix")
    print(rawdata.cov())

    print(rawdata[['Age', 'Ca']].corr())

    # define colors list, to be used to plot survived either red (=0) or green (=1)
    colors = ['red', 'green']

    # make a scatter plot

    # rawdata.info()

    from scipy import stats
    import seaborn as sns  # sns is the conventional alias
    # compute and plot the pair-wise correlations; sns.corrplot existed in old
    # seaborn releases, and a heatmap of corr() is the modern equivalent
    sns.heatmap(rawdata.corr())
    # save to file, remove the big white borders
    #plt.savefig('attribute_correlations.png', tight_layout=True)
    plt.show()

    attr = rawdata['Age']
    sns.distplot(attr)
    plt.show()

    sns.distplot(attr, kde=False, fit=stats.gamma)
    plt.show()

    # Two subplots on one figure
    plt.figure(1)
    plt.subplot(211)  # first panel of 2 rows, 1 col
    plt.title('Histogram of Age')
    sns.distplot(attr)

    plt.subplot(212)  # second panel of 2 rows, 1 col
    sns.distplot(attr, kde=False, fit=stats.gamma)

    plt.show()

@@ -0,0 +1,83 @@
rm(list = ls())
# set the environment
path <- '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
rawdata <- read.csv(path)

# summary of the data
summary(rawdata)
# plot of the summary
plot(rawdata)

dim(rawdata)
head(rawdata)
tail(rawdata)

colnames(rawdata)
attach(rawdata)

# get the numeric columns and remove NAs
numdata <- na.omit(rawdata[, c(1:2, 4:12)])

cor(numdata)
cov(numdata)

dev.off()
# load the correlation matrix plot library
library(corrplot)
M <- cor(numdata)
#par(mfrow = c(1, 2))
#corrplot(M, method = "square")
corrplot.mixed(M)

nrow <- nrow(rawdata)
ncol <- ncol(rawdata)
c(nrow, ncol)

Nvars <- ncol(numdata)
# checking data format
typeof(rawdata)
install.packages("mlbench")
library(mlbench)
sapply(rawdata, class)

dev.off()
name <- colnames(numdata)
Nvars <- ncol(numdata)
# boxplot
par(mfrow = c(4, 3))
for (i in 1:Nvars)
{
  #boxplot(numdata[, i] ~ numdata[, Nvars], data = numdata, main = name[i])
  boxplot(numdata[, i], main = name[i])
}

# Histogram with normal curve plot
dev.off()
Nvars <- ncol(numdata)
name <- colnames(numdata)
par(mfrow = c(3, 5))
for (i in 1:Nvars)
{
  x <- numdata[, i]
  h <- hist(x, breaks = 10, freq = TRUE, col = "blue", xlab = name[i], main = " ",
            font.lab = 1)
  axis(1, tck = 1, col.ticks = "light gray")
  axis(1, tck = -0.015, col.ticks = "black")
  axis(2, tck = 1, col.ticks = "light gray", lwd.ticks = "1")
  axis(2, tck = -0.015)
  xfit <- seq(min(x), max(x), length = 40)
  yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
  yfit <- yfit * diff(h$mids[1:2]) * length(x)
  lines(xfit, yfit, col = "blue", lwd = 2)
}


library(reshape2)
library(ggplot2)
# histograms of the numeric diamonds columns, one facet per variable
d <- melt(diamonds[, -c(2:4)])
ggplot(d, aes(x = value)) +
  facet_wrap(~ variable, scales = "free_x") +
  geom_histogram()

@@ -0,0 +1,20 @@
#Linear Regression

#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import linear_model

#Load Train and Test datasets
#Identify feature and response variable(s); values must be numeric numpy arrays
x_train = input_variables_values_training_datasets
y_train = target_variables_values_training_datasets
x_test = input_variables_values_test_datasets

# Create linear regression object
linear = linear_model.LinearRegression()

# Train the model using the training sets and check score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)

#Equation coefficient and intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)

#Predict Output
predicted = linear.predict(x_test)