Christina Theodoridou
6 years ago
6 changed files with 432 additions and 9 deletions
@@ -0,0 +1,129 @@
# Linear Regression

# Import Library
# Import other necessary libraries like pandas and numpy
from sklearn import linear_model
# Load Train and Test datasets
# Identify feature and response variable(s); values must be numeric numpy arrays
x_train = input_variables_values_training_datasets
y_train = target_variables_values_training_datasets
x_test = input_variables_values_test_datasets
# Create linear regression object
linear = linear_model.LinearRegression()
# Train the model using the training sets and check the score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)
# Equation coefficients and intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
# Predict output
predicted = linear.predict(x_test)

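# Usage sketch: a standalone, minimal run of the snippet above. The tiny
# synthetic arrays below are illustrative assumptions only, not real data.
import numpy as np
from sklearn import linear_model

x_train = np.array([[1.0], [2.0], [3.0], [4.0]])  # one feature, four samples
y_train = np.array([2.0, 4.1, 5.9, 8.2])          # roughly y = 2 * x
x_test = np.array([[5.0], [6.0]])

linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)
print('R^2 on training data:', linear.score(x_train, y_train))
print('Predictions:', linear.predict(x_test))
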
# Logistic Regression

# Import Library
from sklearn.linear_model import LogisticRegression
# Assumes you have X (predictors) and y (target) for the training dataset
# and x_test (predictors) for the test dataset
# Create logistic regression object
model = LogisticRegression()
# Train the model using the training sets and check the score
model.fit(X, y)
model.score(X, y)
# Equation coefficients and intercept
print('Coefficient: \n', model.coef_)
print('Intercept: \n', model.intercept_)
# Predict output
predicted = model.predict(x_test)

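# Usage sketch: a standalone logistic-regression example on a synthetic
# binary-classification problem; the generated data and chosen parameters
# are illustrative assumptions only.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print('Train accuracy:', model.score(X_train, y_train))
print('Test accuracy:', model.score(X_test, y_test))
print('Predicted classes for first five rows:', model.predict(X_test[:5]))
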
# Decision Tree

# Import Library
# Import other necessary libraries like pandas and numpy
from sklearn import tree
# Assumes you have X (predictors) and y (target) for the training dataset
# and x_test (predictors) for the test dataset
# Create tree object for classification; the splitting criterion can be
# 'gini' (the default) or 'entropy' (information gain)
model = tree.DecisionTreeClassifier(criterion='gini')
# model = tree.DecisionTreeRegressor()  # for regression
# Train the model using the training sets and check the score
model.fit(X, y)
model.score(X, y)
# Predict output
predicted = model.predict(x_test)

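# Usage sketch: a standalone comparison of the 'gini' and 'entropy' criteria
# on the built-in iris data; the dataset choice and tree depth are
# illustrative assumptions only.
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
for criterion in ('gini', 'entropy'):
    model = tree.DecisionTreeClassifier(criterion=criterion, max_depth=3, random_state=0)
    scores = cross_val_score(model, X, y, cv=5)
    print(criterion, 'mean CV accuracy:', scores.mean())
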
# SVM

# Import Library
from sklearn import svm
# Assumes you have X (predictors) and y (target) for the training dataset
# and x_test (predictors) for the test dataset
# Create SVM classification object; SVC takes many options (kernel, C, gamma, ...)
# but the defaults are fine for a simple classification example
model = svm.SVC()
# Train the model using the training sets and check the score
model.fit(X, y)
model.score(X, y)
# Predict output
predicted = model.predict(x_test)

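# Usage sketch: a standalone SVC example with feature scaling, which SVMs
# usually need; the synthetic data and pipeline parameters are illustrative
# assumptions only.
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=6, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf', C=1.0))
model.fit(X_train, y_train)
print('Test accuracy:', model.score(X_test, y_test))
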
# Naive Bayes

# Import Library
from sklearn.naive_bayes import GaussianNB
# Assumes you have X (predictors) and y (target) for the training dataset
# and x_test (predictors) for the test dataset
# Create Gaussian Naive Bayes object; other variants exist for other
# distributions, e.g. MultinomialNB and BernoulliNB
model = GaussianNB()
# Train the model using the training sets and check the score
model.fit(X, y)
# Predict output
predicted = model.predict(x_test)

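# Usage sketch: a standalone GaussianNB example that also reports class
# probabilities; the synthetic data is an illustrative assumption only.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=150, n_features=5, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

model = GaussianNB()
model.fit(X_train, y_train)
print('Test accuracy:', model.score(X_test, y_test))
print('Class probabilities for first test row:', model.predict_proba(X_test[:1]))
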
# kNN (k-Nearest Neighbors)

# Import Library
from sklearn.neighbors import KNeighborsClassifier
# Assumes you have X (predictors) and y (target) for the training dataset
# and x_test (predictors) for the test dataset
# Create KNeighbors classifier object (the default for n_neighbors is 5)
model = KNeighborsClassifier(n_neighbors=6)
# Train the model using the training sets and check the score
model.fit(X, y)
# Predict output
predicted = model.predict(x_test)

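# Usage sketch: a standalone example that picks n_neighbors by cross-validation;
# kNN is distance-based, so features are scaled first. The data and the
# candidate k values are illustrative assumptions only.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=4, random_state=3)
for k in (3, 5, 7):
    model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    print('k =', k, 'mean CV accuracy:', cross_val_score(model, X, y, cv=5).mean())
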
# K-Means

# Import Library
from sklearn.cluster import KMeans
# Assumes you have X (attributes) for the training dataset
# and x_test (attributes) for the test dataset
# Create KMeans object
k_means = KMeans(n_clusters=3, random_state=0)
# Train the model using the training set
k_means.fit(X)
# Predict output (cluster assignments)
predicted = k_means.predict(x_test)

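# Usage sketch: a standalone KMeans example on synthetic blobs; the generated
# data and the cluster count are illustrative assumptions only.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
k_means = KMeans(n_clusters=3, random_state=0, n_init=10)
k_means.fit(X)
print('Cluster centres:\n', k_means.cluster_centers_)
print('Labels of first ten points:', k_means.labels_[:10])
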
# Random Forest

# Import Library
from sklearn.ensemble import RandomForestClassifier
# Assumes you have X (predictors) and y (target) for the training dataset
# and x_test (predictors) for the test dataset
# Create Random Forest object
model = RandomForestClassifier()
# Train the model using the training sets and check the score
model.fit(X, y)
# Predict output
predicted = model.predict(x_test)

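# Usage sketch: a standalone random-forest example on the built-in iris data,
# reporting per-feature importances; the dataset choice and parameters are
# illustrative assumptions only.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=4)

model = RandomForestClassifier(n_estimators=100, random_state=4)
model.fit(X_train, y_train)
print('Test accuracy:', model.score(X_test, y_test))
for name, importance in zip(data.feature_names, model.feature_importances_):
    print(name, round(importance, 3))
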
# Dimensionality Reduction Algorithms

# Import Library
from sklearn import decomposition
# Assumes you have training and test datasets as train and test
# Create PCA object; the default for n_components is min(n_samples, n_features)
pca = decomposition.PCA(n_components=k)
# For factor analysis:
# fa = decomposition.FactorAnalysis()
# Reduce the dimension of the training dataset using PCA
train_reduced = pca.fit_transform(train)
# Reduce the dimension of the test dataset
test_reduced = pca.transform(test)

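# Usage sketch: a standalone PCA example keeping two components and reporting
# the variance each explains; the dataset choice and n_components are
# illustrative assumptions only.
from sklearn import decomposition
from sklearn.datasets import load_iris

train = load_iris().data
pca = decomposition.PCA(n_components=2)
train_reduced = pca.fit_transform(train)
print('Reduced shape:', train_reduced.shape)
print('Explained variance ratio:', pca.explained_variance_ratio_)
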
@@ -0,0 +1,85 @@
# Linear Regression

# Load Train and Test datasets
# Identify feature and response variable(s); values must be numeric
x_train <- input_variables_values_training_datasets
y_train <- target_variables_values_training_datasets
x_test <- input_variables_values_test_datasets
x <- cbind(x_train, y_train)
# Train the model using the training sets and check score
linear <- lm(y_train ~ ., data = x)
summary(linear)
# Predict Output
predicted <- predict(linear, x_test)

# Logistic Regression

x <- cbind(x_train, y_train)
# Train the model using the training sets and check score
logistic <- glm(y_train ~ ., data = x, family = 'binomial')
summary(logistic)
# Predict Output (type = 'response' returns probabilities)
predicted <- predict(logistic, x_test, type = 'response')

# Decision Tree

library(rpart)
x <- cbind(x_train, y_train)
# grow tree
fit <- rpart(y_train ~ ., data = x, method = "class")
summary(fit)
# Predict Output
predicted <- predict(fit, x_test)

# SVM

library(e1071)
x <- cbind(x_train, y_train)
# Fitting model
fit <- svm(y_train ~ ., data = x)
summary(fit)
# Predict Output
predicted <- predict(fit, x_test)

# Naive Bayes

library(e1071)
x <- cbind(x_train, y_train)
# Fitting model
fit <- naiveBayes(y_train ~ ., data = x)
summary(fit)
# Predict Output
predicted <- predict(fit, x_test)

# kNN (k-Nearest Neighbors)

# knn() lives in the 'class' package and takes the training and test matrices
# plus the class labels directly, rather than a formula
library(class)
# Fitting model and predicting in one step
predicted <- knn(train = x_train, test = x_test, cl = y_train, k = 5)
summary(predicted)

# K-Means

library(cluster)
fit <- kmeans(X, 3)  # 3-cluster solution

# Random Forest

library(randomForest)
x <- cbind(x_train, y_train)
# Fitting model
fit <- randomForest(y_train ~ ., data = x, ntree = 500)
summary(fit)
# Predict Output
predicted <- predict(fit, x_test)

# Dimensionality Reduction Algorithms

library(stats)
pca <- princomp(train, cor = TRUE)
train_reduced <- predict(pca, train)
test_reduced <- predict(pca, test)
@@ -0,0 +1,108 @@
'''
Created on Apr 25, 2016
test code
@author: Wenqiang Feng
'''
import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

if __name__ == '__main__':
    path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
    rawdata = pd.read_csv(path)

    print("data summary")
    print(rawdata.describe())

    # summary plot of the data
    scatter_matrix(rawdata, figsize=[15, 15])
    plt.show()

    # Histogram
    rawdata.hist()
    plt.show()

    # boxplot
    rawdata.boxplot()
    plt.show()

    print("Raw data size")
    nrow, ncol = rawdata.shape
    print(nrow, ncol)

    path = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
            'energy_efficiency.xlsx')

    rawdataEnergy = pd.read_excel(path, sheet_name=0)

    nrow = rawdata.shape[0]  # number of rows
    ncol = rawdata.shape[1]  # number of columns
    print(nrow, ncol)
    col_names = rawdata.columns.tolist()
    print("Column names:")
    print(col_names)
    print("Data Format:")
    print(rawdata.dtypes)

    print("\nSample data:")
    print(rawdata.head(6))

    print("\n correlation Matrix")
    print(rawdata.corr())

    # correlation matrix plot: drawn with the seaborn heatmap further below
    # (pd.DataFrame.corr only computes the matrix; it does not plot anything)

    print("\n covariance Matrix")
    print(rawdata.cov())

    print(rawdata[['Age', 'Ca']].corr())

    # define colors list, used to plot the binary target either red (=0) or green (=1)
    colors = ['red', 'green']

    # make a scatter plot (a hedged sketch follows below)

    # rawdata.info()

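    # Hedged sketch: one way the colors list above could drive the scatter plot.
    # The binary target column name 'AHD' and its 'Yes' coding are assumptions
    # about Heart.csv; the check below skips the plot if the column is absent.
    if 'AHD' in rawdata.columns:
        target = (rawdata['AHD'] == 'Yes').astype(int)
        plt.scatter(rawdata['Age'], rawdata['Ca'],
                    c=[colors[t] for t in target], alpha=0.6)
        plt.xlabel('Age')
        plt.ylabel('Ca')
        plt.title('Age vs Ca coloured by AHD')
        plt.show()
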
    from scipy import stats
    import seaborn as sns  # conventional alias for seaborn

    # compute and plot the pair-wise correlations as a heatmap
    # (sns.corrplot was removed from seaborn; sns.heatmap is the usual replacement)
    sns.heatmap(rawdata.corr())
    # save to file, remove the big white borders
    # plt.savefig('attribute_correlations.png', tight_layout=True)
    plt.show()

    attr = rawdata['Age']
    sns.distplot(attr)
    plt.show()

    sns.distplot(attr, kde=False, fit=stats.gamma)
    plt.show()

    # Two subplots, the axes array is 1-d
    plt.figure(1)
    plt.subplot(211)  # first plot in a 2-row, 1-column grid
    plt.title('Histogram of Age')
    sns.distplot(attr)

    plt.subplot(212)  # second plot in a 2-row, 1-column grid
    sns.distplot(attr, kde=False, fit=stats.gamma)

    plt.show()
@@ -0,0 +1,83 @@
rm(list = ls())
# set the environment
path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
rawdata = read.csv(path)

# summary of the data
summary(rawdata)
# plot of the summary
plot(rawdata)

dim(rawdata)
head(rawdata)
tail(rawdata)

colnames(rawdata)
attach(rawdata)

# get numerical data and remove rows with NA values
numdata = na.omit(rawdata[, c(1:2, 4:12)])

cor(numdata)
cov(numdata)

dev.off()
# load the correlation matrix plot library
library(corrplot)
M <- cor(numdata)
# par(mfrow = c(1, 2))
# corrplot(M, method = "square")
corrplot.mixed(M)

nrow = nrow(rawdata)
ncol = ncol(rawdata)
c(nrow, ncol)

Nvars = ncol(numdata)
# checking data format
typeof(rawdata)
install.packages("mlbench")
library(mlbench)
sapply(rawdata, class)

dev.off()
name = colnames(numdata)
Nvars = ncol(numdata)
# boxplot
par(mfrow = c(4, 3))
for (i in 1:Nvars)
{
  # boxplot(numdata[,i]~numdata[,Nvars], data=data, main=name[i])
  boxplot(numdata[, i], data = numdata, main = name[i])
}

# Histogram with normal curve plot
dev.off()
Nvars = ncol(numdata)
name = colnames(numdata)
par(mfrow = c(3, 5))
for (i in 1:Nvars)
{
  x <- numdata[, i]
  h <- hist(x, breaks = 10, freq = TRUE, col = "blue", xlab = name[i], main = " ",
            font.lab = 1)
  axis(1, tck = 1, col.ticks = "light gray")
  axis(1, tck = -0.015, col.ticks = "black")
  axis(2, tck = 1, col.ticks = "light gray", lwd.ticks = "1")
  axis(2, tck = -0.015)
  xfit <- seq(min(x), max(x), length = 40)
  yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
  yfit <- yfit * diff(h$mids[1:2]) * length(x)
  lines(xfit, yfit, col = "blue", lwd = 2)
}


library(reshape2)
library(ggplot2)
d <- melt(diamonds[, -c(2:4)])
ggplot(d, aes(x = value)) +
  facet_wrap(~variable, scales = "free_x") +
  geom_histogram()
@@ -0,0 +1,20 @@
# Linear Regression

# Import Library
# Import other necessary libraries like pandas and numpy
from sklearn import linear_model
# Load Train and Test datasets
# Identify feature and response variable(s); values must be numeric numpy arrays
x_train = input_variables_values_training_datasets
y_train = target_variables_values_training_datasets
x_test = input_variables_values_test_datasets
# Create linear regression object
linear = linear_model.LinearRegression()
# Train the model using the training sets and check the score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)
# Equation coefficients and intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
# Predict output
predicted = linear.predict(x_test)