Christina Theodoridou
6 years ago
6 changed files with 432 additions and 9 deletions
@@ -0,0 +1,129 @@
#Linear Regression

#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import linear_model

#Load Train and Test datasets
#Identify feature and response variable(s); values must be numeric numpy arrays
x_train = input_variables_values_training_datasets
y_train = target_variables_values_training_datasets
x_test = input_variables_values_test_datasets

# Create linear regression object
linear = linear_model.LinearRegression()

# Train the model using the training sets and check score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)

#Equation coefficient and intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)

#Predict Output
predicted = linear.predict(x_test)

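# A minimal runnable sketch of the snippet above. The placeholder names
# (input_variables_values_training_datasets etc.) stand in for your own data;
# the arrays below are made up purely for illustration.
import numpy as np
from sklearn import linear_model

x_train = np.array([[1.0], [2.0], [3.0], [4.0]])   # y is roughly 2x + 1
y_train = np.array([3.1, 4.9, 7.2, 8.8])
x_test = np.array([[5.0], [6.0]])

linear = linear_model.LinearRegression()
linear.fit(x_train, y_train)
print(linear.score(x_train, y_train))    # R^2 on the training data
print(linear.coef_, linear.intercept_)   # fitted slope and intercept
print(linear.predict(x_test))            # predictions near 11 and 13
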
#Logistic Regression

#Import Library
from sklearn.linear_model import LogisticRegression

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

#Equation coefficient and intercept
print('Coefficient: \n', model.coef_)
print('Intercept: \n', model.intercept_)

#Predict Output
predicted = model.predict(x_test)

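# A minimal runnable sketch of the logistic-regression snippet, with made-up
# X, y, and x_test arrays standing in for the assumed training/test data.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.5], [1.0], [1.5], [3.0], [3.5], [4.0]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[1.2], [3.8]])

model = LogisticRegression()
model.fit(X, y)
print(model.score(X, y))             # mean accuracy on the training data
print(model.predict(x_test))         # predicted class labels
print(model.predict_proba(x_test))   # per-class probabilities
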
#Decision Tree

#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import tree

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create tree object (for classification)
# The split criterion can be 'gini' (the default) or 'entropy' (information gain)
model = tree.DecisionTreeClassifier(criterion='gini')

# model = tree.DecisionTreeRegressor() for regression

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

#Predict Output
predicted = model.predict(x_test)

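# A minimal runnable sketch of the decision-tree snippet, here using the
# 'entropy' (information gain) criterion on made-up data.
import numpy as np
from sklearn import tree

X = np.array([[1.0], [1.5], [2.0], [2.5], [3.0], [3.5]])
y = np.array([0, 0, 0, 1, 1, 1])   # class 1 once the feature exceeds ~2
x_test = np.array([[1.8], [3.2]])

model = tree.DecisionTreeClassifier(criterion='entropy')
model.fit(X, y)
print(model.score(X, y))       # 1.0 on this trivially separable data
print(model.predict(x_test))   # [0 1]
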
#SVM

#Import Library
from sklearn import svm

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create SVM classification object
# There are various options associated with it; this is a simple classifier.
# See the scikit-learn documentation for more detail.
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

#Predict Output
predicted = model.predict(x_test)

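# A minimal runnable sketch of the SVM snippet on made-up data; kernel and C
# are written out explicitly here (these are the scikit-learn defaults).
import numpy as np
from sklearn import svm

X = np.array([[0, 0], [1, 1], [1, 0], [4, 4], [5, 5], [4, 5]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[0.5, 0.5], [4.5, 4.5]])

model = svm.SVC(kernel='rbf', C=1.0)   # note the class is SVC, not svc
model.fit(X, y)
print(model.score(X, y))
print(model.predict(x_test))   # [0 1]
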
# Naive Bayes

#Import Library
from sklearn.naive_bayes import GaussianNB

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create Gaussian Naive Bayes object
# There are other variants for other feature distributions, e.g. MultinomialNB and BernoulliNB
model = GaussianNB()

# Train the model using the training sets and check score
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

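# A minimal runnable sketch of the Gaussian Naive Bayes snippet on made-up
# data; swap in MultinomialNB or BernoulliNB for count or binary features.
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[1.0, 2.0], [1.2, 1.8], [3.0, 4.0], [3.2, 4.2]])
y = np.array([0, 0, 1, 1])
x_test = np.array([[1.1, 2.1], [3.1, 4.1]])

model = GaussianNB()
model.fit(X, y)
print(model.predict(x_test))         # predicted class labels
print(model.predict_proba(x_test))   # per-class probabilities
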
#kNN (k-Nearest Neighbors)

#Import Library
from sklearn.neighbors import KNeighborsClassifier

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create KNeighbors classifier object
model = KNeighborsClassifier(n_neighbors=6)  # default value for n_neighbors is 5

# Train the model using the training sets and check score
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

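# A minimal runnable sketch of the kNN snippet; n_neighbors=3 suits this
# tiny made-up dataset better than the default of 5.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0], [1], [2], [8], [9], [10]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[1.5], [8.5]])

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y)
print(model.predict(x_test))   # majority vote of the 3 nearest neighbors
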
#K-Means

#Import Library
from sklearn.cluster import KMeans

#Assumes you have X (attributes) for the training data set and x_test (attributes) for the test dataset

# Create K-Means clustering object
k_means = KMeans(n_clusters=3, random_state=0)

# Train the model using the training set
k_means.fit(X)

#Predict Output
predicted = k_means.predict(x_test)

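# A minimal runnable sketch of the K-Means snippet on made-up data with two
# obvious clusters (n_clusters=2 here instead of the 3 used above).
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 1], [1.5, 2], [1, 2], [8, 8], [8.5, 9], [9, 8]])

k_means = KMeans(n_clusters=2, random_state=0, n_init=10)
k_means.fit(X)
print(k_means.labels_)            # cluster assignment of each point
print(k_means.cluster_centers_)   # the two centroids
print(k_means.predict(np.array([[0.5, 1.5], [9.0, 9.0]])))   # nearest centroid
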
#Random Forest

#Import Library
from sklearn.ensemble import RandomForestClassifier

#Assumes you have X (predictor) and y (target) for the training data set and x_test (predictor) for the test dataset

# Create Random Forest object
model = RandomForestClassifier()

# Train the model using the training sets and check score
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

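# A minimal runnable sketch of the random-forest snippet on made-up data;
# n_estimators and random_state are set explicitly for reproducibility.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.array([[0, 0], [1, 1], [0, 1], [5, 5], [6, 6], [5, 6]])
y = np.array([0, 0, 0, 1, 1, 1])
x_test = np.array([[0.5, 0.5], [5.5, 5.5]])

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X, y)
print(model.score(X, y))
print(model.predict(x_test))   # [0 1]
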
#Dimensionality Reduction Algorithms

#Import Library
from sklearn import decomposition

#Assumes you have training and test data sets as train and test

# Create PCA object
pca = decomposition.PCA(n_components=k)  # default value of k is min(n_samples, n_features)

# For factor analysis:
# fa = decomposition.FactorAnalysis()

# Reduce the dimension of the training dataset using PCA
train_reduced = pca.fit_transform(train)

# Reduce the dimension of the test dataset
test_reduced = pca.transform(test)

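# A minimal runnable sketch of the PCA snippet; train and test are made-up
# random matrices, and k is fixed at 2 components for illustration.
import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
train = rng.rand(20, 3)
test = rng.rand(5, 3)

pca = decomposition.PCA(n_components=2)
train_reduced = pca.fit_transform(train)
test_reduced = pca.transform(test)
print(pca.explained_variance_ratio_)             # variance per component
print(train_reduced.shape, test_reduced.shape)   # (20, 2) (5, 2)
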
@@ -0,0 +1,85 @@
#Linear Regression

#Load Train and Test datasets
#Identify feature and response variable(s); values must be numeric
x_train <- input_variables_values_training_datasets
y_train <- target_variables_values_training_datasets
x_test <- input_variables_values_test_datasets
x <- cbind(x_train, y_train)

# Train the model using the training sets and check score
linear <- lm(y_train ~ ., data = x)
summary(linear)

#Predict Output
predicted <- predict(linear, x_test)

#Logistic Regression

x <- cbind(x_train, y_train)

# Train the model using the training sets and check score
logistic <- glm(y_train ~ ., data = x, family = 'binomial')
summary(logistic)

#Predict Output
predicted <- predict(logistic, x_test, type = 'response')  # 'response' returns probabilities rather than log-odds

#Decision Tree

library(rpart)
x <- cbind(x_train, y_train)

# grow tree
fit <- rpart(y_train ~ ., data = x, method = "class")
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

#SVM

library(e1071)
x <- cbind(x_train, y_train)

# Fitting model
fit <- svm(y_train ~ ., data = x)
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

# Naive Bayes

library(e1071)
x <- cbind(x_train, y_train)

# Fitting model
fit <- naiveBayes(y_train ~ ., data = x)
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

#kNN (k-Nearest Neighbors)

library(class)  # knn() is provided by the class package; there is no package named 'knn'

# Fitting model and predicting in one step:
# class::knn takes the training features, test features, and training labels directly
predicted <- knn(train = x_train, test = x_test, cl = y_train, k = 5)
summary(predicted)

#K-Means

library(cluster)
fit <- kmeans(X, 3)  # 3 cluster solution

#Random Forest

library(randomForest)
x <- cbind(x_train, y_train)

# Fitting model
fit <- randomForest(y_train ~ ., data = x, ntree = 500)
summary(fit)

#Predict Output
predicted <- predict(fit, x_test)

#Dimensionality Reduction Algorithms

library(stats)
pca <- princomp(train, cor = TRUE)
train_reduced <- predict(pca, train)
test_reduced <- predict(pca, test)

@@ -0,0 +1,108 @@
'''
Created on Apr 25, 2016

test code

@author: Wenqiang Feng
'''
import pandas as pd
#import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas releases

if __name__ == '__main__':
    path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
    rawdata = pd.read_csv(path)

    print("data summary")
    print(rawdata.describe())

    # summary plot of the data
    scatter_matrix(rawdata, figsize=[15, 15])
    plt.show()

    # Histogram
    rawdata.hist()
    plt.show()

    # boxplot
    rawdata.boxplot()
    plt.show()

    print("Raw data size")
    nrow, ncol = rawdata.shape
    print(nrow, ncol)

    path = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
            'energy_efficiency.xlsx')
    print(path)

    rawdataEnergy = pd.read_excel(path, sheet_name=0)

    nrow = rawdata.shape[0]   # number of rows
    ncol = rawdata.shape[1]   # number of columns
    print(nrow, ncol)
    col_names = rawdata.columns.tolist()
    print("Column names:")
    print(col_names)
    print("Data Format:")
    print(rawdata.dtypes)

    print("\nSample data:")
    print(rawdata.head(6))

    print("\ncorrelation Matrix")
    print(rawdata.corr())

    # correlation Matrix plot
    plt.matshow(rawdata.corr())  # display the correlation matrix as an image
    plt.show()

    print("\ncovariance Matrix")
    print(rawdata.cov())

    print(rawdata[['Age', 'Ca']].corr())

    # define colors list, to be used to plot survived either red (=0) or green (=1)
    colors = ['red', 'green']

    # make a scatter plot

    # rawdata.info()

    from scipy import stats
    import seaborn as sns  # sns is the conventional alias
    # compute and plot the pair-wise correlations; sns.corrplot existed in old
    # seaborn releases, and a heatmap of corr() is the modern equivalent
    sns.heatmap(rawdata.corr())
    # save to file, remove the big white borders
    #plt.savefig('attribute_correlations.png', tight_layout=True)
    plt.show()

    attr = rawdata['Age']
    sns.distplot(attr)
    plt.show()

    sns.distplot(attr, kde=False, fit=stats.gamma)
    plt.show()

    # Two subplots on one figure
    plt.figure(1)
    plt.subplot(211)  # first panel of 2 rows, 1 col
    plt.title('Histogram of Age')
    sns.distplot(attr)

    plt.subplot(212)  # second panel of 2 rows, 1 col
    sns.distplot(attr, kde=False, fit=stats.gamma)

    plt.show()

@@ -0,0 +1,83 @@
rm(list = ls())
# set the environment
path <- '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
rawdata <- read.csv(path)

# summary of the data
summary(rawdata)
# plot of the summary
plot(rawdata)

dim(rawdata)
head(rawdata)
tail(rawdata)

colnames(rawdata)
attach(rawdata)

# get the numeric columns and remove NAs
numdata <- na.omit(rawdata[, c(1:2, 4:12)])

cor(numdata)
cov(numdata)

dev.off()
# load the correlation matrix plot library
library(corrplot)
M <- cor(numdata)
#par(mfrow = c(1, 2))
#corrplot(M, method = "square")
corrplot.mixed(M)

nrow <- nrow(rawdata)
ncol <- ncol(rawdata)
c(nrow, ncol)

Nvars <- ncol(numdata)
# checking data format
typeof(rawdata)
install.packages("mlbench")
library(mlbench)
sapply(rawdata, class)

dev.off()
name <- colnames(numdata)
Nvars <- ncol(numdata)
# boxplot
par(mfrow = c(4, 3))
for (i in 1:Nvars)
{
  #boxplot(numdata[, i] ~ numdata[, Nvars], data = numdata, main = name[i])
  boxplot(numdata[, i], main = name[i])
}

# Histogram with normal curve plot
dev.off()
Nvars <- ncol(numdata)
name <- colnames(numdata)
par(mfrow = c(3, 5))
for (i in 1:Nvars)
{
  x <- numdata[, i]
  h <- hist(x, breaks = 10, freq = TRUE, col = "blue", xlab = name[i], main = " ",
            font.lab = 1)
  axis(1, tck = 1, col.ticks = "light gray")
  axis(1, tck = -0.015, col.ticks = "black")
  axis(2, tck = 1, col.ticks = "light gray", lwd.ticks = "1")
  axis(2, tck = -0.015)
  xfit <- seq(min(x), max(x), length = 40)
  yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
  yfit <- yfit * diff(h$mids[1:2]) * length(x)
  lines(xfit, yfit, col = "blue", lwd = 2)
}


library(reshape2)
library(ggplot2)
# histograms of the numeric diamonds columns, one facet per variable
d <- melt(diamonds[, -c(2:4)])
ggplot(d, aes(x = value)) +
  facet_wrap(~ variable, scales = "free_x") +
  geom_histogram()

@@ -0,0 +1,20 @@
#Linear Regression

#Import Library
#Import other necessary libraries like pandas, numpy...
from sklearn import linear_model

#Load Train and Test datasets
#Identify feature and response variable(s); values must be numeric numpy arrays
x_train = input_variables_values_training_datasets
y_train = target_variables_values_training_datasets
x_test = input_variables_values_test_datasets

# Create linear regression object
linear = linear_model.LinearRegression()

# Train the model using the training sets and check score
linear.fit(x_train, y_train)
linear.score(x_train, y_train)

#Equation coefficient and intercept
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)

#Predict Output
predicted = linear.predict(x_test)