You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
108 lines
2.3 KiB
108 lines
2.3 KiB
'''
Created on Apr 25, 2016
test code
@author: Wenqiang Feng
'''
import matplotlib.pyplot as plt
#import numpy as np
import pandas as pd
from docutils.parsers.rst.directives import path

try:
    # Current location of scatter_matrix (pandas >= 0.20).
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only provide the original location.
    from pandas.tools.plotting import scatter_matrix


# Input files explored by this script (locations are specific to the
# original author's machine).
HEART_CSV = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
ENERGY_XLSX = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
               'energy_efficiency.xlsx')


def shape_line(frame):
    """Return the dimensions of *frame* formatted as ``"<rows> <cols>"``."""
    nrow, ncol = frame.shape
    # Explicit formatting gives the same text on Python 2 and Python 3
    # (a two-argument print() call would print a tuple on Python 2).
    return "%d %d" % (nrow, ncol)


def numeric_columns(frame):
    """Return the numeric and boolean columns of *frame*.

    Correlation and covariance are computed on this subset so that text
    columns cannot make ``corr()``/``cov()`` raise on pandas releases that
    no longer drop non-numeric columns silently.
    """
    return frame.select_dtypes(include=['number', 'bool'])


def print_overview(frame):
    """Print column names, dtypes, a sample and correlation/covariance tables.

    *frame* must contain the columns ``'Age'`` and ``'Ca'``; a ``KeyError``
    is raised by pandas otherwise.
    """
    print("Column names:")
    print(frame.columns.tolist())
    print("Data Format:")
    print(frame.dtypes)

    print("\nSample data:")
    print(frame.head(6))

    numeric = numeric_columns(frame)
    print("\n correlation Matrix")
    print(numeric.corr())

    print("\n covariance Matrix")
    print(numeric.cov())

    print(frame[['Age', 'Ca']].corr())


def show_basic_plots(frame):
    """Show the scatter matrix, histograms and box plot of *frame*.

    Each ``plt.show()`` call displays one figure before the next is drawn.
    """
    # summary plot of the data
    scatter_matrix(frame, figsize=[15, 15])
    plt.show()

    # Histogram
    frame.hist()
    plt.show()

    # boxplot
    frame.boxplot()
    plt.show()


def show_seaborn_plots(frame):
    """Show the correlation heatmap and the distribution plots of ``'Age'``."""
    # Imported here because only this part of the script needs them.
    from scipy import stats
    import seaborn as sns

    # Pair-wise correlations.  sns.corrplot is no longer available in
    # current seaborn; a heatmap of the correlation matrix with annotated
    # coefficients is used in its place.
    sns.heatmap(numeric_columns(frame).corr(), annot=True)
    plt.show()

    age = frame['Age']

    # NOTE: sns.distplot is deprecated in seaborn >= 0.11.  It is kept
    # because its successors (histplot/displot) have no built-in ``fit=``
    # overlay for the gamma fit below.
    sns.distplot(age)
    plt.show()

    sns.distplot(age, kde=False, fit=stats.gamma)
    plt.show()

    # Two stacked subplots.  The title is set on the figure: setting it
    # with plt.title() before the subplots exist attaches it to an implicit
    # full-figure axes instead of the subplot grid.
    fig, (ax_kde, ax_fit) = plt.subplots(2, 1)
    fig.suptitle('Histogram of Age')
    sns.distplot(age, ax=ax_kde)
    sns.distplot(age, kde=False, fit=stats.gamma, ax=ax_fit)
    plt.show()


def main():
    """Load the heart data set, print summaries and show exploratory plots."""
    rawdata = pd.read_csv(HEART_CSV)

    print("data summary")
    print(rawdata.describe())

    show_basic_plots(rawdata)

    print("Raw data size")
    print(shape_line(rawdata))

    # The sheet index is passed positionally because the keyword was
    # renamed from ``sheetname`` to ``sheet_name`` between pandas releases.
    # NOTE(review): the energy data is loaded but never used below; the
    # size printed next is still that of the heart data -- confirm intent.
    rawdata_energy = pd.read_excel(ENERGY_XLSX, 0)  # noqa: F841

    print(shape_line(rawdata))
    print_overview(rawdata)

    show_seaborn_plots(rawdata)


if __name__ == '__main__':
    main()