In [33]:
%matplotlib inline
from matplotlib import pylab as plt
plt.rcParams['figure.figsize'] = (15.0, 10.0)
import pandas as pd
In [3]:
# Read metropolitan.csv into a DataFrame; the path is relative to the working directory.
data = pd.read_csv("978-3-319-12065-2/chapter-5/metropolitan.csv")
In [4]:
# Dimensions as (rows, columns); the R equivalent is dim(data).
data.shape
Out[4]:
In [5]:
# Number of rows; the R equivalent is nrow(data).
len(data)
Out[5]:
In [6]:
# First six rows of the first three columns.
# .ix was removed in pandas 1.0; .iloc is the explicit positional indexer.
data.iloc[:, 0:3].head(6)
Out[6]:
In [7]:
# Summary statistics; by default describe() reports on the numeric columns.
data.describe()
Out[7]:
In [8]:
# First two columns of the row(s) named "Carson City, NV".
# .iloc replaces the removed .ix indexer for positional column selection.
data[data.NAME == "Carson City, NV"].iloc[:, 0:2]
Out[8]:
In [9]:
# First two columns of the row(s) for the Washington metro area.
# .iloc replaces the removed .ix indexer for positional column selection.
data[data.NAME == "Washington-Arlington-Alexandria, DC-VA-MD-WV"].iloc[:, 0:2]
Out[9]:
In [10]:
# Six rows with the smallest CENSUS2010POP.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
data.sort_values('CENSUS2010POP').head(6)
Out[10]:
In [11]:
# Six rows with the largest CENSUS2010POP.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
data.sort_values('CENSUS2010POP', ascending=False).head(6)
Out[11]:
In [12]:
# Rows ordered from largest to smallest CENSUS2010POP (sort_values replaces the removed .sort).
output = data.sort_values('CENSUS2010POP', ascending=False)
# First two columns of the ten largest rows. Slicing the sorted frame by position
# avoids looking the same labels up again in `data` through the removed .ix indexer.
output.iloc[:10, 0:2]
Out[12]:
In [13]:
# Preview the rows whose LSAD is "Metropolitan Statistical Area".
data.loc[data["LSAD"] == "Metropolitan Statistical Area"].head()
Out[13]:
In [17]:
# Mean CENSUS2010POP within each LSAD category.
data.groupby("LSAD")["CENSUS2010POP"].mean()
Out[17]:
In [25]:
# Split the frame by LSAD. A scalar key (rather than a one-element list) keeps
# the group labels as plain strings on current pandas.
data_split = data.groupby("LSAD")
for lsad, group in data_split:
    # The previous `.ix[:5]` sliced by index label, so any group whose labels
    # all lie above 5 selected nothing. head(5) takes the first five rows of
    # each group by position, then the first three columns are shown.
    print(group.head(5).sort_index().iloc[:, 0:3])
In [26]:
# Labels of the groups produced by splitting on LSAD.
data_split.groups.keys()
Out[26]:
In [27]:
# One sub-frame per LSAD category, extracted from the grouped split in a fixed order.
data_county, data_division, data_metro, data_micro = (
    data_split.get_group(lsad_label)
    for lsad_label in (
        'County or equivalent',
        'Metropolitan Division',
        'Metropolitan Statistical Area',
        'Micropolitan Statistical Area',
    )
)
In [28]:
# Correlation between CENSUS2010POP and BIRTHS2010 over all rows (Series.corr defaults to Pearson).
data.CENSUS2010POP.corr(data.BIRTHS2010)
Out[28]:
In [34]:
import seaborn as sns
# sns.corrplot no longer exists in seaborn, so the correlation matrix is
# computed with pandas and drawn as an annotated heatmap instead.
# Text columns (e.g. NAME, LSAD) cannot be correlated and are dropped first.
# Pass method='spearman' to .corr() for rank correlation.
corr_matrix = data.select_dtypes(include='number').corr()
# Fix the colour scale to [-1, 1] so that zero correlation sits at the midpoint.
sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap='RdBu_r', annot=True, fmt='.2f', square=True)
Out[34]:
In [35]:
# Correlation between CENSUS2010POP and DOMESTICMIG2010 within the metropolitan subset.
data_metro["CENSUS2010POP"].corr(data_metro["DOMESTICMIG2010"])
Out[35]:
In [36]:
# Box plot of BIRTHS2010 for the micropolitan subset, with the mean marked.
data_micro.boxplot(column="BIRTHS2010", showmeans=True)
Out[36]:
In [37]:
# Same box plot drawn with seaborn from the raw BIRTHS2010 array; showmeans=True marks the mean.
sns.boxplot(data_micro['BIRTHS2010'].values,showmeans = True)
Out[37]:
In [38]:
# Side-by-side box plots of births and deaths for the micropolitan subset.
# The `names=` keyword was removed from seaborn; passing the DataFrame itself
# (wide form) labels each box with its column name.
sns.boxplot(data=data_micro[['BIRTHS2010', 'DEATHS2010']], showmeans=True)
Out[38]:
In [39]:
# One BIRTHS2010 box per LSAD category.
# The `groupby=` keyword was removed from seaborn; grouping is now expressed
# by mapping the category column to x and the value column to y.
sns.boxplot(x='LSAD', y='BIRTHS2010', data=data)
Out[39]:
In [40]:
# Histogram of BIRTHS2010 for the micropolitan subset, drawn by pandas.
data_micro["BIRTHS2010"].hist()
Out[40]:
In [41]:
# Plain histogram of micropolitan births.
# distplot is deprecated in seaborn; histplot is its replacement for histograms.
sns.histplot(data_micro.BIRTHS2010.values, kde=False)
Out[41]:
In [42]:
# Histogram with a rug of the individual observations along the axis.
# distplot(rug=True) is deprecated; histplot and rugplot are combined on one Axes instead.
births_ax = sns.histplot(data_micro.BIRTHS2010.values, kde=False)
sns.rugplot(data_micro.BIRTHS2010.values, ax=births_ax)
Out[42]:
In [43]:
# Density-normalised histogram with a kernel density estimate overlaid,
# matching what the deprecated distplot drew by default.
sns.histplot(data_micro.BIRTHS2010.values, kde=True, stat='density')
Out[43]:
In [44]:
# Cumulative histogram of micropolitan births.
# histplot exposes `cumulative` directly, replacing distplot's hist_kws pass-through.
sns.histplot(data_micro.BIRTHS2010.values, kde=False, cumulative=True)
Out[44]:
In [45]:
# Skewness of each numeric column in the micropolitan subset (numeric_only=True skips text columns).
data_micro.skew(numeric_only=True)
Out[45]:
In [46]:
# Skewness of each numeric column in the metropolitan subset (numeric_only=True skips text columns).
data_metro.skew(numeric_only=True)
Out[46]:
In [49]:
# Kurtosis of each numeric column in the micropolitan subset.
# numeric_only=True matches the skew() calls and is required on pandas 2+,
# where reductions no longer silently drop text columns such as NAME and LSAD.
data_micro.kurtosis(numeric_only=True)
Out[49]:
I couldn't find a Gini coefficient function in the Python libraries, so I wrote my own :-)
In [50]:
import numpy as np
def gini_coeff(x):
    """Return the Gini coefficient of the values in ``x``.

    Uses the rank-weighted closed form::

        G = 1 - (2 * sum(r_i * x_i) + sum(x)) / (n * sum(x))

    where ``r_i`` is the zero-based rank of ``x_i`` in descending order
    (the largest value has rank 0).

    Parameters
    ----------
    x : array-like
        One-dimensional collection of numbers: a pandas Series, NumPy array,
        list or tuple. The measure is intended for non-negative values.

    Returns
    -------
    float
        0 when all values are equal, and ``1 - 1/n`` when a single element
        holds the whole total. ``nan`` when ``x`` is empty or sums to zero,
        because the coefficient is undefined there.
    """
    # Work on a flat float array so that lists and tuples are accepted,
    # unsigned integers cannot wrap when negated, and the arithmetic never
    # depends on pandas index alignment.
    values = np.asarray(x, dtype=float).ravel()
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        return float('nan')
    # The argsort of an argsort yields each element's rank; negating the
    # values first makes rank 0 the largest element.
    ranks = np.argsort(np.argsort(-values))
    return 1 - (2.0 * (ranks * values).sum() + total) / (n * total)
# Gini coefficient of the CENSUS2010POP column over the Metropolitan Statistical Area rows.
gini_coeff(data_metro.CENSUS2010POP)
Out[50]:
In [51]:
# Read survey-fixed.csv into a DataFrame (path relative to the working directory)
# and preview its first rows.
survey_data = pd.read_csv("978-3-319-12065-2/chapter-5/survey-fixed.csv")
survey_data.head()
Out[51]:
In [52]:
# Summary statistics for the survey data; by default describe() reports on the numeric columns.
survey_data.describe()
Out[52]:
In [53]:
# Histogram of the survey `weight` column.
# distplot is deprecated in seaborn; histplot is its replacement for histograms.
sns.histplot(survey_data.weight, kde=False)
Out[53]:
In [56]:
# Box plot of the survey `weight` column with the mean marked.
# The `names=` keyword was removed from seaborn; passing a one-column
# DataFrame labels the box with the column name instead.
sns.boxplot(data=survey_data[['weight']], showmeans=True)
Out[56]: