CH 5 Summary Statistics

Case Study : US Census


In [33]:
%matplotlib inline
from matplotlib import pylab as plt
plt.rcParams['figure.figsize'] = (15.0, 10.0)
import pandas as pd

In [3]:
data = pd.read_csv("978-3-319-12065-2/chapter-5/metropolitan.csv")

5.1.1 Dataset Size


In [4]:
data.shape #dim(data) in R


Out[4]:
(2759, 11)

In [5]:
len(data)#nrow(data) in R


Out[5]:
2759

5.1.2 Summarizing the Data


In [6]:
data.ix[:,0:3].head(6)


Out[6]:
NAME LSAD CENSUS2010POP
0 Abilene, TX Metropolitan Statistical Area 165252
1 Callahan County, TX County or equivalent 13544
2 Jones County, TX County or equivalent 20202
3 Taylor County, TX County or equivalent 131506
4 Akron, OH Metropolitan Statistical Area 703200
5 Portage County, OH County or equivalent 161419

In [7]:
data.describe()


Out[7]:
CENSUS2010POP NPOPCHG_2010 NATURALINC2010 BIRTHS2010 DEATHS2010 NETMIG2010 INTERNATIONALMIG2010 DOMESTICMIG2010 RESIDUAL2010
count 2759.000000 2759.000000 2759.000000 2759.000000 2759.000000 2759.000000 2759.000000 2759.000000 2759.000000
mean 239150.943820 475.017035 326.031171 773.950344 447.919174 148.305183 163.640449 -15.335266 0.680681
std 785448.702172 1832.513065 1379.131425 2623.272276 1312.072679 982.640175 794.041934 1210.348710 23.145641
min 416.000000 -5525.000000 -794.000000 0.000000 0.000000 -9786.000000 -2.000000 -25584.000000 -204.000000
25% 33258.000000 -10.000000 7.000000 98.000000 79.000000 -38.000000 3.000000 -62.000000 -3.000000
50% 62544.000000 54.000000 40.000000 184.000000 145.000000 15.000000 14.000000 0.000000 -1.000000
75% 156813.000000 257.500000 161.000000 483.000000 318.000000 123.000000 61.000000 73.000000 1.000000
max 18897109.000000 29631.000000 28129.000000 60868.000000 32739.000000 14574.000000 20199.000000 9416.000000 376.000000

In [8]:
data[data.NAME == "Carson City, NV"].ix[:,0:2]


Out[8]:
NAME LSAD
226 Carson City, NV Metropolitan Statistical Area
227 Carson City, NV County or equivalent

In [9]:
data[data.NAME == "Washington-Arlington-Alexandria, DC-VA-MD-WV"].ix[:,0:2]


Out[9]:
NAME LSAD
1418 Washington-Arlington-Alexandria, DC-VA-MD-WV Metropolitan Statistical Area
1422 Washington-Arlington-Alexandria, DC-VA-MD-WV Metropolitan Division

5.1.3 Ordering Data by a Variable


In [10]:
data.sort(['CENSUS2010POP']).head(6)


Out[10]:
NAME LSAD CENSUS2010POP NPOPCHG_2010 NATURALINC2010 BIRTHS2010 DEATHS2010 NETMIG2010 INTERNATIONALMIG2010 DOMESTICMIG2010 RESIDUAL2010
2088 Kenedy County, TX County or equivalent 416 3 1 1 0 2 0 2 0
2330 McPherson County, NE County or equivalent 539 1 1 1 0 0 0 0 0
2516 Banner County, NE County or equivalent 690 6 1 1 0 5 0 5 0
2329 Logan County, NE County or equivalent 763 4 -1 0 1 5 0 5 0
1793 Billings County, ND County or equivalent 783 -10 2 2 0 -11 0 -11 -1
2387 Roberts County, TX County or equivalent 929 -5 1 2 1 -6 0 -6 0

In [11]:
data.sort(['CENSUS2010POP'],ascending=[False]).head(6)


Out[11]:
NAME LSAD CENSUS2010POP NPOPCHG_2010 NATURALINC2010 BIRTHS2010 DEATHS2010 NETMIG2010 INTERNATIONALMIG2010 DOMESTICMIG2010 RESIDUAL2010
945 New York-Northern New Jersey-Long Island, NY-N... Metropolitan Statistical Area 18897109 22540 28129 60868 32739 -5385 20199 -25584 -204
789 Los Angeles-Long Beach-Santa Ana, CA Metropolitan Statistical Area 12828837 15534 22576 41178 18602 -7373 13472 -20845 331
961 New York-White Plains-Wayne, NY-NJ Metropolitan Division 11576251 15482 21371 40129 18758 -5831 15394 -21225 -58
791 Los Angeles County, CA County or equivalent 9818605 8168 17578 31907 14329 -9786 10568 -20354 376
790 Los Angeles-Long Beach-Glendale, CA Metropolitan Division 9818605 8168 17578 31907 14329 -9786 10568 -20354 376
270 Chicago-Joliet-Naperville, IL-IN-WI Metropolitan Statistical Area 9461105 11479 16148 32384 16236 -4720 6046 -10766 51

In [12]:
output = data.sort(['CENSUS2010POP'],ascending=[False])
data.ix[output.index[:10]].ix[:,0:2]


Out[12]:
NAME LSAD
945 New York-Northern New Jersey-Long Island, NY-N... Metropolitan Statistical Area
789 Los Angeles-Long Beach-Santa Ana, CA Metropolitan Statistical Area
961 New York-White Plains-Wayne, NY-NJ Metropolitan Division
791 Los Angeles County, CA County or equivalent
790 Los Angeles-Long Beach-Glendale, CA Metropolitan Division
270 Chicago-Joliet-Naperville, IL-IN-WI Metropolitan Statistical Area
271 Chicago-Joliet-Naperville, IL Metropolitan Division
367 Dallas-Fort Worth-Arlington, TX Metropolitan Statistical Area
1045 Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Metropolitan Statistical Area
599 Houston-Sugar Land-Baytown, TX Metropolitan Statistical Area

5.1.4 Group and Split Data by a Variable


In [13]:
data[data.LSAD == "Metropolitan Statistical Area"].head()


Out[13]:
NAME LSAD CENSUS2010POP NPOPCHG_2010 NATURALINC2010 BIRTHS2010 DEATHS2010 NETMIG2010 INTERNATIONALMIG2010 DOMESTICMIG2010 RESIDUAL2010
0 Abilene, TX Metropolitan Statistical Area 165252 417 228 609 381 190 77 113 -1
4 Akron, OH Metropolitan Statistical Area 703200 -332 310 1945 1635 -631 127 -758 -11
7 Albany, GA Metropolitan Statistical Area 157308 277 220 542 322 57 36 21 0
13 Albany-Schenectady-Troy, NY Metropolitan Statistical Area 870716 248 443 2293 1850 -184 408 -592 -11
19 Albuquerque, NM Metropolitan Statistical Area 887077 2893 1387 2938 1551 1499 468 1031 7

In [17]:
data.groupby(["LSAD"]).CENSUS2010POP.mean()


Out[17]:
LSAD
County or equivalent              161779
Metropolitan Division            2803269
Metropolitan Statistical Area     705786
Micropolitan Statistical Area      53721
Name: CENSUS2010POP, dtype: int64

In [25]:
data_split = data.groupby(["LSAD"])
for indx,groups in data_split:
    print groups.ix[:5].sort(ascending=True).ix[:,0:3]
#rework


                  NAME                  LSAD  CENSUS2010POP
1  Callahan County, TX  County or equivalent          13544
2     Jones County, TX  County or equivalent          20202
3    Taylor County, TX  County or equivalent         131506
5   Portage County, OH  County or equivalent         161419
Empty DataFrame
Columns: [NAME, LSAD, CENSUS2010POP]
Index: []
          NAME                           LSAD  CENSUS2010POP
0  Abilene, TX  Metropolitan Statistical Area         165252
4    Akron, OH  Metropolitan Statistical Area         703200
Empty DataFrame
Columns: [NAME, LSAD, CENSUS2010POP]
Index: []

In [26]:
data_split.groups.keys()


Out[26]:
['Metropolitan Division',
 'County or equivalent',
 'Micropolitan Statistical Area',
 'Metropolitan Statistical Area']

In [27]:
data_county = data_split.get_group('County or equivalent')
data_division = data_split.get_group('Metropolitan Division')
data_metro = data_split.get_group('Metropolitan Statistical Area')
data_micro = data_split.get_group('Micropolitan Statistical Area')

5.1.5 Variable Correlation


In [28]:
data["CENSUS2010POP"].corr(data["BIRTHS2010"])


Out[28]:
0.99516078090537374

In [34]:
import seaborn as sns
#sns.corrplot(data,method='spearman',diag_names=False)
sns.corrplot(data,diag_names=False)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91ca08e110>

In [35]:
data_metro.CENSUS2010POP.corr(data_metro.DOMESTICMIG2010)


Out[35]:
-0.52716858609671935

5.2 Getting Sense of Data Distribution

5.2.1 Box Plots


In [36]:
data_micro.boxplot("BIRTHS2010",showmeans = True)


/home/ubuntu/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.py:2633: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.
  warnings.warn(msg, FutureWarning)
Out[36]:
{'boxes': [<matplotlib.lines.Line2D at 0x7f91c9cbae90>],
 'caps': [<matplotlib.lines.Line2D at 0x7f91c9c48d10>,
  <matplotlib.lines.Line2D at 0x7f91c9c55390>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f91c9c60690>],
 'means': [<matplotlib.lines.Line2D at 0x7f91c9c60050>],
 'medians': [<matplotlib.lines.Line2D at 0x7f91c9c559d0>],
 'whiskers': [<matplotlib.lines.Line2D at 0x7f91c9cbac90>,
  <matplotlib.lines.Line2D at 0x7f91c9c486d0>]}

In [37]:
sns.boxplot(data_micro['BIRTHS2010'].values,showmeans = True)


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9cef9d0>

In [38]:
sns.boxplot(data_micro[['BIRTHS2010','DEATHS2010']].values,showmeans = True,names=['BIRTHS2010','DEATHS2010'])


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91ca0585d0>

In [39]:
sns.boxplot(data.BIRTHS2010,groupby = data.LSAD)


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9b4fb50>

5.2.2 Histograms


In [40]:
data_micro.BIRTHS2010.hist()


Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9941b50>

In [41]:
sns.distplot(data_micro.BIRTHS2010.values,kde=False)


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c994f4d0>

Rug


In [42]:
sns.distplot(data_micro.BIRTHS2010.values,rug=True,kde=False)


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9502e10>

In [43]:
sns.distplot(data_micro.BIRTHS2010.values)


Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c88709d0>

Cumulative


In [44]:
sns.distplot(data_micro.BIRTHS2010.values,kde=False,hist_kws={'cumulative':True})


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c94b5b90>

5.2.3 Mesuring Data Symmetry Using Skewness and Kurtosis


In [45]:
data_micro.skew(numeric_only=True)


Out[45]:
CENSUS2010POP           1.742990
NPOPCHG_2010            2.644012
NATURALINC2010          1.017018
BIRTHS2010              1.687774
DEATHS2010              1.554309
NETMIG2010              2.614688
INTERNATIONALMIG2010    4.497461
DOMESTICMIG2010         2.378099
RESIDUAL2010            0.922628
dtype: float64

In [46]:
data_metro.skew(numeric_only=True)


Out[46]:
CENSUS2010POP           6.704732
NPOPCHG_2010            4.737815
NATURALINC2010          6.132388
BIRTHS2010              6.496510
DEATHS2010              6.634178
NETMIG2010              3.305375
INTERNATIONALMIG2010    8.193531
DOMESTICMIG2010        -6.113914
RESIDUAL2010            3.183461
dtype: float64

In [49]:
data_micro.kurtosis()


Out[49]:
CENSUS2010POP            3.801310
NPOPCHG_2010            14.596497
NATURALINC2010           6.658625
BIRTHS2010               3.545330
DEATHS2010               3.863693
NETMIG2010              18.855522
INTERNATIONALMIG2010    32.138889
DOMESTICMIG2010         19.549023
RESIDUAL2010            14.659210
dtype: float64

I didn't find a gini function in Python library. So I wrote my own function :-)


In [50]:
import numpy as np

def gini_coeff(x):
    n = len(x)
    s = x.sum()
    r = np.argsort(np.argsort(-x))
    return 1 - (2.0 * (r*x).sum() + s)/(n*s)

gini_coeff(data_metro.CENSUS2010POP)


Out[50]:
0.6593656365060403

In [51]:
survey_data = pd.read_csv("978-3-319-12065-2/chapter-5/survey-fixed.csv")
survey_data.head()


Out[51]:
sex height weight handedness exercise smoke
0 Female 68.000000 158 Right Some Never
1 Male 70.000000 256 Left None Regul
2 Male 70.415094 204 Right None Occas
3 Male 63.000000 187 Right None Never
4 Male 65.000000 168 Right Some Never

In [52]:
survey_data.describe()


Out[52]:
height weight
count 237.000000 237.000000
mean 67.845457 172.932489
std 3.752505 62.131930
min 59.000000 0.000000
25% 65.274510 150.000000
50% 67.000000 169.000000
75% 70.415094 191.000000
max 79.000000 958.000000

In [53]:
sns.distplot(survey_data.weight,kde=False)


Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c8597650>

In [56]:
sns.boxplot(survey_data.weight.values,showmeans = True,names=["weight"])


Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91c825c450>