CH 5 Summary Statistics

Case Study : US Census



In [33]:

    
%matplotlib inline
from matplotlib import pylab as plt
plt.rcParams['figure.figsize'] = (15.0, 10.0)
import pandas as pd



In [3]:

    
data = pd.read_csv("978-3-319-12065-2/chapter-5/metropolitan.csv")

5.1.1 Dataset Size



In [4]:

    
data.shape #dim(data) in R









    Out[4]:





(2759, 11)



In [5]:

    
len(data)#nrow(data) in R









    Out[5]:





2759

5.1.2 Summarizing the Data



In [6]:

    
data.ix[:,0:3].head(6)









    Out[6]:






  
    
      
      NAME
      LSAD
      CENSUS2010POP
    
  
  
    
      0
      Abilene, TX
      Metropolitan Statistical Area
      165252
    
    
      1
      Callahan County, TX
      County or equivalent
      13544
    
    
      2
      Jones County, TX
      County or equivalent
      20202
    
    
      3
      Taylor County, TX
      County or equivalent
      131506
    
    
      4
      Akron, OH
      Metropolitan Statistical Area
      703200
    
    
      5
      Portage County, OH
      County or equivalent
      161419



In [7]:

    
data.describe()









    Out[7]:






  
    
      
      CENSUS2010POP
      NPOPCHG_2010
      NATURALINC2010
      BIRTHS2010
      DEATHS2010
      NETMIG2010
      INTERNATIONALMIG2010
      DOMESTICMIG2010
      RESIDUAL2010
    
  
  
    
      count
      2759.000000
      2759.000000
      2759.000000
      2759.000000
      2759.000000
      2759.000000
      2759.000000
      2759.000000
      2759.000000
    
    
      mean
      239150.943820
      475.017035
      326.031171
      773.950344
      447.919174
      148.305183
      163.640449
      -15.335266
      0.680681
    
    
      std
      785448.702172
      1832.513065
      1379.131425
      2623.272276
      1312.072679
      982.640175
      794.041934
      1210.348710
      23.145641
    
    
      min
      416.000000
      -5525.000000
      -794.000000
      0.000000
      0.000000
      -9786.000000
      -2.000000
      -25584.000000
      -204.000000
    
    
      25%
      33258.000000
      -10.000000
      7.000000
      98.000000
      79.000000
      -38.000000
      3.000000
      -62.000000
      -3.000000
    
    
      50%
      62544.000000
      54.000000
      40.000000
      184.000000
      145.000000
      15.000000
      14.000000
      0.000000
      -1.000000
    
    
      75%
      156813.000000
      257.500000
      161.000000
      483.000000
      318.000000
      123.000000
      61.000000
      73.000000
      1.000000
    
    
      max
      18897109.000000
      29631.000000
      28129.000000
      60868.000000
      32739.000000
      14574.000000
      20199.000000
      9416.000000
      376.000000



In [8]:

    
data[data.NAME == "Carson City, NV"].ix[:,0:2]









    Out[8]:






  
    
      
      NAME
      LSAD
    
  
  
    
      226
      Carson City, NV
      Metropolitan Statistical Area
    
    
      227
      Carson City, NV
      County or equivalent



In [9]:

    
data[data.NAME == "Washington-Arlington-Alexandria, DC-VA-MD-WV"].ix[:,0:2]









    Out[9]:






  
    
      
      NAME
      LSAD
    
  
  
    
      1418
      Washington-Arlington-Alexandria, DC-VA-MD-WV
      Metropolitan Statistical Area
    
    
      1422
      Washington-Arlington-Alexandria, DC-VA-MD-WV
      Metropolitan Division

5.1.3 Ordering Data by a Variable



In [10]:

    
data.sort(['CENSUS2010POP']).head(6)









    Out[10]:






  
    
      
      NAME
      LSAD
      CENSUS2010POP
      NPOPCHG_2010
      NATURALINC2010
      BIRTHS2010
      DEATHS2010
      NETMIG2010
      INTERNATIONALMIG2010
      DOMESTICMIG2010
      RESIDUAL2010
    
  
  
    
      2088
      Kenedy County, TX
      County or equivalent
      416
      3
      1
      1
      0
      2
      0
      2
      0
    
    
      2330
      McPherson County, NE
      County or equivalent
      539
      1
      1
      1
      0
      0
      0
      0
      0
    
    
      2516
      Banner County, NE
      County or equivalent
      690
      6
      1
      1
      0
      5
      0
      5
      0
    
    
      2329
      Logan County, NE
      County or equivalent
      763
      4
      -1
      0
      1
      5
      0
      5
      0
    
    
      1793
      Billings County, ND
      County or equivalent
      783
      -10
      2
      2
      0
      -11
      0
      -11
      -1
    
    
      2387
      Roberts County, TX
      County or equivalent
      929
      -5
      1
      2
      1
      -6
      0
      -6
      0



In [11]:

    
data.sort(['CENSUS2010POP'],ascending=[False]).head(6)









    Out[11]:






  
    
      
      NAME
      LSAD
      CENSUS2010POP
      NPOPCHG_2010
      NATURALINC2010
      BIRTHS2010
      DEATHS2010
      NETMIG2010
      INTERNATIONALMIG2010
      DOMESTICMIG2010
      RESIDUAL2010
    
  
  
    
      945
      New York-Northern New Jersey-Long Island, NY-N...
      Metropolitan Statistical Area
      18897109
      22540
      28129
      60868
      32739
      -5385
      20199
      -25584
      -204
    
    
      789
      Los Angeles-Long Beach-Santa Ana, CA
      Metropolitan Statistical Area
      12828837
      15534
      22576
      41178
      18602
      -7373
      13472
      -20845
      331
    
    
      961
      New York-White Plains-Wayne, NY-NJ
      Metropolitan Division
      11576251
      15482
      21371
      40129
      18758
      -5831
      15394
      -21225
      -58
    
    
      791
      Los Angeles County, CA
      County or equivalent
      9818605
      8168
      17578
      31907
      14329
      -9786
      10568
      -20354
      376
    
    
      790
      Los Angeles-Long Beach-Glendale, CA
      Metropolitan Division
      9818605
      8168
      17578
      31907
      14329
      -9786
      10568
      -20354
      376
    
    
      270
      Chicago-Joliet-Naperville, IL-IN-WI
      Metropolitan Statistical Area
      9461105
      11479
      16148
      32384
      16236
      -4720
      6046
      -10766
      51



In [12]:

    
output = data.sort(['CENSUS2010POP'],ascending=[False])
data.ix[output.index[:10]].ix[:,0:2]









    Out[12]:






  
    
      
      NAME
      LSAD
    
  
  
    
      945
      New York-Northern New Jersey-Long Island, NY-N...
      Metropolitan Statistical Area
    
    
      789
      Los Angeles-Long Beach-Santa Ana, CA
      Metropolitan Statistical Area
    
    
      961
      New York-White Plains-Wayne, NY-NJ
      Metropolitan Division
    
    
      791
      Los Angeles County, CA
      County or equivalent
    
    
      790
      Los Angeles-Long Beach-Glendale, CA
      Metropolitan Division
    
    
      270
      Chicago-Joliet-Naperville, IL-IN-WI
      Metropolitan Statistical Area
    
    
      271
      Chicago-Joliet-Naperville, IL
      Metropolitan Division
    
    
      367
      Dallas-Fort Worth-Arlington, TX
      Metropolitan Statistical Area
    
    
      1045
      Philadelphia-Camden-Wilmington, PA-NJ-DE-MD
      Metropolitan Statistical Area
    
    
      599
      Houston-Sugar Land-Baytown, TX
      Metropolitan Statistical Area

5.1.4 Group and Split Data by a Variable



In [13]:

    
data[data.LSAD == "Metropolitan Statistical Area"].head()









    Out[13]:






  
    
      
      NAME
      LSAD
      CENSUS2010POP
      NPOPCHG_2010
      NATURALINC2010
      BIRTHS2010
      DEATHS2010
      NETMIG2010
      INTERNATIONALMIG2010
      DOMESTICMIG2010
      RESIDUAL2010
    
  
  
    
      0
      Abilene, TX
      Metropolitan Statistical Area
      165252
      417
      228
      609
      381
      190
      77
      113
      -1
    
    
      4
      Akron, OH
      Metropolitan Statistical Area
      703200
      -332
      310
      1945
      1635
      -631
      127
      -758
      -11
    
    
      7
      Albany, GA
      Metropolitan Statistical Area
      157308
      277
      220
      542
      322
      57
      36
      21
      0
    
    
      13
      Albany-Schenectady-Troy, NY
      Metropolitan Statistical Area
      870716
      248
      443
      2293
      1850
      -184
      408
      -592
      -11
    
    
      19
      Albuquerque, NM
      Metropolitan Statistical Area
      887077
      2893
      1387
      2938
      1551
      1499
      468
      1031
      7



In [17]:

    
data.groupby(["LSAD"]).CENSUS2010POP.mean()









    Out[17]:





LSAD
County or equivalent              161779
Metropolitan Division            2803269
Metropolitan Statistical Area     705786
Micropolitan Statistical Area      53721
Name: CENSUS2010POP, dtype: int64



In [25]:

    
data_split = data.groupby(["LSAD"])
for indx,groups in data_split:
    print groups.ix[:5].sort(ascending=True).ix[:,0:3]
#rework









    



                  NAME                  LSAD  CENSUS2010POP
1  Callahan County, TX  County or equivalent          13544
2     Jones County, TX  County or equivalent          20202
3    Taylor County, TX  County or equivalent         131506
5   Portage County, OH  County or equivalent         161419
Empty DataFrame
Columns: [NAME, LSAD, CENSUS2010POP]
Index: []
          NAME                           LSAD  CENSUS2010POP
0  Abilene, TX  Metropolitan Statistical Area         165252
4    Akron, OH  Metropolitan Statistical Area         703200
Empty DataFrame
Columns: [NAME, LSAD, CENSUS2010POP]
Index: []



In [26]:

    
data_split.groups.keys()









    Out[26]:





['Metropolitan Division',
 'County or equivalent',
 'Micropolitan Statistical Area',
 'Metropolitan Statistical Area']



In [27]:

    
data_county = data_split.get_group('County or equivalent')
data_division = data_split.get_group('Metropolitan Division')
data_metro = data_split.get_group('Metropolitan Statistical Area')
data_micro = data_split.get_group('Micropolitan Statistical Area')

5.1.5 Variable Correlation



In [28]:

    
data["CENSUS2010POP"].corr(data["BIRTHS2010"])









    Out[28]:





0.99516078090537374



In [34]:

    
import seaborn as sns
#sns.corrplot(data,method='spearman',diag_names=False)
sns.corrplot(data,diag_names=False)









    Out[34]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91ca08e110>



In [35]:

    
data_metro.CENSUS2010POP.corr(data_metro.DOMESTICMIG2010)









    Out[35]:





-0.52716858609671935

5.2 Getting Sense of Data Distribution

5.2.1 Box Plots



In [36]:

    
data_micro.boxplot("BIRTHS2010",showmeans = True)









    



/home/ubuntu/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.py:2633: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.
  warnings.warn(msg, FutureWarning)






    Out[36]:





{'boxes': [<matplotlib.lines.Line2D at 0x7f91c9cbae90>],
 'caps': [<matplotlib.lines.Line2D at 0x7f91c9c48d10>,
  <matplotlib.lines.Line2D at 0x7f91c9c55390>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f91c9c60690>],
 'means': [<matplotlib.lines.Line2D at 0x7f91c9c60050>],
 'medians': [<matplotlib.lines.Line2D at 0x7f91c9c559d0>],
 'whiskers': [<matplotlib.lines.Line2D at 0x7f91c9cbac90>,
  <matplotlib.lines.Line2D at 0x7f91c9c486d0>]}



In [37]:

    
sns.boxplot(data_micro['BIRTHS2010'].values,showmeans = True)









    Out[37]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9cef9d0>



In [38]:

    
sns.boxplot(data_micro[['BIRTHS2010','DEATHS2010']].values,showmeans = True,names=['BIRTHS2010','DEATHS2010'])









    Out[38]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91ca0585d0>



In [39]:

    
sns.boxplot(data.BIRTHS2010,groupby = data.LSAD)









    Out[39]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9b4fb50>

5.2.2 Histograms



In [40]:

    
data_micro.BIRTHS2010.hist()









    Out[40]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9941b50>



In [41]:

    
sns.distplot(data_micro.BIRTHS2010.values,kde=False)









    Out[41]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c994f4d0>

Rug



In [42]:

    
sns.distplot(data_micro.BIRTHS2010.values,rug=True,kde=False)









    Out[42]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c9502e10>



In [43]:

    
sns.distplot(data_micro.BIRTHS2010.values)









    Out[43]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c88709d0>

Cumulative



In [44]:

    
sns.distplot(data_micro.BIRTHS2010.values,kde=False,hist_kws={'cumulative':True})









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c94b5b90>

5.2.3 Mesuring Data Symmetry Using Skewness and Kurtosis



In [45]:

    
data_micro.skew(numeric_only=True)









    Out[45]:





CENSUS2010POP           1.742990
NPOPCHG_2010            2.644012
NATURALINC2010          1.017018
BIRTHS2010              1.687774
DEATHS2010              1.554309
NETMIG2010              2.614688
INTERNATIONALMIG2010    4.497461
DOMESTICMIG2010         2.378099
RESIDUAL2010            0.922628
dtype: float64



In [46]:

    
data_metro.skew(numeric_only=True)









    Out[46]:





CENSUS2010POP           6.704732
NPOPCHG_2010            4.737815
NATURALINC2010          6.132388
BIRTHS2010              6.496510
DEATHS2010              6.634178
NETMIG2010              3.305375
INTERNATIONALMIG2010    8.193531
DOMESTICMIG2010        -6.113914
RESIDUAL2010            3.183461
dtype: float64



In [49]:

    
data_micro.kurtosis()









    Out[49]:





CENSUS2010POP            3.801310
NPOPCHG_2010            14.596497
NATURALINC2010           6.658625
BIRTHS2010               3.545330
DEATHS2010               3.863693
NETMIG2010              18.855522
INTERNATIONALMIG2010    32.138889
DOMESTICMIG2010         19.549023
RESIDUAL2010            14.659210
dtype: float64

I didn't find a gini function in Python library. So I wrote my own function :-)



In [50]:

    
import numpy as np

def gini_coeff(x):
    n = len(x)
    s = x.sum()
    r = np.argsort(np.argsort(-x))
    return 1 - (2.0 * (r*x).sum() + s)/(n*s)

gini_coeff(data_metro.CENSUS2010POP)









    Out[50]:





0.6593656365060403



In [51]:

    
survey_data = pd.read_csv("978-3-319-12065-2/chapter-5/survey-fixed.csv")
survey_data.head()









    Out[51]:






  
    
      
      sex
      height
      weight
      handedness
      exercise
      smoke
    
  
  
    
      0
      Female
      68.000000
      158
      Right
      Some
      Never
    
    
      1
      Male
      70.000000
      256
      Left
      None
      Regul
    
    
      2
      Male
      70.415094
      204
      Right
      None
      Occas
    
    
      3
      Male
      63.000000
      187
      Right
      None
      Never
    
    
      4
      Male
      65.000000
      168
      Right
      Some
      Never



In [52]:

    
survey_data.describe()



In [53]:

    
sns.distplot(survey_data.weight,kde=False)









    Out[53]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c8597650>



In [56]:

    
sns.boxplot(survey_data.weight.values,showmeans = True,names=["weight"])









    Out[56]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f91c825c450>

	NAME	LSAD	CENSUS2010POP
0	Abilene, TX	Metropolitan Statistical Area	165252
1	Callahan County, TX	County or equivalent	13544
2	Jones County, TX	County or equivalent	20202
3	Taylor County, TX	County or equivalent	131506
4	Akron, OH	Metropolitan Statistical Area	703200
5	Portage County, OH	County or equivalent	161419

	CENSUS2010POP	NPOPCHG_2010	NATURALINC2010	BIRTHS2010	DEATHS2010	NETMIG2010	INTERNATIONALMIG2010	DOMESTICMIG2010	RESIDUAL2010
count	2759.000000	2759.000000	2759.000000	2759.000000	2759.000000	2759.000000	2759.000000	2759.000000	2759.000000
mean	239150.943820	475.017035	326.031171	773.950344	447.919174	148.305183	163.640449	-15.335266	0.680681
std	785448.702172	1832.513065	1379.131425	2623.272276	1312.072679	982.640175	794.041934	1210.348710	23.145641
min	416.000000	-5525.000000	-794.000000	0.000000	0.000000	-9786.000000	-2.000000	-25584.000000	-204.000000
25%	33258.000000	-10.000000	7.000000	98.000000	79.000000	-38.000000	3.000000	-62.000000	-3.000000
50%	62544.000000	54.000000	40.000000	184.000000	145.000000	15.000000	14.000000	0.000000	-1.000000
75%	156813.000000	257.500000	161.000000	483.000000	318.000000	123.000000	61.000000	73.000000	1.000000
max	18897109.000000	29631.000000	28129.000000	60868.000000	32739.000000	14574.000000	20199.000000	9416.000000	376.000000

	NAME	LSAD
226	Carson City, NV	Metropolitan Statistical Area
227	Carson City, NV	County or equivalent

	NAME	LSAD
1418	Washington-Arlington-Alexandria, DC-VA-MD-WV	Metropolitan Statistical Area
1422	Washington-Arlington-Alexandria, DC-VA-MD-WV	Metropolitan Division

	NAME	LSAD	CENSUS2010POP	NPOPCHG_2010	NATURALINC2010	BIRTHS2010	DEATHS2010	NETMIG2010	DOMESTICMIG2010	RESIDUAL2010
2088	Kenedy County, TX	County or equivalent	416	3	1	1	0	2	2	0
2330	McPherson County, NE	County or equivalent	539	1	1	1	0	0	0	0
2516	Banner County, NE	County or equivalent	690	6	1	1	0	5	5	0
2329	Logan County, NE	County or equivalent	763	4	-1	0	1	5	5	0
1793	Billings County, ND	County or equivalent	783	-10	2	2	0	-11	-11	-1
2387	Roberts County, TX	County or equivalent	929	-5	1	2	1	-6	-6	0

	NAME	LSAD	CENSUS2010POP	NPOPCHG_2010	NATURALINC2010	BIRTHS2010	DEATHS2010	NETMIG2010	INTERNATIONALMIG2010	DOMESTICMIG2010	RESIDUAL2010
945	New York-Northern New Jersey-Long Island, NY-N...	Metropolitan Statistical Area	18897109	22540	28129	60868	32739	-5385	20199	-25584	-204
789	Los Angeles-Long Beach-Santa Ana, CA	Metropolitan Statistical Area	12828837	15534	22576	41178	18602	-7373	13472	-20845	331
961	New York-White Plains-Wayne, NY-NJ	Metropolitan Division	11576251	15482	21371	40129	18758	-5831	15394	-21225	-58
791	Los Angeles County, CA	County or equivalent	9818605	8168	17578	31907	14329	-9786	10568	-20354	376
790	Los Angeles-Long Beach-Glendale, CA	Metropolitan Division	9818605	8168	17578	31907	14329	-9786	10568	-20354	376
270	Chicago-Joliet-Naperville, IL-IN-WI	Metropolitan Statistical Area	9461105	11479	16148	32384	16236	-4720	6046	-10766	51

	sex	height	weight	handedness	exercise	smoke
0	Female	68.000000	158	Right	Some	Never
1	Male	70.000000	256	Left	None	Regul
2	Male	70.415094	204	Right	None	Occas
3	Male	63.000000	187	Right	None	Never
4	Male	65.000000	168	Right	Some	Never

	height	weight
count	237.000000	237.000000
mean	67.845457	172.932489
std	3.752505	62.131930
min	59.000000	0.000000
25%	65.274510	150.000000
50%	67.000000	169.000000
75%	70.415094	191.000000
max	79.000000	958.000000