notebook.community

Edit and run



In [1]:

    
"""
My code is even better than the author's code (at least this is true for Chapter 1)
Let's create more fun in statistics learning!
"""
import pandas as pd
import numpy as np



In [5]:

    
# Load the two raw CSV extracts from the working directory.
# Pregnancy file: the columns shown by head() include caseid, outcome, prglngth, birthord.
preg_raw = pd.read_csv(filepath_or_buffer="2002FemPregOut.csv")
# Presumably the respondent-level file ("Resp" in the file name) -- TODO confirm.
# NOTE(review): `reps_raw` looks like a typo for `resp_raw`; the name is kept as-is
# because other cells may refer to it.
reps_raw = pd.read_csv(filepath_or_buffer="2002FemRespOut.csv")



In [6]:

    
# Preview the first five rows to check the columns and value formats.
preg_raw.head(n=5)









    Out[6]:







  
    
      
      caseid
      nbrnaliv
      babysex
      birthwgt_lb
      birthwgt_oz
      prglngth
      outcome
      birthord
      agepreg
      finalwgt
    
  
  
    
      0
      1
      1.0
      1.0
      8.0
      13.0
      39
      1
      1.0
      3316.0
      6448.271112
    
    
      1
      1
      1.0
      2.0
      7.0
      14.0
      39
      1
      2.0
      3925.0
      6448.271112
    
    
      2
      2
      3.0
      1.0
      9.0
      2.0
      39
      1
      1.0
      1433.0
      12999.542264
    
    
      3
      2
      1.0
      2.0
      7.0
      0.0
      39
      1
      2.0
      1783.0
      12999.542264
    
    
      4
      2
      1.0
      2.0
      6.0
      3.0
      39
      1
      3.0
      1833.0
      12999.542264



In [9]:

    
# Count missing values per column, then show the frame's (rows, columns) shape.
# nbrnaliv and birthord are each missing in 4445 rows -- presumably the
# pregnancies that did not end in a live birth (TODO confirm against `outcome`).
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(preg_raw.isnull().sum())
print(preg_raw.shape)









    



caseid            0
nbrnaliv       4445
babysex        4449
birthwgt_lb    4449
birthwgt_oz    4506
prglngth          0
outcome           0
birthord       4445
agepreg         352
finalwgt          0
dtype: int64
(13593, 10)



In [10]:

    
# Keep only the rows that have no missing value in any column.
# dropna() removes a row when *any* column is NaN, so at least 4506 rows (the
# birthwgt_oz gaps) are dropped -- possibly more, because agepreg is missing in
# 352 rows as well. The shape printed below gives the actual remaining count.
# NOTE: this rebinds `preg_raw`; the unfiltered frame is only recoverable by
# re-running the CSV-loading cell.
preg_raw = preg_raw.dropna()
# print(...) with a single argument works on both Python 2 and Python 3.
print(preg_raw.shape)



In [15]:

    
# List the distinct birth-order values in the data (1 through 10).
# birthord appears to be the position of a birth among the mother's births, not a
# count of births in 2002: in the head() output, caseid 1 has birthord 1 and 2 at
# different agepreg values. A value of 10 would therefore mean a tenth child.
preg_raw.birthord.unique()









    Out[15]:





array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.])



In [25]:

    
# Summarise pregnancy length (prglngth) per birth-order group: mean, median and
# variance, ordered from the highest mean to the lowest.
grouped_prglngth_ct = (
    preg_raw
    .groupby(['birthord'])['prglngth']
    .agg(['mean', 'median', 'var'])
    .sort_values(by='mean', ascending=False)
)
grouped_prglngth_ct



In [ ]:

    
## It seems that first-borns take slightly longer to arrive, but only a few hours more on average (mean 38.61 vs. 38.57 for second-borns)
## The median pregnancy length for first-borns (39) is the same as for 2nd- to 8th-borns

	mean	median	var
birthord
1.0	38.606254	39.0	7.809997
2.0	38.572527	39.0	6.629441
3.0	38.520000	39.0	5.972026
4.0	38.441247	39.0	7.612525
5.0	38.392000	39.0	6.127355
7.0	38.105263	39.0	5.432749
6.0	38.040816	39.0	21.498299
8.0	37.571429	39.0	7.952381
9.0	37.500000	37.5	4.500000
10.0	36.000000	36.0	NaN

	caseid	nbrnaliv	babysex	birthwgt_lb	birthwgt_oz	prglngth	outcome	birthord	agepreg	finalwgt
0	1	1.0	1.0	8.0	13.0	39	1	1.0	3316.0	6448.271112
1	1	1.0	2.0	7.0	14.0	39	1	2.0	3925.0	6448.271112
2	2	3.0	1.0	9.0	2.0	39	1	1.0	1433.0	12999.542264
3	2	1.0	2.0	7.0	0.0	39	1	2.0	1783.0	12999.542264
4	2	1.0	2.0	6.0	3.0	39	1	3.0	1833.0	12999.542264