In [1]:
"""
My code is even better than the author's code (at least this is true for Chapter 1)
Let's create more fun in statistics learning!
"""
import pandas as pd
import numpy as np
In [5]:
# Read the two CSV exports; the paths are relative to the working directory.
# Judging by the file names these are the 2002 female pregnancy and respondent
# tables — TODO confirm against the data source.
# NOTE(review): "reps_raw" looks like a transposition of "resp"; the name is
# kept because other cells may refer to it.
preg_raw = pd.read_csv("2002FemPregOut.csv")
reps_raw = pd.read_csv("2002FemRespOut.csv")
In [6]:
# Preview the first few rows of the pregnancy table.
preg_raw.head()
Out[6]:
In [9]:
# Missing-value count for every column, followed by the frame's
# (rows, columns) shape.
# print(...) with a single argument gives the same output as the bare print
# statement on Python 2 and, unlike it, also parses on Python 3.
# Author's note: it seems that 4445 pregnancies are not live births.
print(preg_raw.isnull().sum())
print(preg_raw.shape)
In [10]:
# dropna() with default arguments removes every row that has a missing value
# in at least one column, so only fully populated rows remain.
# Author's note: this threw away all 4506 rows that contained an NA.
# NOTE(review): this rebinds preg_raw, so the unfiltered data can only be
# recovered by re-running the load cell; later cells see the filtered frame.
preg_raw = preg_raw.dropna()
# Single-argument print(...) behaves identically on Python 2 and parses on
# Python 3 as well.
print(preg_raw.shape)
In [15]:
# List the distinct birth-order values that occur in the data.
# Author's note: there are Americans who gave birth to 10 children in 2002?!
# My God! Superwomen!
# NOTE(review): presumably birthord ranks a birth among the mother's children
# rather than counting births within 2002 — confirm against the codebook.
preg_raw['birthord'].unique()
Out[15]:
In [25]:
# Summarise pregnancy length per birth order: mean, median and variance,
# with the birth orders listed from the longest mean length to the shortest.
grouped_prglngth_ct = (
    preg_raw[['birthord', 'prglngth']]
    .groupby(['birthord'])['prglngth']
    .agg(['mean', 'median', 'var'])
    .sort_values(['mean'], ascending=False)
)
grouped_prglngth_ct
Out[25]:
In [ ]:
## It seems that first-born babies take longer to arrive, but only by a few hours on average.
## The median pregnancy length for first-borns is the same as for 2nd- to 8th-borns.