Visit http://thinkstats.com/nsfg.html, then read and accept the terms of use for this data before downloading it.


In [ ]:
!cd ~/.metrique/repos   # files and such will be downloaded here...

In [ ]:
# Locations of the three compressed data files used by this notebook:
# female pregnancies, female respondents, and male respondents.
fem_preg, fem_resp, male = (
    'http://greenteapress.com/thinkstats/2002FemPreg.dat.gz',
    'http://greenteapress.com/thinkstats/2002FemResp.dat.gz',
    'http://greenteapress.com/thinkstats/2002Male.dat.gz',
)

Imports


In [ ]:
from collections import defaultdict
import os
import matplotlib.pyplot as plt
import pandas as pd
import urllib
import metrique as m

Download the Think Stats datasets (assuming you agreed to the terms of use)


In [ ]:
# Fetch each data file into the current working directory, skipping any file
# whose name already exists there.
uris = (fem_preg, fem_resp, male)
for url in uris:
    local_name = os.path.basename(url)
    if os.path.exists(local_name):
        continue  # already downloaded on a previous run
    urllib.urlretrieve(url, local_name)

Download the Think Stats survey.py module, which parses the distributed .dat files


In [ ]:
# Fetch the parser module once; it is saved as survey.py in the current
# working directory.
# NOTE(review): this is Python code fetched over plain HTTP that later cells
# execute and import -- consider verifying it before use.
survey = 'http://greenteapress.com/thinkstats/survey.py'
already_have_parser = os.path.exists(os.path.basename(survey))
if not already_have_parser:
    urllib.urlretrieve(survey, 'survey.py')

Test to see if it works


In [ ]:
# Smoke test: run the downloaded survey.py as a script in a shell subprocess,
# using whichever `python` is on the shell's PATH.
!python survey.py

Exercise 1-3 (Pg7)

Using survey.py, read all pregnancy and respondent records and convert each set to a list of dicts


In [ ]:
# Imported here rather than with the other imports because survey.py is
# downloaded by an earlier cell.
import survey


def _records_as_dicts(table):
    """Read every record of a survey table and return them as a list of dicts.

    Each dict is the record's own attribute dictionary (``vars(record)``),
    extended with an ``'_oid'`` key holding the record's position in the table.
    """
    table.ReadRecords()
    rows = [vars(record) for record in table.records]
    # Plain loop: the numbering is a side effect, not a value worth collecting.
    for oid, row in enumerate(rows):
        row['_oid'] = oid
    return rows


pregs = _records_as_dicts(survey.Pregnancies())
resps = _records_as_dicts(survey.Respondents())

Create a new cube from the records and view it as a pandas DataFrame


In [ ]:
# Build a cube named 'pregs_2002' from the pregnancy dicts; `m` is the
# metrique module already imported in the imports cell.
_pregs = m.pyclient(objects=pregs, name='pregs_2002')
# Not the last expression of the cell, so its value is not displayed.
# NOTE(review): presumably a sanity check that the first object is reachable.
_pregs[0]
# Last expression: the cube's DataFrame form is rendered as the cell output.
_pregs.df

In [ ]:
# Build a cube named 'resps_2002' from the respondent dicts.
_resps = m.pyclient(objects=resps, name='resps_2002')
# Not the last expression of the cell, so its value is not displayed.
# NOTE(review): presumably a sanity check that the first object is reachable.
_resps[0]
# Last expression: the cube's DataFrame form is rendered as the cell output.
_resps.df

In [ ]:
# Work from a single alias of the pregnancy frame so every mask and selection
# below refers to the same object.
df = _pregs.df
pregs_k = len(df)

# outcome == 1 is the live-birth code (this count is reported as 'live births').
live_births = df[df.outcome == 1]
births_k = len(live_births)

# Split the live births by birth order. The "not first" group is drawn from
# live births only: selecting `birthord != 1` on the full frame would also pull
# in pregnancies that did not end in a live birth (their birthord is not 1),
# which skews the pregnancy-length comparison towards short pregnancies.
first_borns = live_births[live_births.birthord == 1]
not_first_borns = live_births[live_births.birthord != 1]

resps_k = len(_resps.df)

In [ ]:
print 'pregnancies: %s' % pregs_k, 
print 'live births: %s (%s%%)' % (births_k, int(float(births_k)/pregs_k*100))
print 'respondents: %s' % resps_k

In [ ]:
# Mean pregnancy length of the first-born group, and the subset of its
# pregnancy lengths that are strictly greater than 20.
fb_len_mean = first_borns['prglength'].mean()
fb_20 = first_borns.loc[first_borns['prglength'] > 20, 'prglength']

In [ ]:
# Mean pregnancy length of the not-first-born group, and the subset of its
# pregnancy lengths that are strictly greater than 20.
nfb_len_mean = not_first_borns['prglength'].mean()
nfb_20 = not_first_borns.loc[not_first_borns['prglength'] > 20, 'prglength']

In [ ]:
# Histogram the pregnancy lengths of each group: the not-first-born group is
# drawn first, then the first-born group.
not_first_borns['prglength'].hist()
first_borns['prglength'].hist()

In [ ]:
print 'first borns:', first_borns.prglength.describe()
print
print 'not first borns:', not_first_borns.prglength.describe()

Exercise 2-5


In [ ]:
# Only pregnancy lengths above 20 are plotted here; shorter ones seem
# "unlikely". Each group gets its own 30-bin histogram on a fresh figure,
# labelled by birth order ('1' for first borns, '>=2' for the rest).
plt.figure()
for lengths, birth_order in ((fb_20, '1'), (nfb_20, '>=2')):
    lengths.hist(bins=30, label=birth_order)
plt.legend()

In [ ]:


In [ ]: