Visit http://thinkstats.com/nsfg.html, then read and accept the terms of use for this data before continuing.
In [ ]:
!cd ~/.metrique/repos # files and such will be downloaded here...
In [ ]:
# Locations of the three gzipped .dat files fetched by the download cell:
# the male file, the female respondent file and the female pregnancy file.
male = 'http://greenteapress.com/thinkstats/2002Male.dat.gz'
fem_resp = 'http://greenteapress.com/thinkstats/2002FemResp.dat.gz'
fem_preg = 'http://greenteapress.com/thinkstats/2002FemPreg.dat.gz'
Imports
In [ ]:
from collections import defaultdict
import os
import matplotlib.pyplot as plt
import pandas as pd
import urllib
import metrique as m
Download the various Think Stats (TT) datasets (assuming you agreed to the terms)
In [ ]:
# Fetch each data file into the current working directory under its URL
# basename. Files that already exist locally are skipped, so re-running the
# cell does not download them again.
uris = (fem_preg, fem_resp, male)
for uri in uris:
    local_name = os.path.basename(uri)
    if os.path.exists(local_name):
        continue
    urllib.urlretrieve(uri, local_name)
Download the TT survey.py for parsing the .dat files being distributed
In [ ]:
# Download survey.py into the current working directory unless it is
# already there.
# The URL is deliberately NOT bound to the name `survey`: a later cell runs
# `import survey`, which rebinds that name to the module, and re-running
# this cell afterwards would then hand a module to os.path.basename().
survey_uri = 'http://greenteapress.com/thinkstats/survey.py'
survey_file = os.path.basename(survey_uri)  # 'survey.py'
if not os.path.exists(survey_file):
    urllib.urlretrieve(survey_uri, survey_file)
Test to see if it works
In [ ]:
# Smoke test: run the downloaded survey.py as a standalone script in a
# subshell. NOTE(review): presumably it reads the .dat.gz files from the
# current directory and reports what it parsed; confirm against survey.py.
!python survey.py
Exercise 1-3 (Pg7)
Using survey.py, read all pregnancy and respondent records and convert each set to a list of dicts
In [ ]:
# survey.py only exists once the download cell above has run, so this
# import cannot be hoisted into the notebook's top-level import cell.
import survey


def _records_as_dicts(table):
    """Read all records of a survey table and return them as plain dicts.

    Parameters
    ----------
    table : a survey.py table object exposing ``ReadRecords()`` and, once
        that has been called, a ``records`` sequence.

    Returns
    -------
    list of dict
        One dict per record, in table order. Each dict is a *copy* of the
        record's attribute dict with an extra ``'_oid'`` key set to the
        record's position in the table, so the record objects themselves
        are left unmodified.
    """
    table.ReadRecords()
    return [dict(vars(record), _oid=oid)
            for oid, record in enumerate(table.records)]


# One list of dicts per table; '_oid' gives every object a unique id.
# NOTE(review): presumably metrique uses '_oid' as the object id; confirm.
pregs = _records_as_dicts(survey.Pregnancies())
resps = _records_as_dicts(survey.Respondents())
Create a new cube for each record set and look at its pandas.DataFrame form
In [ ]:
import metrique as m
# Build a metrique pyclient cube named 'pregs_2002' from the pregnancy dicts.
_pregs = m.pyclient(objects=pregs, name='pregs_2002')
# Bare expression in the middle of a cell: the notebook only displays a
# cell's last expression, so this result is not shown.
# NOTE(review): kept in case indexing the cube has side effects; confirm.
_pregs[0]
# Last expression of the cell, so the cube's DataFrame form is displayed.
_pregs.df
In [ ]:
# Build a metrique pyclient cube named 'resps_2002' from the respondent dicts.
_resps = m.pyclient(objects=resps, name='resps_2002')
# Bare expression in the middle of a cell: the notebook only displays a
# cell's last expression, so this result is not shown.
# NOTE(review): kept in case indexing the cube has side effects; confirm.
_resps[0]
# Last expression of the cell, so the cube's DataFrame form is displayed.
_resps.df
In [ ]:
# Fetch the pregnancy frame once and reuse it, rather than going back to
# _pregs.df for every mask and selection.
df = _pregs.df
pregs_k = len(df)

# Live births are the rows with outcome == 1.
live_births = df[df.outcome == 1]
births_k = len(live_births)

# Partition the *live births* into first borns and all other births.
# Selecting on `birthord != 1` over every pregnancy would also pull in
# pregnancies that did not end in a live birth, which skews the length
# statistics of the "not first born" group.
first_borns = live_births[live_births.birthord == 1]
not_first_borns = live_births[live_births.birthord != 1]

resps_k = len(_resps.df)
In [ ]:
print 'pregnancies: %s' % pregs_k,
print 'live births: %s (%s%%)' % (births_k, int(float(births_k)/pregs_k*100))
print 'respondents: %s' % resps_k
In [ ]:
# Pregnancy lengths of first borns: the overall mean, and the subset of
# lengths strictly greater than 20.
fb_lengths = first_borns.prglength
fb_len_mean = fb_lengths.mean()
fb_20 = fb_lengths[fb_lengths > 20]
In [ ]:
# Pregnancy lengths of the not-first-born group: the overall mean, and the
# subset of lengths strictly greater than 20.
nfb_lengths = not_first_borns.prglength
nfb_len_mean = nfb_lengths.mean()
nfb_20 = nfb_lengths[nfb_lengths > 20]
In [ ]:
# Histogram both groups' pregnancy lengths with default settings, the
# not-first-born group first and the first borns second.
nfb_all_lengths = not_first_borns.prglength
fb_all_lengths = first_borns.prglength
nfb_all_lengths.hist()
fb_all_lengths.hist()
In [ ]:
print 'first borns:', first_borns.prglength.describe()
print
print 'not first borns:', not_first_borns.prglength.describe()
Exercise 2-5
In [ ]:
# Plot only the lengths above 20 (shorter values look implausible), drawing
# both groups on one new figure: '1' is the first borns, '>=2' the rest.
plt.figure()
for lengths, group_label in ((fb_20, '1'), (nfb_20, '>=2')):
    lengths.hist(bins=30, label=group_label)
plt.legend()
In [ ]:
In [ ]: