In [1]:
from __future__ import print_function, division
import marriage
import thinkstats2
import thinkplot
import math
import pandas as pd
import numpy as np
%matplotlib inline
Load the data:
In [2]:
def ReadFemPreg(dct_file, dat_file, usecols):
"""Reads the NSFG pregnancy data.
dct_file: string file name
dat_file: string file name
returns: DataFrame
"""
dct = thinkstats2.ReadStataDct(dct_file, encoding='ISO-8859-1')
df = dct.ReadFixedWidth(dat_file, compression='gzip', usecols=usecols)
return df
In [77]:
usecols = ['outcome', 'prglngth', 'birthord', 'finalwgt']
preg6 = ReadFemPreg('2002FemPreg.dct', '2002FemPreg.dat.gz', usecols)
print(preg6.shape)
preg6 = preg6[preg6.outcome == 1]
In [4]:
preg6.birthord.value_counts().sort_index()
Out[4]:
In [5]:
preg6.prglngth.value_counts().sort_index()
Out[5]:
Make a list of DataFrames, one for each cycle:
In [6]:
sum(preg6.prglngth >= 27)
Out[6]:
In [7]:
preg6.finalwgt.describe()
Out[7]:
In [78]:
usecols = ['outcome', 'prglngth', 'birthord', 'wgtq1q16']
preg7 = ReadFemPreg('2006_2010_FemPregSetup.dct', '2006_2010_FemPreg.dat.gz', usecols)
print(preg7.shape)
preg7 = preg7[preg7.outcome == 1]
preg7['finalwgt'] = preg7.wgtq1q16
preg7.shape
Out[78]:
In [9]:
preg7.birthord.value_counts().sort_index()
Out[9]:
In [10]:
preg7.prglngth.value_counts().sort_index()
Out[10]:
In [11]:
sum(preg7.prglngth >= 27)
Out[11]:
In [12]:
# note: min and max should be 38.700642694-49735.071265
preg7.finalwgt.describe()
Out[12]:
In [80]:
usecols = ['outcome', 'prglngth', 'birthord', 'wgt2011_2013']
preg8 = ReadFemPreg('2011_2013_FemPregSetup.dct', '2011_2013_FemPregData.dat.gz', usecols)
print(preg7.shape)
preg8 = preg8[preg8.outcome == 1]
preg8['finalwgt'] = preg8.wgt2011_2013
preg8.shape
Out[80]:
In [14]:
preg8.birthord.value_counts().sort_index()
Out[14]:
In [15]:
preg8.prglngth.value_counts().sort_index()
Out[15]:
In [16]:
sum(preg8.prglngth >= 27)
Out[16]:
In [17]:
preg8.finalwgt.describe()
Out[17]:
In [18]:
#resp8 = marriage.ReadFemResp2013()
#marriage.Validate2013(resp8)
#resp8 = resp8[resp8.parity >= 1]
#resp7 = marriage.ReadFemResp2010()
#marriage.Validate2010(resp7)
#resp7 = resp7[resp7.parity >= 1]
#resp6 = marriage.ReadFemResp2002()
#marriage.Validate2002(resp6)
#resp6 = resp6[resp6.parity >= 1]
In [81]:
#resps = [resp6, resp7, resp8]
pregs = [preg6, preg7, preg8]
for preg in pregs:
print(len(preg))
In [20]:
def SummarizeCycle(df):
ages = df.age.min(), df.age.max()
ages= np.array(ages)
intvws = df.cmintvw.min(), df.cmintvw.max()
intvws = np.array(intvws) / 12 + 1900
births = df.cmbirth.min(), df.cmbirth.max()
births = np.array(births) / 12 + 1900
print('# & ', intvws.astype(int), '&', len(df), '&', births.astype(int), r'\\')
#for resp in reversed(resps):
# SummarizeCycle(resp)
In [21]:
def ResampleAndSelect(resp, preg):
sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
print('1')
dfs = [preg[preg.caseid == caseid] for caseid in sample.caseid]
print('2')
rows = pd.concat(dfs, ignore_index=True)
print('3')
return rows
#%time sample6 = ResampleAndSelect(resp6, preg6)
#sample6.shape
In [22]:
#grouped6 = preg6.groupby('caseid')
In [23]:
def ResampleAndSelect(resp, grouped):
sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
print('1')
dfs = [grouped.get_group(caseid) for caseid in sample.caseid]
print('2', len(dfs))
rows = pd.concat(dfs, ignore_index=True)
print('3')
return rows
#%time sample6 = ResampleAndSelect(resp6, grouped6)
#sample6.shape
In [24]:
def ResamplePreg(preg):
sample = thinkstats2.ResampleRowsWeighted(preg, column='finalwgt')
return sample
#%time sample7 = ResamplePreg(preg7)
#ample7.shape
In [25]:
def Resample(pregs):
samples = [thinkstats2.ResampleRowsWeighted(preg, column='finalwgt')
for preg in pregs]
sample = pd.concat(samples)
return sample
sample = Resample(pregs)
In [26]:
firsts = sample[sample.birthord == 1]
firsts.shape
Out[26]:
In [27]:
others = sample[sample.birthord > 1]
others.shape
Out[27]:
In [28]:
sum(firsts.prglngth.isnull())
Out[28]:
In [29]:
sum(others.prglngth.isnull())
Out[29]:
In [30]:
firsts.prglngth.value_counts().sort_index()
Out[30]:
In [31]:
others.prglngth.value_counts().sort_index()
Out[31]:
In [32]:
len(firsts.prglngth), len(others.prglngth)
Out[32]:
In [33]:
firsts.prglngth.mean(), others.prglngth.mean()
Out[33]:
In [34]:
diff = firsts.prglngth.mean() - others.prglngth.mean()
diff, diff * 7, diff * 7 * 24
Out[34]:
In [35]:
firsts.prglngth.std(), others.prglngth.std()
Out[35]:
In [36]:
se1 = firsts.prglngth.std() / math.sqrt(len(firsts.prglngth))
se1
Out[36]:
In [37]:
se2 = others.prglngth.std() / math.sqrt(len(others.prglngth))
se2
Out[37]:
In [38]:
diff / se1, diff / se2
Out[38]:
In [39]:
cdf_firsts = thinkstats2.Cdf(firsts.prglngth, label='firsts')
thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc='upper left')
In [40]:
cdf_others = thinkstats2.Cdf(others.prglngth, label='others')
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc='upper left')
In [41]:
jitter = 0.5
length_firsts = thinkstats2.Jitter(firsts.prglngth, jitter)
cdf_firsts = thinkstats2.Cdf(length_firsts, label='firsts')
length_others = thinkstats2.Jitter(others.prglngth, jitter)
cdf_others = thinkstats2.Cdf(length_others, label='others')
thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
xlim=[30, 45],
ylabel='CDF',
loc='upper left')
In [42]:
def RemainingDurationPmf(pmf, weeks):
"""Returns PMF of remaining duration conditioned on weeks.
pmf: PMF of pregnancy length
weeks: current weeks of pregnancy
"""
new = thinkstats2.Pmf(label=pmf.label)
for x, p in pmf.Items():
if x >= weeks:
new[x - weeks] = p
new.Normalize()
return new
In [43]:
pmf_firsts = thinkstats2.Pmf(length_firsts, label='firsts')
pmf_others = thinkstats2.Pmf(length_others, label='others')
In [44]:
def PlotRemaining(pmfs, weeks):
"""Plot CDF of remaining weeks.
pmfs: list of PMF
weeks: current weeks of pregnancy
"""
cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
for pmf in pmfs]
thinkplot.PrePlot(len(cdfs))
thinkplot.Cdfs(cdfs)
In [45]:
loc = 'lower right'
PlotRemaining([pmf_firsts, pmf_others], 30)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
In [46]:
PlotRemaining([pmf_firsts, pmf_others], 32)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
In [47]:
PlotRemaining([pmf_firsts, pmf_others], 34)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
In [48]:
PlotRemaining([pmf_firsts, pmf_others], 36)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
In [49]:
PlotRemaining([pmf_firsts, pmf_others], 38)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
In [50]:
PlotRemaining([pmf_firsts, pmf_others], 39)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
In [51]:
def PercentilesRemaining(pmfs, weeks):
cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
for pmf in pmfs]
for cdf in cdfs:
print([cdf.Percentile(p) * 7 for p in [25, 50, 75]])
In [52]:
PercentilesRemaining([pmf_firsts, pmf_others], 39)
In [53]:
percentiles = [50, 75, 25]
def PercentilesRemaining(pmf, ts):
array = np.zeros((len(ts), len(percentiles)))
for i, t in enumerate(ts):
cdf = RemainingDurationPmf(pmf, t).MakeCdf()
ps = [cdf.Percentile(p) * 7 for p in percentiles]
#print(t, ps)
array[i, :] = ps
return array
In [54]:
ts = np.arange(36, 44)
ps_firsts = PercentilesRemaining(pmf_firsts, ts)
ps_firsts
Out[54]:
In [55]:
ps_others = PercentilesRemaining(pmf_others, ts)
ps_others
Out[55]:
In [56]:
nrows, ncols = ps_firsts.shape
thinkplot.PrePlot(3)
for i in range(ncols):
thinkplot.Plot(ts, ps_firsts[:, i], label=percentiles[i])
thinkplot.PrePlot(3)
for i in range(ncols):
thinkplot.Plot(ts, ps_others[:, i], linestyle='dashed')
thinkplot.Config(xlabel='t (weeks)',
ylabel='remaining time (days)',
legend=True,
loc='upper right')
In [57]:
sample = Resample(pregs)
sample.shape
Out[57]:
In [59]:
firsts = sample[sample.birthord == 1]
others = sample[sample.birthord > 1]
In [60]:
jitter = 0.5
lengths = thinkstats2.Jitter(firsts.prglngth, jitter)
pmf_firsts = thinkstats2.Pmf(lengths, label='firsts')
lengths = thinkstats2.Jitter(others.prglngth, jitter)
pmf_others = thinkstats2.Pmf(lengths, label='others')
In [61]:
def SelectFirsts(preg):
return preg[preg.birthord == 1]
In [62]:
def SelectOthers(preg):
return preg[preg.birthord > 1]
In [63]:
def MeanRemaining(pmf, t):
rem_pmf = RemainingDurationPmf(pmf, t)
mean = rem_pmf.Mean()
return mean
MeanRemaining(pmf_firsts, 36)
Out[63]:
In [64]:
def MedianRemaining(pmf, t):
rem_pmf = RemainingDurationPmf(pmf, t)
rem_cdf = rem_pmf.MakeCdf()
median = rem_cdf.Percentile(50)
return median
MedianRemaining(pmf_firsts, 36)
Out[64]:
In [65]:
def ProbRemainingLess(pmf, t, weeks=1):
rem_pmf = RemainingDurationPmf(pmf, t)
rem_cdf = rem_pmf.MakeCdf()
p = rem_cdf[weeks]
return p
ProbRemainingLess(pmf_firsts, 36)
Out[65]:
In [66]:
def Remaining(preg, iters, ts, select_func, stat_func):
"""Computes remaining lifetime versus weeks of pregnancy.
preg: DataFrame of pregnancy data
iters: int, how many resampling iterations to run
ts: sequence of float, times to evaluate, in weeks
select_func: function that selects a subset of pregnancies
stat_func: function that computes summary statistic
returns: array with one row for each percentile, one column
for each item in ts
"""
# make an array to hold one row per iteration,
# one col per element of ts
array = np.zeros((iters, len(ts)))
for i in range(iters):
sample = Resample(pregs)
group = select_func(sample)
lengths = thinkstats2.Jitter(group.prglngth, jitter=0.5)
pmf = thinkstats2.Pmf(lengths)
stats = [stat_func(pmf, t) for t in ts]
array[i, :] = stats
array = np.sort(array, axis=0)
rows = [thinkstats2.PercentileRow(array, p) for p in [10, 50, 90]]
return rows
In [71]:
def PlotStats(stat_func, iters, ts):
"""Plots a summary statistic versus weeks of pregnancy.
stat_func: function that computes summary statistic
iters: int, how many resampling iterations to run
ts: sequence of float, times to evaluate, in weeks
"""
# list of labels and select_funcs
labels = [('firsts', SelectFirsts),
('others', SelectOthers)]
thinkplot.PrePlot(2)
for label, select_func in labels:
rows = Remaining(pmf_firsts, iters, ts, select_func, stat_func)
thinkplot.FillBetween(ts, rows[0], rows[2], color='gray')
thinkplot.Plot(ts, rows[1], label=label)
print(label)
for t, stat in zip(ts, rows[1]):
print(t, stat)
In [74]:
iters = 101
ts = np.arange(36, 43.5, 0.5)
PlotStats(MeanRemaining, iters, ts)
thinkplot.Config(xlabel='week of pregnancy',
ylabel='remaining time (weeks)',
legend=True,
loc='upper right')
thinkplot.Save(root='first1', formats=['pdf', 'png'])
In [75]:
PlotStats(MedianRemaining, iters, ts)
thinkplot.Config(xlabel='t (weeks)',
ylabel='week of pregnancy',
legend=True,
loc='upper right')
thinkplot.Save(root='first2', formats=['pdf', 'png'])
In [76]:
PlotStats(ProbRemainingLess, iters, ts)
thinkplot.Config(xlabel='week of pregnancy',
ylabel='prob(birth within 1 week)',
legend=True,
loc='upper left')
thinkplot.Save(root='first3', formats=['pdf', 'png'])
In [ ]: