In [1]:
    
from __future__ import print_function, division
import marriage
import thinkstats2
import thinkplot
import math
import pandas as pd
import numpy as np
%matplotlib inline
    
Load the data:
In [2]:
    
def ReadFemPreg(dct_file, dat_file, usecols):
    """Reads the NSFG pregnancy data.
    dct_file: string file name
    dat_file: string file name
    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file, encoding='ISO-8859-1')
    df = dct.ReadFixedWidth(dat_file, compression='gzip', usecols=usecols)
    return df
    
In [77]:
    
usecols = ['outcome', 'prglngth', 'birthord', 'finalwgt']
preg6 = ReadFemPreg('2002FemPreg.dct', '2002FemPreg.dat.gz', usecols)
print(preg6.shape)
preg6 = preg6[preg6.outcome == 1]
    
    
In [4]:
    
preg6.birthord.value_counts().sort_index()
    
    Out[4]:
In [5]:
    
preg6.prglngth.value_counts().sort_index()
    
    Out[5]:
Make a list of DataFrames, one for each cycle:
In [6]:
    
sum(preg6.prglngth >= 27)
    
    Out[6]:
In [7]:
    
preg6.finalwgt.describe()
    
    Out[7]:
In [78]:
    
usecols = ['outcome', 'prglngth', 'birthord', 'wgtq1q16']
preg7 = ReadFemPreg('2006_2010_FemPregSetup.dct', '2006_2010_FemPreg.dat.gz', usecols)
print(preg7.shape)
preg7 = preg7[preg7.outcome == 1]
preg7['finalwgt'] = preg7.wgtq1q16
preg7.shape
    
    
    Out[78]:
In [9]:
    
preg7.birthord.value_counts().sort_index()
    
    Out[9]:
In [10]:
    
preg7.prglngth.value_counts().sort_index()
    
    Out[10]:
In [11]:
    
sum(preg7.prglngth >= 27)
    
    Out[11]:
In [12]:
    
# note: min and max should be 38.700642694-49735.071265
preg7.finalwgt.describe()
    
    Out[12]:
In [80]:
    
usecols = ['outcome', 'prglngth', 'birthord', 'wgt2011_2013']
preg8 = ReadFemPreg('2011_2013_FemPregSetup.dct', '2011_2013_FemPregData.dat.gz', usecols)
print(preg7.shape)
preg8 = preg8[preg8.outcome == 1]
preg8['finalwgt'] = preg8.wgt2011_2013
preg8.shape
    
    
    Out[80]:
In [14]:
    
preg8.birthord.value_counts().sort_index()
    
    Out[14]:
In [15]:
    
preg8.prglngth.value_counts().sort_index()
    
    Out[15]:
In [16]:
    
sum(preg8.prglngth >= 27)
    
    Out[16]:
In [17]:
    
preg8.finalwgt.describe()
    
    Out[17]:
In [18]:
    
#resp8 = marriage.ReadFemResp2013()
#marriage.Validate2013(resp8)
#resp8 = resp8[resp8.parity >= 1]
#resp7 = marriage.ReadFemResp2010()
#marriage.Validate2010(resp7)
#resp7 = resp7[resp7.parity >= 1]
#resp6 = marriage.ReadFemResp2002()
#marriage.Validate2002(resp6)
#resp6 = resp6[resp6.parity >= 1]
    
In [81]:
    
#resps = [resp6, resp7, resp8]
pregs = [preg6, preg7, preg8]
for preg in pregs:
    print(len(preg))
    
    
In [20]:
    
def SummarizeCycle(df):
    ages = df.age.min(), df.age.max()
    ages= np.array(ages)
    
    intvws = df.cmintvw.min(), df.cmintvw.max()
    intvws = np.array(intvws) / 12 + 1900
    
    births = df.cmbirth.min(), df.cmbirth.max()
    births = np.array(births) / 12 + 1900
    print('# & ', intvws.astype(int), '&', len(df), '&', births.astype(int), r'\\')
    
#for resp in reversed(resps):
#    SummarizeCycle(resp)
    
In [21]:
    
def ResampleAndSelect(resp, preg):
    sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
    print('1')
    
    dfs = [preg[preg.caseid == caseid] for caseid in sample.caseid]
    print('2')
    
    rows = pd.concat(dfs, ignore_index=True)
    print('3')
    return rows
#%time sample6 = ResampleAndSelect(resp6, preg6)
#sample6.shape
    
In [22]:
    
#grouped6 = preg6.groupby('caseid')
    
In [23]:
    
def ResampleAndSelect(resp, grouped):
    sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
    print('1')
    dfs = [grouped.get_group(caseid) for caseid in sample.caseid]
    print('2', len(dfs))
    rows = pd.concat(dfs, ignore_index=True)
    print('3')
    return rows
#%time sample6 = ResampleAndSelect(resp6, grouped6)
#sample6.shape
    
In [24]:
    
def ResamplePreg(preg):
    sample = thinkstats2.ResampleRowsWeighted(preg, column='finalwgt')
    return sample
#%time sample7 = ResamplePreg(preg7)
#ample7.shape
    
In [25]:
    
def Resample(pregs):
    samples = [thinkstats2.ResampleRowsWeighted(preg, column='finalwgt') 
               for preg in pregs]
    sample = pd.concat(samples)
    return sample
sample = Resample(pregs)
    
In [26]:
    
firsts = sample[sample.birthord == 1]
firsts.shape
    
    Out[26]:
In [27]:
    
others = sample[sample.birthord > 1]
others.shape
    
    Out[27]:
In [28]:
    
sum(firsts.prglngth.isnull())
    
    Out[28]:
In [29]:
    
sum(others.prglngth.isnull())
    
    Out[29]:
In [30]:
    
firsts.prglngth.value_counts().sort_index()
    
    Out[30]:
In [31]:
    
others.prglngth.value_counts().sort_index()
    
    Out[31]:
In [32]:
    
len(firsts.prglngth), len(others.prglngth)
    
    Out[32]:
In [33]:
    
firsts.prglngth.mean(), others.prglngth.mean()
    
    Out[33]:
In [34]:
    
diff = firsts.prglngth.mean() - others.prglngth.mean()
diff, diff * 7, diff * 7 * 24
    
    Out[34]:
In [35]:
    
firsts.prglngth.std(), others.prglngth.std()
    
    Out[35]:
In [36]:
    
se1 = firsts.prglngth.std() / math.sqrt(len(firsts.prglngth))
se1
    
    Out[36]:
In [37]:
    
se2 = others.prglngth.std() / math.sqrt(len(others.prglngth))
se2
    
    Out[37]:
In [38]:
    
diff / se1, diff / se2
    
    Out[38]:
In [39]:
    
cdf_firsts = thinkstats2.Cdf(firsts.prglngth, label='firsts')
thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc='upper left')
    
    
In [40]:
    
cdf_others = thinkstats2.Cdf(others.prglngth, label='others')
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc='upper left')
    
    
In [41]:
    
jitter = 0.5
length_firsts = thinkstats2.Jitter(firsts.prglngth, jitter)
cdf_firsts = thinkstats2.Cdf(length_firsts, label='firsts')
length_others = thinkstats2.Jitter(others.prglngth, jitter)
cdf_others = thinkstats2.Cdf(length_others, label='others')
thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 xlim=[30, 45],
                 ylabel='CDF',
                 loc='upper left')
    
    
In [42]:
    
def RemainingDurationPmf(pmf, weeks):
    """Returns PMF of remaining duration conditioned on weeks.
    
    pmf: PMF of pregnancy length
    weeks: current weeks of pregnancy
    """
    new = thinkstats2.Pmf(label=pmf.label)
    for x, p in pmf.Items():
        if x >= weeks:
            new[x - weeks] = p
    new.Normalize()
    return new
    
In [43]:
    
pmf_firsts = thinkstats2.Pmf(length_firsts, label='firsts')
pmf_others = thinkstats2.Pmf(length_others, label='others')
    
In [44]:
    
def PlotRemaining(pmfs, weeks):
    """Plot CDF of remaining weeks.
    
    pmfs: list of PMF
    weeks: current weeks of pregnancy
    """
    cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
            for pmf in pmfs]
    thinkplot.PrePlot(len(cdfs))
    thinkplot.Cdfs(cdfs)
    
In [45]:
    
loc = 'lower right'
PlotRemaining([pmf_firsts, pmf_others], 30)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)
    
    
In [46]:
    
PlotRemaining([pmf_firsts, pmf_others], 32)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)
    
    
In [47]:
    
PlotRemaining([pmf_firsts, pmf_others], 34)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)
    
    
In [48]:
    
PlotRemaining([pmf_firsts, pmf_others], 36)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)
    
    
In [49]:
    
PlotRemaining([pmf_firsts, pmf_others], 38)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)
    
    
In [50]:
    
PlotRemaining([pmf_firsts, pmf_others], 39)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)
    
    
In [51]:
    
def PercentilesRemaining(pmfs, weeks):
    cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
            for pmf in pmfs]
    
    for cdf in cdfs:
        print([cdf.Percentile(p) * 7 for p in [25, 50, 75]])
    
In [52]:
    
PercentilesRemaining([pmf_firsts, pmf_others], 39)
    
    
In [53]:
    
percentiles = [50, 75, 25]
def PercentilesRemaining(pmf, ts):
    array = np.zeros((len(ts), len(percentiles)))
    
    for i, t in enumerate(ts):
        cdf = RemainingDurationPmf(pmf, t).MakeCdf()
        ps = [cdf.Percentile(p) * 7 for p in percentiles]
        #print(t, ps)
        array[i, :] = ps
        
    return array
    
In [54]:
    
ts = np.arange(36, 44)
ps_firsts = PercentilesRemaining(pmf_firsts, ts)
ps_firsts
    
    Out[54]:
In [55]:
    
ps_others = PercentilesRemaining(pmf_others, ts)
ps_others
    
    Out[55]:
In [56]:
    
nrows, ncols = ps_firsts.shape
thinkplot.PrePlot(3)
for i in range(ncols):
    thinkplot.Plot(ts, ps_firsts[:, i], label=percentiles[i])
thinkplot.PrePlot(3)
for i in range(ncols):
    thinkplot.Plot(ts, ps_others[:, i], linestyle='dashed')
    
thinkplot.Config(xlabel='t (weeks)',
                 ylabel='remaining time (days)',
                 legend=True,
                 loc='upper right')
    
    
In [57]:
    
sample = Resample(pregs)
sample.shape
    
    Out[57]:
In [59]:
    
firsts = sample[sample.birthord == 1] 
others = sample[sample.birthord > 1]
    
In [60]:
    
jitter = 0.5
lengths = thinkstats2.Jitter(firsts.prglngth, jitter)
pmf_firsts = thinkstats2.Pmf(lengths, label='firsts')
lengths = thinkstats2.Jitter(others.prglngth, jitter)
pmf_others = thinkstats2.Pmf(lengths, label='others')
    
In [61]:
    
def SelectFirsts(preg):
    return preg[preg.birthord == 1]
    
In [62]:
    
def SelectOthers(preg):
    return preg[preg.birthord > 1]
    
In [63]:
    
def MeanRemaining(pmf, t):
    rem_pmf = RemainingDurationPmf(pmf, t)
    mean = rem_pmf.Mean()
    return mean
MeanRemaining(pmf_firsts, 36)
    
    Out[63]:
In [64]:
    
def MedianRemaining(pmf, t):
    rem_pmf = RemainingDurationPmf(pmf, t)
    rem_cdf = rem_pmf.MakeCdf()
    median = rem_cdf.Percentile(50)
    return median
MedianRemaining(pmf_firsts, 36)
    
    Out[64]:
In [65]:
    
def ProbRemainingLess(pmf, t, weeks=1):
    rem_pmf = RemainingDurationPmf(pmf, t)
    rem_cdf = rem_pmf.MakeCdf()
    p = rem_cdf[weeks]
    return p
ProbRemainingLess(pmf_firsts, 36)
    
    Out[65]:
In [66]:
    
def Remaining(preg, iters, ts, select_func, stat_func):
    """Computes remaining lifetime versus weeks of pregnancy.
    
    preg: DataFrame of pregnancy data
    iters: int, how many resampling iterations to run
    ts: sequence of float, times to evaluate, in weeks
    select_func: function that selects a subset of pregnancies
    stat_func: function that computes summary statistic
    
    returns: array with one row for each percentile, one column
             for each item in ts
    """
    # make an array to hold one row per iteration,
    # one col per element of ts
    array = np.zeros((iters, len(ts)))
    for i in range(iters):
        sample = Resample(pregs)
        group = select_func(sample)
        lengths = thinkstats2.Jitter(group.prglngth, jitter=0.5)
        pmf = thinkstats2.Pmf(lengths)
        stats = [stat_func(pmf, t) for t in ts]
        array[i, :] = stats
    array = np.sort(array, axis=0)
    rows = [thinkstats2.PercentileRow(array, p) for p in [10, 50, 90]]
    return rows
    
In [71]:
    
def PlotStats(stat_func, iters, ts):
    """Plots a summary statistic versus weeks of pregnancy.
    
    stat_func: function that computes summary statistic
    iters: int, how many resampling iterations to run
    ts: sequence of float, times to evaluate, in weeks
    """
    # list of labels and select_funcs
    labels = [('firsts', SelectFirsts), 
              ('others', SelectOthers)]
    thinkplot.PrePlot(2)
    for label, select_func in labels:
        rows = Remaining(pmf_firsts, iters, ts, select_func, stat_func)
        thinkplot.FillBetween(ts, rows[0], rows[2], color='gray')
        thinkplot.Plot(ts, rows[1], label=label)
        print(label)
        for t, stat in zip(ts, rows[1]):
            print(t, stat)
    
In [74]:
    
iters = 101
ts = np.arange(36, 43.5, 0.5)
PlotStats(MeanRemaining, iters, ts)
thinkplot.Config(xlabel='week of pregnancy',
                 ylabel='remaining time (weeks)',
                 legend=True,
                 loc='upper right')
thinkplot.Save(root='first1', formats=['pdf', 'png'])
    
    
    
In [75]:
    
PlotStats(MedianRemaining, iters, ts)
thinkplot.Config(xlabel='t (weeks)',
                 ylabel='week of pregnancy',
                 legend=True,
                 loc='upper right')
thinkplot.Save(root='first2', formats=['pdf', 'png'])
    
    
    
In [76]:
    
PlotStats(ProbRemainingLess, iters, ts)
thinkplot.Config(xlabel='week of pregnancy',
                 ylabel='prob(birth within 1 week)',
                 legend=True,
                 loc='upper left')
thinkplot.Save(root='first3', formats=['pdf', 'png'])
    
    
    
In [ ]: