Are first babies more likely to be late?

License: Creative Commons Attribution 4.0 International



In [1]:

    
from __future__ import print_function, division

import marriage

import thinkstats2
import thinkplot

import math

import pandas as pd
import numpy as np

%matplotlib inline

Load the data:



In [2]:

    
def ReadFemPreg(dct_file, dat_file, usecols):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file, encoding='ISO-8859-1')
    df = dct.ReadFixedWidth(dat_file, compression='gzip', usecols=usecols)
    return df



In [77]:

    
usecols = ['outcome', 'prglngth', 'birthord', 'finalwgt']
preg6 = ReadFemPreg('2002FemPreg.dct', '2002FemPreg.dat.gz', usecols)
print(preg6.shape)
preg6 = preg6[preg6.outcome == 1]



In [4]:

    
preg6.birthord.value_counts().sort_index()









    Out[4]:





1     4413
2     2874
3     1234
4      421
5      126
6       50
7       20
8        7
9        2
10       1
dtype: int64



In [5]:

    
preg6.prglngth.value_counts().sort_index()









    Out[5]:





0        1
4        1
9        1
13       1
17       2
18       1
19       1
20       1
21       2
22       7
23       1
24      13
25       3
26      35
27       3
28      32
29      21
30     138
31      27
32     115
33      49
34      60
35     311
36     321
37     455
38     607
39    4693
40    1116
41     587
42     328
43     148
44      46
45      10
46       1
47       1
48       7
50       2
dtype: int64

Make a list of DataFrames, one for each cycle:



In [6]:

    
sum(preg6.prglngth >= 27)









    Out[6]:





9078



In [7]:

    
preg6.finalwgt.describe()









    Out[7]:





count      9148.000000
mean       8404.364417
std        9142.507490
min         118.656790
25%        3995.837136
50%        6415.602101
75%        9555.442649
max      261879.953864
Name: finalwgt, dtype: float64



In [78]:

    
usecols = ['outcome', 'prglngth', 'birthord', 'wgtq1q16']
preg7 = ReadFemPreg('2006_2010_FemPregSetup.dct', '2006_2010_FemPreg.dat.gz', usecols)
print(preg7.shape)
preg7 = preg7[preg7.outcome == 1]
preg7['finalwgt'] = preg7.wgtq1q16
preg7.shape









    



(20492, 4)






    Out[78]:





(14292, 5)



In [9]:

    
preg7.birthord.value_counts().sort_index()









    Out[9]:





1     6683
2     4415
3     2030
4      734
5      269
6      102
7       36
8       13
9        7
10       2
11       1
dtype: int64



In [10]:

    
preg7.prglngth.value_counts().sort_index()









    Out[10]:





0        1
7        1
17       1
19       2
20       2
21       4
22      16
23       4
24      15
25      14
26      65
27      21
28      37
29      26
30     184
31      40
32     156
33      59
34     155
35     509
36     622
37     835
38    1304
39    6263
40    2280
41     907
42     553
43     185
44      19
45       3
46       4
48       3
52       1
57       1
dtype: int64



In [11]:

    
sum(preg7.prglngth >= 27)









    Out[11]:





14167



In [12]:

    
# note: min and max should be 38.700642694-49735.071265
preg7.finalwgt.describe()









    Out[12]:





count    14292.000000
mean      5306.068571
std       5967.788429
min         44.023984
25%       1509.629839
50%       3078.497086
75%       6594.905896
max      30226.354508
Name: finalwgt, dtype: float64



In [80]:

    
usecols = ['outcome', 'prglngth', 'birthord', 'wgt2011_2013']
preg8 = ReadFemPreg('2011_2013_FemPregSetup.dct', '2011_2013_FemPregData.dat.gz', usecols)
print(preg7.shape)
preg8 = preg8[preg8.outcome == 1]
preg8['finalwgt'] = preg8.wgt2011_2013
preg8.shape









    



(14292, 5)






    Out[80]:





(6670, 5)



In [14]:

    
preg8.birthord.value_counts().sort_index()









    Out[14]:





1    3141
2    2076
3     957
4     327
5     113
6      34
7      16
8       5
9       1
dtype: int64



In [15]:

    
preg8.prglngth.value_counts().sort_index()









    Out[15]:





17       1
20       2
21       2
22       8
23       3
24      12
25       6
26      21
27       7
28      22
29      19
30      61
31      24
32      90
33      30
34      97
35     219
36     283
37     406
38     706
39    2566
40    1306
41     426
42     277
43      63
44       8
45       2
46       1
47       2
dtype: int64



In [16]:

    
sum(preg8.prglngth >= 27)









    Out[16]:





6615



In [17]:

    
preg8.finalwgt.describe()









    Out[17]:





count     6670.000000
mean     11173.091142
std      12939.068547
min       1714.541000
25%       3985.078597
50%       6834.593716
75%      12984.239442
max      85207.950000
Name: finalwgt, dtype: float64



In [18]:

    
#resp8 = marriage.ReadFemResp2013()
#marriage.Validate2013(resp8)
#resp8 = resp8[resp8.parity >= 1]

#resp7 = marriage.ReadFemResp2010()
#marriage.Validate2010(resp7)
#resp7 = resp7[resp7.parity >= 1]

#resp6 = marriage.ReadFemResp2002()
#marriage.Validate2002(resp6)
#resp6 = resp6[resp6.parity >= 1]



In [81]:

    
#resps = [resp6, resp7, resp8]
pregs = [preg6, preg7, preg8]

for preg in pregs:
    print(len(preg))



In [20]:

    
def SummarizeCycle(df):
    ages = df.age.min(), df.age.max()
    ages= np.array(ages)
    
    intvws = df.cmintvw.min(), df.cmintvw.max()
    intvws = np.array(intvws) / 12 + 1900
    
    births = df.cmbirth.min(), df.cmbirth.max()
    births = np.array(births) / 12 + 1900

    print('# & ', intvws.astype(int), '&', len(df), '&', births.astype(int), r'\\')
    
#for resp in reversed(resps):
#    SummarizeCycle(resp)



In [21]:

    
def ResampleAndSelect(resp, preg):
    sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
    print('1')
    
    dfs = [preg[preg.caseid == caseid] for caseid in sample.caseid]
    print('2')
    
    rows = pd.concat(dfs, ignore_index=True)
    print('3')

    return rows

#%time sample6 = ResampleAndSelect(resp6, preg6)
#sample6.shape



In [22]:

    
#grouped6 = preg6.groupby('caseid')



In [23]:

    
def ResampleAndSelect(resp, grouped):
    sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
    print('1')

    dfs = [grouped.get_group(caseid) for caseid in sample.caseid]
    print('2', len(dfs))

    rows = pd.concat(dfs, ignore_index=True)
    print('3')
    return rows

#%time sample6 = ResampleAndSelect(resp6, grouped6)
#sample6.shape



In [24]:

    
def ResamplePreg(preg):
    sample = thinkstats2.ResampleRowsWeighted(preg, column='finalwgt')
    return sample

#%time sample7 = ResamplePreg(preg7)
#ample7.shape



In [25]:

    
def Resample(pregs):
    samples = [thinkstats2.ResampleRowsWeighted(preg, column='finalwgt') 
               for preg in pregs]
    sample = pd.concat(samples)
    return sample

sample = Resample(pregs)



In [26]:

    
firsts = sample[sample.birthord == 1]
firsts.shape









    Out[26]:





(13864, 6)



In [27]:

    
others = sample[sample.birthord > 1]
others.shape









    Out[27]:





(16246, 6)



In [28]:

    
sum(firsts.prglngth.isnull())









    Out[28]:





0



In [29]:

    
sum(others.prglngth.isnull())









    Out[29]:





0



In [30]:

    
firsts.prglngth.value_counts().sort_index()









    Out[30]:





0        7
17       5
20       1
21       4
22      14
23       1
24      13
25      17
26      51
27      12
28      76
29      23
30     204
31      42
32     164
33      79
34     150
35     434
36     570
37     688
38    1110
39    5728
40    2218
41    1234
42     700
43     255
44      54
45       5
46       2
47       1
48       2
dtype: int64



In [31]:

    
others.prglngth.value_counts().sort_index()









    Out[31]:





9        1
18       1
19      12
20       7
21       7
22       6
23       3
24       8
25       8
26      32
27      22
28      50
29      30
30     168
31      25
32     156
33      79
34     179
35     474
36     657
37     990
38    1600
39    7732
40    2604
41     796
42     422
43     128
44      37
45       4
46       1
48       5
50       2
dtype: int64



In [32]:

    
len(firsts.prglngth), len(others.prglngth)









    Out[32]:





(13864, 16246)



In [33]:

    
firsts.prglngth.mean(), others.prglngth.mean()









    Out[33]:





(38.6130986728217, 38.518342976732733)



In [34]:

    
diff = firsts.prglngth.mean() - others.prglngth.mean()
diff, diff * 7, diff * 7 * 24









    Out[34]:





(0.094755696088967056, 0.66328987262276939, 15.918956942946465)



In [35]:

    
firsts.prglngth.std(), others.prglngth.std()









    Out[35]:





(2.8703583448993912, 2.3963914827683821)



In [36]:

    
se1 = firsts.prglngth.std() / math.sqrt(len(firsts.prglngth))
se1









    Out[36]:





0.024377650370258221



In [37]:

    
se2 = others.prglngth.std() / math.sqrt(len(others.prglngth))
se2









    Out[37]:





0.01880115556593805



In [38]:

    
diff / se1, diff / se2









    Out[38]:





(3.8869905282000872, 5.0398868174164484)



In [39]:

    
cdf_firsts = thinkstats2.Cdf(firsts.prglngth, label='firsts')

thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc='upper left')



In [40]:

    
cdf_others = thinkstats2.Cdf(others.prglngth, label='others')

thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc='upper left')



In [41]:

    
jitter = 0.5

length_firsts = thinkstats2.Jitter(firsts.prglngth, jitter)
cdf_firsts = thinkstats2.Cdf(length_firsts, label='firsts')

length_others = thinkstats2.Jitter(others.prglngth, jitter)
cdf_others = thinkstats2.Cdf(length_others, label='others')

thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 xlim=[30, 45],
                 ylabel='CDF',
                 loc='upper left')



In [42]:

    
def RemainingDurationPmf(pmf, weeks):
    """Returns PMF of remaining duration conditioned on weeks.
    
    pmf: PMF of pregnancy length
    weeks: current weeks of pregnancy
    """
    new = thinkstats2.Pmf(label=pmf.label)
    for x, p in pmf.Items():
        if x >= weeks:
            new[x - weeks] = p
    new.Normalize()
    return new



In [43]:

    
pmf_firsts = thinkstats2.Pmf(length_firsts, label='firsts')
pmf_others = thinkstats2.Pmf(length_others, label='others')



In [44]:

    
def PlotRemaining(pmfs, weeks):
    """Plot CDF of remaining weeks.
    
    pmfs: list of PMF
    weeks: current weeks of pregnancy
    """
    cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
            for pmf in pmfs]

    thinkplot.PrePlot(len(cdfs))
    thinkplot.Cdfs(cdfs)



In [45]:

    
loc = 'lower right'
PlotRemaining([pmf_firsts, pmf_others], 30)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)



In [46]:

    
PlotRemaining([pmf_firsts, pmf_others], 32)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)



In [47]:

    
PlotRemaining([pmf_firsts, pmf_others], 34)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)



In [48]:

    
PlotRemaining([pmf_firsts, pmf_others], 36)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)



In [49]:

    
PlotRemaining([pmf_firsts, pmf_others], 38)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)



In [50]:

    
PlotRemaining([pmf_firsts, pmf_others], 39)
thinkplot.Config(xlabel='pregnancy length (weeks)',
                 ylabel='CDF',
                 loc=loc)



In [51]:

    
def PercentilesRemaining(pmfs, weeks):
    cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
            for pmf in pmfs]
    
    for cdf in cdfs:
        print([cdf.Percentile(p) * 7 for p in [25, 50, 75]])



In [52]:

    
PercentilesRemaining([pmf_firsts, pmf_others], 39)









    



[2.7143548802028761, 6.4367781041855068, 13.206703169606556]
[2.1736408667971858, 4.9317939465359615, 9.3739933152324184]



In [53]:

    
percentiles = [50, 75, 25]

def PercentilesRemaining(pmf, ts):
    array = np.zeros((len(ts), len(percentiles)))
    
    for i, t in enumerate(ts):
        cdf = RemainingDurationPmf(pmf, t).MakeCdf()
        ps = [cdf.Percentile(p) * 7 for p in percentiles]
        #print(t, ps)
        array[i, :] = ps
        
    return array



In [54]:

    
ts = np.arange(36, 44)
ps_firsts = PercentilesRemaining(pmf_firsts, ts)
ps_firsts









    Out[54]:





array([[ 22.64578082,  29.14642029,  17.95618349],
       [ 16.10550278,  22.63422464,  11.78280693],
       [  9.97194385,  16.53705261,   5.9922938 ],
       [  6.4367781 ,  13.20670317,   2.71435488],
       [  6.76981776,  12.44368589,   2.86739126],
       [  5.59852858,  10.38567513,   2.37625612],
       [  4.38732448,   7.85479168,   2.06292504],
       [  2.57863214,   6.10710894,   1.19931778]])



In [55]:

    
ps_others = PercentilesRemaining(pmf_others, ts)
ps_others









    Out[55]:





array([[ 21.60271001,  26.40089751,  17.18171224],
       [ 15.00369008,  19.79528516,  11.06307321],
       [  8.87411349,  13.48416292,   5.44552693],
       [  4.93179395,   9.37399332,   2.17364087],
       [  4.65239838,   9.90509012,   1.87870862],
       [  5.5198786 ,   9.65339663,   2.22971555],
       [  4.32478024,   7.90697449,   1.72204552],
       [  4.01518618,   7.66307581,   1.71902067]])



In [56]:

    
nrows, ncols = ps_firsts.shape

thinkplot.PrePlot(3)
for i in range(ncols):
    thinkplot.Plot(ts, ps_firsts[:, i], label=percentiles[i])

thinkplot.PrePlot(3)
for i in range(ncols):
    thinkplot.Plot(ts, ps_others[:, i], linestyle='dashed')
    
thinkplot.Config(xlabel='t (weeks)',
                 ylabel='remaining time (days)',
                 legend=True,
                 loc='upper right')



In [57]:

    
sample = Resample(pregs)
sample.shape









    Out[57]:





(30110, 6)



In [59]:

    
firsts = sample[sample.birthord == 1] 
others = sample[sample.birthord > 1]



In [60]:

    
jitter = 0.5
lengths = thinkstats2.Jitter(firsts.prglngth, jitter)
pmf_firsts = thinkstats2.Pmf(lengths, label='firsts')

lengths = thinkstats2.Jitter(others.prglngth, jitter)
pmf_others = thinkstats2.Pmf(lengths, label='others')



In [61]:

    
def SelectFirsts(preg):
    return preg[preg.birthord == 1]



In [62]:

    
def SelectOthers(preg):
    return preg[preg.birthord > 1]



In [63]:

    
def MeanRemaining(pmf, t):
    rem_pmf = RemainingDurationPmf(pmf, t)
    mean = rem_pmf.Mean()
    return mean

MeanRemaining(pmf_firsts, 36)









    Out[63]:





3.4346722690706315



In [64]:

    
def MedianRemaining(pmf, t):
    rem_pmf = RemainingDurationPmf(pmf, t)
    rem_cdf = rem_pmf.MakeCdf()
    median = rem_cdf.Percentile(50)
    return median

MedianRemaining(pmf_firsts, 36)









    Out[64]:





3.2662877239065011



In [65]:

    
def ProbRemainingLess(pmf, t, weeks=1):
    rem_pmf = RemainingDurationPmf(pmf, t)
    rem_cdf = rem_pmf.MakeCdf()
    p = rem_cdf[weeks]
    return p

ProbRemainingLess(pmf_firsts, 36)









    Out[65]:





0.048448823385729728



In [66]:

    
def Remaining(preg, iters, ts, select_func, stat_func):
    """Computes remaining lifetime versus weeks of pregnancy.
    
    preg: DataFrame of pregnancy data
    iters: int, how many resampling iterations to run
    ts: sequence of float, times to evaluate, in weeks
    select_func: function that selects a subset of pregnancies
    stat_func: function that computes summary statistic
    
    returns: array with one row for each percentile, one column
             for each item in ts
    """
    # make an array to hold one row per iteration,
    # one col per element of ts
    array = np.zeros((iters, len(ts)))

    for i in range(iters):
        sample = Resample(pregs)
        group = select_func(sample)

        lengths = thinkstats2.Jitter(group.prglngth, jitter=0.5)
        pmf = thinkstats2.Pmf(lengths)

        stats = [stat_func(pmf, t) for t in ts]
        array[i, :] = stats

    array = np.sort(array, axis=0)
    rows = [thinkstats2.PercentileRow(array, p) for p in [10, 50, 90]]
    return rows



In [71]:

    
def PlotStats(stat_func, iters, ts):
    """Plots a summary statistic versus weeks of pregnancy.
    
    stat_func: function that computes summary statistic
    iters: int, how many resampling iterations to run
    ts: sequence of float, times to evaluate, in weeks
    """
    # list of labels and select_funcs
    labels = [('firsts', SelectFirsts), 
              ('others', SelectOthers)]

    thinkplot.PrePlot(2)
    for label, select_func in labels:
        rows = Remaining(pmf_firsts, iters, ts, select_func, stat_func)
        thinkplot.FillBetween(ts, rows[0], rows[2], color='gray')
        thinkplot.Plot(ts, rows[1], label=label)
        print(label)
        for t, stat in zip(ts, rows[1]):
            print(t, stat)



In [74]:

    
iters = 101
ts = np.arange(36, 43.5, 0.5)
PlotStats(MeanRemaining, iters, ts)

thinkplot.Config(xlabel='week of pregnancy',
                 ylabel='remaining time (weeks)',
                 legend=True,
                 loc='upper right')

thinkplot.Save(root='first1', formats=['pdf', 'png'])









    



firsts
36.0 3.39754156441
36.5 2.97508980334
37.0 2.55456121193
37.5 2.13886585341
38.0 1.74537235835
38.5 1.42681323973
39.0 1.26628280514
39.5 1.23078085688
40.0 1.17889460956
40.5 1.09181678181
41.0 0.980629908801
41.5 0.862167177764
42.0 0.751218876485
42.5 0.68828895529
43.0 0.629491972579
others
36.0 3.13540208072
36.5 2.70930346011
37.0 2.29039746926
37.5 1.88055601698
38.0 1.49280931068
38.5 1.17409914359
39.0 1.0078923987
39.5 0.985300112973
40.0 0.998127077118
40.5 1.01971112502
41.0 1.01329044103
41.5 0.95862350461
42.0 0.911055084446
42.5 0.911300869622
43.0 0.98479776888
Writing first1.pdf
Writing first1.png






    





<matplotlib.figure.Figure at 0x7f576f2e0c50>



In [75]:

    
PlotStats(MedianRemaining, iters, ts)

thinkplot.Config(xlabel='t (weeks)',
                 ylabel='week of pregnancy',
                 legend=True,
                 loc='upper right')

thinkplot.Save(root='first2', formats=['pdf', 'png'])









    



firsts
36.0 3.241368552
36.5 2.77492320164
37.0 2.31279655389
37.5 1.86185466683
38.0 1.43469000445
38.5 1.08499873418
39.0 0.928918281925
39.5 0.942092187176
40.0 0.944034901894
40.5 0.893216546978
41.0 0.794109392844
41.5 0.680121224128
42.0 0.569214418639
42.5 0.506499558605
43.0 0.443257768096
others
36.0 3.08268446578
36.5 2.61050684262
37.0 2.14660335686
37.5 1.69113799843
38.0 1.26087655638
38.5 0.90146729939
39.0 0.707699287694
39.5 0.669730616568
40.0 0.675815864934
40.5 0.735595662149
41.0 0.753391859416
41.5 0.699055415717
42.0 0.623727458023
42.5 0.565583628248
43.0 0.515788745635
Writing first2.pdf
Writing first2.png






    





<matplotlib.figure.Figure at 0x7f57700b1490>



In [76]:

    
PlotStats(ProbRemainingLess, iters, ts)

thinkplot.Config(xlabel='week of pregnancy',
                 ylabel='prob(birth within 1 week)',
                 legend=True,
                 loc='upper left')

thinkplot.Save(root='first3', formats=['pdf', 'png'])









    



firsts
36.0 0.0518651832461
36.5 0.0631412127784
37.0 0.087215320911
37.5 0.157749488115
38.0 0.311731315043
38.5 0.466408539138
39.0 0.524979746152
39.5 0.521584861029
40.0 0.519954389966
40.5 0.54363256785
41.0 0.592375366569
41.5 0.665464382326
42.0 0.723489932886
42.5 0.774373259053
43.0 0.815533980583
others
36.0 0.0570483391844
36.5 0.0735008328706
37.0 0.10406256789
37.5 0.18672074508
38.0 0.364137483787
38.5 0.547933884298
39.0 0.631398013751
39.5 0.644706857261
40.0 0.631614654003
40.5 0.607536231884
41.0 0.608486017358
41.5 0.648367952522
42.0 0.687192118227
42.5 0.721030042918
43.0 0.739837398374
Writing first3.pdf
Writing first3.png






    





<matplotlib.figure.Figure at 0x7f579d60d610>



In [ ]: