validate_nsfg


Validate NSFG data


In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns

import math

import matplotlib.pyplot as plt

import thinkstats2
import thinkplot
from thinkstats2 import Cdf

import survival
import marriage

Validating the 1982 data:


In [2]:
resp3 = marriage.ReadFemResp1982()
marriage.Validate1982(resp3)
resp3.shape


Out[2]:
(7969, 24)

In [3]:
resp3.head()


Out[3]:
finalwgt ageint mar2p cmmarrhx fmarital cmintvw cmbirth f18m1 cmdivorcx cmstphsbx ... widowed stillma cycle agemarry age missing year age_index agemarry_index birth_index
0 59452 25 NaN NaN 6 997 690 NaN NaN NaN ... False False 3 NaN 25.583333 False 57 25.0 NaN 50
1 16147 27 2258.0 944.0 5 997 673 5.0 NaN NaN ... False False 3 22.583333 27.000000 False 56 27.0 22.0 50
2 14618 39 2450.0 819.0 1 997 525 NaN NaN NaN ... False True 3 24.500000 39.333333 False 43 39.0 24.0 40
3 24772 26 NaN NaN 6 993 681 NaN NaN NaN ... False False 3 NaN 26.000000 False 56 26.0 NaN 50
4 66226 17 NaN NaN 6 997 790 NaN NaN NaN ... False False 3 NaN 17.250000 False 65 17.0 NaN 60

5 rows × 24 columns


In [4]:
sum(resp3.fmarno.isnull())


Out[4]:
3148

In [5]:
resp3.fmarno.value_counts().sort_index()


Out[5]:
0.0     170
1.0    3953
2.0     612
3.0      77
4.0       5
5.0       2
8.0       2
Name: fmarno, dtype: int64

In [6]:
resp3.widowed.value_counts()


Out[6]:
False    7850
True      119
Name: widowed, dtype: int64

In [ ]:


In [7]:
# Left: number of respondents whose cmdivorcx is null. Right: hand-entered
# tally the left count is compared against (presumably summed from published
# codebook frequencies — TODO confirm the source of 6841, 29, 5 and 56).
sum(resp3.cmdivorcx.isnull()), 6841+29+5+56


Out[7]:
(6931, 6931)

In [8]:
sum((resp3.cmdivorcx >= 400) & (resp3.cmdivorcx <= 948)), 583+128


Out[8]:
(711, 711)

In [9]:
sum((resp3.cmdivorcx >= 949) & (resp3.cmdivorcx <= 1000)), 311+16


Out[9]:
(327, 327)

In [10]:
sum(resp3.cmstphsbx.isnull()), 7571+1+6+31


Out[10]:
(7609, 7609)

In [11]:
sum((resp3.cmstphsbx >= 400) & (resp3.cmstphsbx <= 948)), 125+33


Out[11]:
(158, 158)

In [12]:
sum((resp3.cmstphsbx >= 949) & (resp3.cmstphsbx <= 1000)), 197+5


Out[12]:
(202, 202)

In [13]:
sum(resp3.divorced)


Out[13]:
1122

In [14]:
sum(~resp3.cmstphsbx.isnull())


Out[14]:
360

In [15]:
sum(~resp3.cmdivorcx.isnull())


Out[15]:
1038

Validating the 1988 data:


In [16]:
resp4 = marriage.ReadFemResp1988()
marriage.Validate1988(resp4)
resp4.shape


Out[16]:
(8450, 24)

In [17]:
resp4.fmarno.describe()


Out[17]:
count    8450.000000
mean        0.761183
std         0.711908
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         5.000000
Name: fmarno, dtype: float64

In [18]:
resp4.fmarno.value_counts().sort_index()


Out[18]:
0    3160
1    4325
2     829
3     108
4      15
5      13
Name: fmarno, dtype: int64

In [19]:
resp4.evrmarry.value_counts().sort_index()


Out[19]:
False    3160
True     5290
Name: evrmarry, dtype: int64

In [20]:
resp4.separated.value_counts().sort_index()


Out[20]:
False    8120
True      330
Name: separated, dtype: int64

In [21]:
resp4.head()


Out[21]:
finalwgt ageint currentcm firstcm cmintvw cmbirth f23m1 cmdivorcx cmstphsbx fmarno ... widowed stillma cycle agemarry age missing year age_index agemarry_index birth_index
0 713792 28 NaN NaN 1061 723 NaN NaN NaN 0 ... False False 4 NaN 28.166667 False 60 28.0 NaN 60
1 367022 37 NaN NaN 1059 614 NaN NaN NaN 0 ... False False 4 NaN 37.083333 False 51 37.0 NaN 50
2 975924 21 NaN NaN 1057 796 NaN NaN NaN 0 ... False False 4 NaN 21.750000 False 66 21.0 NaN 60
3 587796 39 NaN 838.0 1057 581 2.0 943.0 NaN 1 ... False False 4 21.416667 39.666667 False 48 39.0 21.0 40
4 719633 31 974.0 882.0 1062 683 2.0 930.0 NaN 2 ... False False 4 16.583333 31.583333 False 56 31.0 16.0 50

5 rows × 24 columns


In [22]:
sum(resp4.cmdivorcx.isnull()), 6897+77


Out[22]:
(6974, 6974)

In [23]:
sum((resp4.cmdivorcx >= 685) & (resp4.cmdivorcx <= 1008)), 903+152


Out[23]:
(1055, 1055)

In [24]:
sum((resp4.cmdivorcx >= 1009) & (resp4.cmdivorcx <= 1064)), 399+22


Out[24]:
(421, 421)

In [25]:
sum(resp4.cmstphsbx.isnull()), 8113+29


Out[25]:
(8142, 8142)

In [26]:
sum((resp4.cmstphsbx >= 685) & (resp4.cmstphsbx <= 1008)), 75+31


Out[26]:
(106, 106)

In [27]:
sum((resp4.cmstphsbx >= 1009) & (resp4.cmstphsbx <= 1064)), 193+9


Out[27]:
(202, 202)

In [28]:
sum(resp4.divorced)


Out[28]:
1553

In [29]:
sum(~resp4.cmstphsbx.isnull())


Out[29]:
308

In [30]:
sum(~resp4.cmdivorcx.isnull())


Out[30]:
1476

Validating the 1995 data:


In [31]:
resp5 = marriage.ReadFemResp1995()
marriage.Validate1995(resp5)
resp5.shape


Out[31]:
(10847, 22)

In [32]:
resp5.head()


Out[32]:
cmintvw timesmar cmmarrhx cmbirth finalwgt marend01 cmdivorcx cmstphsbx marstat evrmarry ... widowed stillma cycle agemarry age missing year age_index agemarry_index birth_index
0 1149 NaN NaN 837 6776.3796 NaN NaN NaN 5 False ... False False 5 NaN 26.000000 False 69 26.0 NaN 60
1 1142 1.0 1024.0 781 841.8221 NaN NaN NaN 4 True ... False True 5 20.250000 30.083333 False 65 30.0 20.0 60
2 1145 NaN NaN 907 3617.3637 NaN NaN NaN 5 False ... False False 5 NaN 19.833333 False 75 19.0 NaN 70
3 1145 1.0 1133.0 771 10788.1271 NaN NaN NaN 4 True ... False True 5 30.166667 31.166667 False 64 31.0 30.0 60
4 1144 NaN NaN 927 5359.4150 NaN NaN NaN 5 False ... False False 5 NaN 18.083333 False 77 18.0 NaN 70

5 rows × 22 columns


In [33]:
sum(resp5.cmdivorcx.isnull()), 9601+1+1+22


Out[33]:
(9625, 9625)

In [34]:
sum((resp5.cmdivorcx >= 780) & (resp5.cmdivorcx <= 1092)), 1116


Out[34]:
(1116, 1116)

In [35]:
sum((resp5.cmdivorcx >= 1093) & (resp5.cmdivorcx <= 1150)), 106


Out[35]:
(106, 106)

In [36]:
sum(resp5.cmstphsbx.isnull()), 9601+1+15


Out[36]:
(9617, 9617)

In [37]:
sum((resp5.cmstphsbx >= 780) & (resp5.cmstphsbx <= 1092)), 1167


Out[37]:
(1167, 1167)

In [38]:
sum((resp5.cmstphsbx >= 1093) & (resp5.cmstphsbx <= 1150)), 63


Out[38]:
(63, 63)

In [39]:
resp5.marend01.value_counts().sort_index()


Out[39]:
1.0      33
2.0    1223
3.0      23
7.0       2
9.0       1
Name: marend01, dtype: int64

In [40]:
# NOTE(review): the saved result (33) equals the marend01 == 1 count, whereas
# marend01 == 2 has 1223 rows and 1222 respondents have a non-null cmdivorcx —
# presumably `divorced` keys on the wrong marend01 code for this cycle;
# confirm in the marriage module.
sum(resp5.divorced)


Out[40]:
33

In [41]:
sum(~resp5.cmstphsbx.isnull())


Out[41]:
1230

In [42]:
sum(~resp5.cmdivorcx.isnull())


Out[42]:
1222

In [43]:
sum((resp5.marend01==2) & (resp5.cmdivorcx.isnull()))


Out[43]:
19

In [44]:
sum((resp5.marend01==3) & (resp5.cmdivorcx.isnull()))


Out[44]:
5

In [45]:
sum((~resp5.divorced) & (~resp5.cmstphsbx.isnull()))


Out[45]:
1230

In [46]:
resp5.marstat.value_counts().sort_index()


Out[46]:
1    5291
2      93
3    1008
4     452
5    4003
Name: marstat, dtype: int64

In [47]:
sum((resp5.marstat==4) & (~resp5.divorced))   # separated and not divorced


Out[47]:
446

In [48]:
sum((resp5.marstat==4) & (~resp5.divorced) & (~resp5.cmstphsbx.isnull()))


Out[48]:
90

Validating the 2002 data:


In [49]:
resp6 = marriage.ReadFemResp2002()
marriage.Validate2002(resp6)
resp6.shape


Out[49]:
(7643, 26)

In [50]:
resp6.head()


Out[50]:
caseid cmbirth evrmarry cmmarrhx cmdivorcx parity rmarital fmarno mardat01 mardis01 ... widowed stillma cycle agemarry age missing year age_index agemarry_index birth_index
0 2298 902 False NaN NaN 4 2 0 NaN NaN ... False False 6 NaN 27.666667 False 75 27.0 NaN 70
1 5012 718 True 974.0 1077.0 1 1 2 974.0 1077.0 ... False False 6 21.333333 42.916667 False 59 42.0 21.0 50
2 11586 708 True 910.0 938.0 1 4 1 910.0 921.0 ... False False 6 16.833333 43.833333 False 58 43.0 16.0 50
3 6794 1042 False NaN NaN 0 6 0 NaN NaN ... False False 6 NaN 16.000000 False 86 16.0 NaN 80
4 616 991 False NaN NaN 0 6 0 NaN NaN ... False False 6 NaN 20.166667 False 82 20.0 NaN 80

5 rows × 26 columns


In [51]:
# Left: respondents whose cmdivorcx is null. Right: hand-entered reference tally.
# NOTE(review): in the saved output these differ (6639 vs 6661) by exactly the
# 2+20 terms — presumably those cases carry special codes that were not mapped
# to NaN for this cycle; confirm against marriage.ReadFemResp2002.
sum(resp6.cmdivorcx.isnull()), 6639+2+20


Out[51]:
(6639, 6661)

In [52]:
sum((resp6.cmdivorcx >= 301) & (resp6.cmdivorcx <= 1164)), 607


Out[52]:
(607, 607)

In [53]:
sum((resp6.cmdivorcx >= 1165) & (resp6.cmdivorcx <= 1239)), 375


Out[53]:
(375, 375)

In [54]:
resp6.marend01.value_counts().sort_index()


Out[54]:
1.0    1232
2.0     260
3.0      58
Name: marend01, dtype: int64

In [55]:
sum(resp6.divorced), sum(resp6.loc[resp6.divorced, 'mar1diss'].isnull())


Out[55]:
(1232, 0)

In [56]:
sum(resp6.separated), sum(resp6.loc[resp6.separated, 'mar1diss'].isnull())


Out[56]:
(260, 0)

In [57]:
sum(resp6.widowed), sum(resp6.loc[resp6.widowed, 'mar1diss'].isnull())


Out[57]:
(58, 0)

In [58]:
sum(resp6.stillma), sum(resp6.loc[resp6.stillma, 'mar1diss'].isnull())


Out[58]:
(2576, 0)

In [59]:
sum(resp6.evrmarry), 1232+260+58+2576


Out[59]:
(4126, 4126)

Validating the 2010 data:


In [60]:
resp7 = marriage.ReadFemResp2010()
marriage.Validate2010(resp7)
resp7.shape


Out[60]:
(12279, 27)

In [61]:
resp7.head()


Out[61]:
caseid cmbirth evrmarry cmmarrhx cmdivorcx rmarital parity fmarno mardat01 mardis01 ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 34156 799 False NaN NaN 6 0 0 NaN NaN ... False 2571.376599 7 NaN 43.666667 False 66 43.0 NaN 60
1 40081 925 True 1314.0 NaN 1 0 1 1314.0 NaN ... True 11716.317848 7 32.416667 33.166667 False 77 33.0 32.0 70
2 32817 958 False NaN NaN 6 0 0 NaN NaN ... False 6794.156444 7 NaN 27.416667 False 79 27.0 NaN 70
3 39968 869 False NaN NaN 6 0 0 NaN NaN ... False 5469.435481 7 NaN 37.833333 False 72 37.0 NaN 70
4 27121 1004 False NaN NaN 6 0 0 NaN NaN ... False 6544.538107 7 NaN 23.416667 False 83 23.0 NaN 80

5 rows × 27 columns


In [62]:
# Left: respondents whose cmdivorcx is null. Right: hand-entered reference tally.
# NOTE(review): in the saved output these differ (10705 vs 10725) by exactly
# the 1+19 terms — presumably those cases carry special codes that were not
# mapped to NaN for this cycle; confirm against marriage.ReadFemResp2010.
sum(resp7.cmdivorcx.isnull()), 10705+1+19


Out[62]:
(10705, 10725)

In [63]:
sum((resp7.cmdivorcx >= 522) & (resp7.cmdivorcx <= 1278)), 1286


Out[63]:
(1286, 1286)

In [64]:
sum((resp7.cmdivorcx >= 1279) & (resp7.cmdivorcx <= 1290)), 106


Out[64]:
(106, 106)

In [65]:
sum((resp7.cmdivorcx >= 1291) & (resp7.cmdivorcx <= 1308)), 112


Out[65]:
(112, 112)

In [66]:
sum((resp7.cmdivorcx >= 1309) & (resp7.cmdivorcx <= 1326)), 50


Out[66]:
(50, 50)

In [67]:
resp7.marend01.value_counts().sort_index()


Out[67]:
1.0    1574
2.0     405
3.0      68
Name: marend01, dtype: int64

In [68]:
sum(resp7.divorced), sum(resp7.loc[resp7.divorced, 'mar1diss'].isnull())


Out[68]:
(1574, 0)

In [69]:
sum(resp7.separated), sum(resp7.loc[resp7.separated, 'mar1diss'].isnull())


Out[69]:
(405, 0)

In [70]:
sum(resp7.widowed), sum(resp7.loc[resp7.widowed, 'mar1diss'].isnull())


Out[70]:
(68, 0)

In [71]:
sum(resp7.stillma), sum(resp7.loc[resp7.stillma, 'mar1diss'].isnull())


Out[71]:
(3487, 0)

In [72]:
sum(resp7.evrmarry), 1574+405+68+3487


Out[72]:
(5534, 5534)

Validating the 2013 data:


In [73]:
resp8 = marriage.ReadFemResp2013()
marriage.Validate2013(resp8)
resp8.shape


Out[73]:
(5601, 27)

In [74]:
resp8.head()


Out[74]:
caseid cmbirth evrmarry cmmarrhx cmdivorcx rmarital parity fmarno mardat01 mardis01 ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 50002 922 True 1303.0 NaN 5 1 1 1303.0 1353.0 ... False 1973.648419 8 31.750000 36.250000 False 76 36.0 31.0 70
1 50004 986 True 1215.0 1243.0 5 0 2 1215.0 1219.0 ... False 2584.350089 8 19.083333 29.750000 False 82 29.0 19.0 80
2 50005 1001 False NaN NaN 2 1 0 NaN NaN ... False 6533.272794 8 NaN 30.000000 False 83 30.0 NaN 80
3 50008 834 False NaN NaN 6 3 0 NaN NaN ... False 2543.439624 8 NaN 43.333333 False 69 43.0 NaN 60
4 50013 1127 False NaN NaN 6 0 0 NaN NaN ... False 6283.773455 8 NaN 19.750000 False 93 19.0 NaN 90

5 rows × 27 columns


In [75]:
# Left: respondents whose cmdivorcx is null. Right: hand-entered reference tally.
# NOTE(review): in the saved output these differ (4851 vs 4877) by exactly the
# 2+24 terms — presumably those cases carry special codes that were not mapped
# to NaN for this cycle; confirm against marriage.ReadFemResp2013.
sum(resp8.cmdivorcx.isnull()), 4851+2+24


Out[75]:
(4851, 4877)

In [76]:
sum((resp8.cmdivorcx >= 380) & (resp8.cmdivorcx <= 1340)), 658


Out[76]:
(658, 658)

In [77]:
sum((resp8.cmdivorcx >= 1341) & (resp8.cmdivorcx <= 1352)), 48


Out[77]:
(48, 48)

In [78]:
sum((resp8.cmdivorcx >= 1353) & (resp8.cmdivorcx <= 1365)), 18


Out[78]:
(18, 18)

In [79]:
resp8.marend01.value_counts().sort_index()


Out[79]:
1.0    755
2.0    214
3.0     26
Name: marend01, dtype: int64

In [80]:
sum(resp8.divorced), sum(resp8.loc[resp8.divorced, 'mar1diss'].isnull())


Out[80]:
(755, 0)

In [81]:
sum(resp8.separated), sum(resp8.loc[resp8.separated, 'mar1diss'].isnull())


Out[81]:
(214, 0)

In [82]:
sum(resp8.widowed), sum(resp8.loc[resp8.widowed, 'mar1diss'].isnull())


Out[82]:
(26, 0)

In [83]:
sum(resp8.stillma), sum(resp8.loc[resp8.stillma, 'mar1diss'].isnull())


Out[83]:
(1457, 0)

In [84]:
sum(resp8.evrmarry), 755+214+26+1457


Out[84]:
(2452, 2452)

Validating the 2015 data:


In [85]:
resp9 = marriage.ReadFemResp2015()
marriage.Validate2015(resp9)
resp9.shape


Out[85]:
(5699, 27)

In [86]:
resp9.head()


Out[86]:
caseid cmbirth evrmarry cmmarrhx cmdivorcx rmarital parity fmarno mardat01 mardis01 ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 60418 921 True 1159.0 1269.0 2 3 1 1159.0 1258.0 ... False 3554.964843 9 19.833333 38.333333 False 76 38.0 19.0 70
1 60419 942 True 1207.0 1253.0 2 0 1 1207.0 1253.0 ... False 2484.535358 9 22.083333 37.166667 False 78 37.0 22.0 70
2 60420 981 False NaN NaN 6 1 0 NaN NaN ... False 2903.782914 9 NaN 33.916667 False 81 33.0 NaN 80
3 60423 1010 False NaN NaN 6 2 0 NaN NaN ... False 2738.498331 9 NaN 31.416667 False 84 31.0 NaN 80
4 60426 897 True 1256.0 NaN 1 1 1 1256.0 NaN ... True 9682.211381 9 29.916667 39.250000 False 74 39.0 29.0 70

5 rows × 27 columns


In [87]:
# Left: respondents whose cmdivorcx is null. Right: hand-entered reference tally.
# NOTE(review): in the saved output these differ (4946 vs 4967) by exactly the
# 3+18 terms — presumably those cases carry special codes that were not mapped
# to NaN for this cycle; confirm against marriage.ReadFemResp2015.
sum(resp9.cmdivorcx.isnull()), 4946+3+18


Out[87]:
(4946, 4967)

In [88]:
sum((resp9.cmdivorcx >= 378) & (resp9.cmdivorcx <= 1340)), 563


Out[88]:
(563, 563)

In [89]:
sum((resp9.cmdivorcx >= 1341) & (resp9.cmdivorcx <= 1352)), 56


Out[89]:
(56, 56)

In [90]:
sum((resp9.cmdivorcx >= 1353) & (resp9.cmdivorcx <= 1364)), 48


Out[90]:
(48, 48)

In [91]:
sum((resp9.cmdivorcx >= 1365) & (resp9.cmdivorcx <= 1376)), 46


Out[91]:
(46, 46)

In [92]:
sum((resp9.cmdivorcx >= 1377) & (resp9.cmdivorcx <= 1389)), 19


Out[92]:
(19, 19)

In [93]:
resp9.marend01.value_counts().sort_index()


Out[93]:
1.0    756
2.0    169
3.0     28
Name: marend01, dtype: int64

In [94]:
sum(resp9.divorced), sum(resp9.loc[resp9.divorced, 'mar1diss'].isnull())


Out[94]:
(756, 0)

In [95]:
sum(resp9.separated), sum(resp9.loc[resp9.separated, 'mar1diss'].isnull())


Out[95]:
(169, 0)

In [96]:
sum(resp9.widowed), sum(resp9.loc[resp9.widowed, 'mar1diss'].isnull())


Out[96]:
(28, 0)

In [97]:
sum(resp9.stillma), sum(resp9.loc[resp9.stillma, 'mar1diss'].isnull())


Out[97]:
(1448, 0)

In [98]:
sum(resp9.evrmarry), 756+169+28+1448


Out[98]:
(2401, 2401)

Validating the 2017 data:


In [99]:
resp10 = marriage.ReadFemResp2017()
marriage.Validate2017(resp10)

In [100]:
resp10.shape


Out[100]:
(5554, 27)

In [101]:
sum(resp10.evrmarry)


Out[101]:
2582

In [102]:
resp10.agemarry.value_counts().max()


Out[102]:
29

In [103]:
resp10.head()


Out[103]:
caseid evrmarry ager rmarital parity fmarno mardat01 mardis01 marend01 mar1diss ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 70623 False 16 6 0 0 NaN NaN NaN NaN ... False 3631.827872 10 NaN 16.0 False 99 16.0 NaN 90
1 70624 True 40 4 0 1 2004.0 2007.0 1.0 3.0 ... False 18827.876206 10 27.833333 40.0 False 76 40.0 27.0 70
2 70625 False 15 6 0 0 NaN NaN NaN NaN ... False 3354.443830 10 NaN 15.0 False 101 15.0 NaN 100
3 70627 True 35 1 2 1 2008.0 NaN NaN 5.0 ... True 19877.457610 10 26.833333 35.0 False 81 35.0 26.0 80
4 70628 True 28 1 3 1 2010.0 NaN NaN 5.0 ... True 4221.017695 10 20.583333 28.0 False 89 28.0 20.0 80

5 rows × 27 columns


In [104]:
thinkplot.Cdf(Cdf(resp10.age))
thinkplot.Config(xlabel='Age', ylabel='CDF')



In [105]:
thinkplot.Cdf(Cdf(resp10.agemarry))
thinkplot.Config(xlabel='Age at first marriage', ylabel='CDF')



In [106]:
thinkplot.Cdf(Cdf(resp10.finalwgt))
thinkplot.Config(xlabel='Sampling weight', ylabel='CDF')


Make a list of DataFrames, one for each cycle:


In [107]:
# Ordered newest cycle first (resp10 is cycle 10, resp3 is cycle 3).
resps = [resp10, resp9, resp8, resp7, resp6, resp5, resp4, resp3]

Make a table showing the number of respondents in each cycle:


In [108]:
def SummarizeCycle(df):
    """Print one LaTeX table row summarizing a survey cycle.

    The row has the form ``# &  [first last] & n & [first last] \\\\``:
    the range of interview years, the number of rows in `df`, and the
    range of birth years.

    df: DataFrame with `cmintvw` and `cmbirth` columns holding month
        counts, converted to calendar years as ``value / 12 + 1900``.
        Only these two columns are read.

    returns: None; the row is written to standard output.
    """
    intvws = np.array([df.cmintvw.min(), df.cmintvw.max()]) / 12 + 1900
    births = np.array([df.cmbirth.min(), df.cmbirth.max()]) / 12 + 1900

    # astype(int) drops the fractional part, leaving the whole calendar year.
    print('# & ', intvws.astype(int), '&', len(df), '&', births.astype(int), r'\\')
    
# resps lists the newest cycle first; reverse it so rows print oldest first.
for resp in reversed(resps):
    SummarizeCycle(resp)


# &  [1982 1983] & 7969 & [1937 1968] \\
# &  [1988 1988] & 8450 & [1943 1973] \\
# &  [1995 1995] & 10847 & [1950 1980] \\
# &  [2002 2003] & 7643 & [1957 1988] \\
# &  [2006 2010] & 12279 & [1961 1995] \\
# &  [2011 2013] & 5601 & [1966 1998] \\
# &  [2013 2015] & 5699 & [1968 2000] \\
# &  [2015 2017] & 5554 & [1966 2002] \\

Check for missing values in agemarry:


In [109]:
def CheckAgeVars(df):
    """Print the number of ever-married rows whose `agemarry` is null.

    df: DataFrame with a boolean `evrmarry` column and an `agemarry` column.
    """
    married = df[df.evrmarry]
    missing_flags = married.agemarry.isnull()
    print(sum(missing_flags))
    
# Prints one count per cycle, in the same order as resps.
for resp in resps:
    CheckAgeVars(resp)


0
11
17
16
37
0
22
0

Combine the DataFrames into one (but remember that the combined data has not been resampled properly):


In [134]:
# ignore_index=True gives the combined frame a fresh 0..n-1 row index;
# sort=False leaves the columns unsorted when the cycles' columns differ.
df = pd.concat(resps, ignore_index=True, sort=False)
len(df)


Out[134]:
64042

Double-check missing data:


In [135]:
sum(df.missing)


Out[135]:
103

Generate a table with the number of respondents in each cohort:


In [136]:
marriage.DigitizeResp(df)
grouped = df.groupby('birth_index')
# One LaTeX table row per birth_index group: group label, number of rows,
# minimum age -- maximum age_index, number of ever-married rows, and number
# of rows flagged missing. A GroupBy is directly iterable, so no iter() call
# is needed.
for name, group in grouped:
    print(name, '&', len(group), '&', int(group.age.min()), '--', int(group.age_index.max()),
                '&', len(group[group.evrmarry]), '&', sum(group.missing), r'\\')


30 & 325 & 42 -- 45 & 310 & 0 \\
40 & 3608 & 32 -- 45 & 3287 & 9 \\
50 & 10631 & 22 -- 45 & 8667 & 18 \\
60 & 14951 & 15 -- 50 & 8813 & 33 \\
70 & 15016 & 14 -- 47 & 8075 & 30 \\
80 & 12397 & 14 -- 37 & 4208 & 13 \\
90 & 6695 & 15 -- 27 & 517 & 0 \\
100 & 419 & 15 -- 17 & 0 & 0 \\

In [138]:
# Pass `key` by keyword: newer pandas releases deprecate positional arguments
# to to_hdf other than the path.
df.to_hdf('FemMarriageData.hdf', key='FemMarriageData')

In [139]:
%time nsfg_female = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')


CPU times: user 14.4 ms, sys: 3.94 ms, total: 18.3 ms
Wall time: 18 ms

Male data


In [8]:
male2017 = marriage.ReadMaleResp2017()
male2017.head()


Out[8]:
caseid evrmarry ager rmarital fmarno mardat01 marend01 mar1diss wgt2015_2017 cmintvw ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 70622 True 29 1 1 2016.0 NaN 1.0 11801.703471 1399 ... True 11801.703471 10 28.416667 29.0 False 87 29.0 28.0 80
1 70626 False 18 6 0 NaN NaN NaN 5153.910962 1391 ... False 5153.910962 10 NaN 18.0 False 97 18.0 NaN 90
2 70629 False 23 6 0 NaN NaN NaN 10293.100099 1405 ... False 10293.100099 10 NaN 23.0 False 94 23.0 NaN 90
3 70631 False 17 6 0 NaN NaN NaN 14236.029179 1403 ... False 14236.029179 10 NaN 17.0 False 99 17.0 NaN 90
4 70636 False 37 6 0 NaN NaN NaN 11320.650257 1393 ... False 11320.650257 10 NaN 37.0 False 79 37.0 NaN 70

5 rows × 25 columns


In [3]:
male2017.agemarry.describe()


Out[3]:
count    1770.000000
mean       25.683663
std         5.292507
min        13.166667
25%        21.750000
50%        24.833333
75%        28.833333
max        47.000000
Name: agemarry, dtype: float64

In [4]:
male2017.age.describe()


Out[4]:
count    4540.000000
mean       30.455286
std        10.145992
min        15.000000
25%        21.000000
50%        30.000000
75%        39.000000
max        50.000000
Name: age, dtype: float64

In [5]:
male2017.missing.describe()


Out[5]:
count      4540
unique        1
top       False
freq       4540
Name: missing, dtype: object

In [6]:
male2017.year.describe()


Out[6]:
count    4540.000000
mean       85.779515
std        10.162161
min        65.000000
25%        77.000000
50%        86.000000
75%        95.000000
max       102.000000
Name: year, dtype: float64

In [ ]:


In [ ]:


In [1]:
male2002 = marriage.ReadMaleResp2002()
male2002.head()


Out[1]:
caseid cmbirth timesmar fmarit evrmarry marrend4 cmdivw mardat01 finalwgt cmintvw ... widowed stillma cycle agemarry age missing year age_index agemarry_index birth_index
0 12178 1052 NaN 5 False NaN NaN NaN 7085.207973 1233 ... False False 6 NaN 15.083333 False 87 15.0 NaN 80
1 3233 701 1.0 1 True NaN NaN 1007.0 24372.450063 1237 ... False True 6 25.500000 44.666667 False 58 44.0 25.0 50
2 12170 707 1.0 1 True NaN NaN 1199.0 7073.132109 1234 ... False True 6 41.000000 43.916667 False 58 43.0 41.0 50
3 6629 1052 NaN 5 False NaN NaN NaN 3105.158992 1234 ... False False 6 NaN 15.166667 False 87 15.0 NaN 80
4 6531 787 1.0 3 True NaN NaN 1107.0 4679.548824 1235 ... False False 6 26.666667 37.333333 False 65 37.0 26.0 60

5 rows × 24 columns


In [2]:
sum(male2002.divorced) + sum(male2002.separated) + sum(male2002.widowed)


Out[2]:
576

In [3]:
sum(male2002.evrmarry)


Out[3]:
1752

In [ ]:


In [ ]:


In [4]:
male2010 = marriage.ReadMaleResp2010()
male2010.head()


Out[4]:
caseid cmbirth evrmarry cmdivw rmarital fmarno mardat01 marend01 mar1diss wgtq1q16 ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 36133 1043 False NaN 6 0 NaN NaN NaN 7270.577055 ... False 7270.577055 7 NaN 23.333333 False 86 23.0 NaN 80
1 36636 1021 False NaN 6 0 NaN NaN NaN 7601.733277 ... False 7601.733277 7 NaN 25.083333 False 85 25.0 NaN 80
2 41125 1118 False NaN 6 0 NaN NaN NaN 4587.699601 ... False 4587.699601 7 NaN 17.000000 False 93 17.0 NaN 90
3 23607 948 False NaN 6 0 NaN NaN NaN 26566.111007 ... False 26566.111007 7 NaN 28.250000 False 78 28.0 NaN 70
4 40814 1018 False NaN 2 0 NaN NaN NaN 5709.710317 ... False 5709.710317 7 NaN 25.333333 False 84 25.0 NaN 80

5 rows × 25 columns


In [5]:
male2013 = marriage.ReadMaleResp2013()
male2013.head()


Out[5]:
caseid cmbirth evrmarry cmdivw rmarital fmarno mardat01 marend01 mar1diss wgt2011_2013 ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 50001 1027 True 1346.0 4 1 1237.0 1.0 87.0 8111.081751 ... False 8111.081751 8 17.500000 26.750000 False 85 26.0 17.0 80
1 50003 1160 False NaN 6 0 NaN NaN NaN 4082.414347 ... False 4082.414347 8 NaN 16.166667 False 96 16.0 NaN 90
2 50006 1025 True NaN 1 1 1335.0 NaN 25.0 25550.455475 ... True 25550.455475 8 25.833333 27.916667 False 85 27.0 25.0 80
3 50007 1028 False NaN 6 0 NaN NaN NaN 6754.912773 ... False 6754.912773 8 NaN 26.666667 False 85 26.0 NaN 80
4 50009 855 True NaN 1 1 1152.0 NaN 210.0 49442.342886 ... True 49442.342886 8 24.750000 42.250000 False 71 42.0 24.0 70

5 rows × 25 columns


In [6]:
male2015 = marriage.ReadMaleResp2015()
male2015.head()


Out[6]:
caseid cmbirth evrmarry cmdivw rmarital fmarno mardat01 marend01 mar1diss wgt2013_2015 ... stillma finalwgt cycle agemarry age missing year age_index agemarry_index birth_index
0 60417 926 False NaN 2 0 NaN NaN NaN 7693.012917 ... False 7693.012917 9 NaN 37.750000 False 77 37.0 NaN 70
1 60421 1161 False NaN 6 0 NaN NaN NaN 9643.271157 ... False 9643.271157 9 NaN 18.000000 False 96 18.0 NaN 90
2 60422 980 False NaN 2 0 NaN NaN NaN 9650.615384 ... False 9650.615384 9 NaN 33.000000 False 81 33.0 NaN 80
3 60424 990 True NaN 4 1 1326.0 1.0 45.0 11590.039167 ... False 11590.039167 9 28.000000 32.166667 False 82 32.0 28.0 80
4 60425 1143 True NaN 1 1 1376.0 NaN 7.0 6864.914673 ... True 6864.914673 9 19.416667 20.000000 False 95 20.0 19.0 90

5 rows × 25 columns


In [10]:
males = [male2002, male2010, male2013, male2015, male2017]
df2 = pd.concat(males, ignore_index=True, sort=False)
len(df2)


Out[10]:
29192

In [11]:
sum(df2.missing)


Out[11]:
0

In [12]:
marriage.DigitizeResp(df2)
grouped = df2.groupby('birth_index')
# One LaTeX table row per birth_index group: group label, number of rows,
# minimum age -- maximum age_index, number of ever-married rows, and number
# of rows flagged missing. A GroupBy is directly iterable, so no iter() call
# is needed.
for name, group in grouped:
    print(name, '&', len(group), '&', int(group.age.min()), '--', int(group.age_index.max()),
                '&', len(group[group.evrmarry]), '&', sum(group.missing), r'\\')


50 & 322 & 42 -- 45 & 224 & 0 \\
60 & 3970 & 32 -- 50 & 2755 & 0 \\
70 & 8282 & 22 -- 47 & 4883 & 0 \\
80 & 9861 & 15 -- 37 & 2445 & 0 \\
90 & 6364 & 15 -- 27 & 243 & 0 \\
100 & 393 & 15 -- 17 & 0 & 0 \\

In [13]:
# `complete` marks respondents who have ever married.
df2['complete'] = df2.evrmarry
# Value consulted for complete rows (agemarry_index) and for the remaining,
# ongoing rows (age_index).
df2['complete_var'] = df2.agemarry_index
df2['ongoing_var'] = df2.age_index
# Flag rows whose relevant value is null. `~` binds tighter than `&`, so the
# second expression reads (~complete) & ongoing_var.isnull().
df2['complete_missing'] = df2.complete & df2.complete_var.isnull()
df2['ongoing_missing'] = ~df2.complete & df2.ongoing_var.isnull()

In [14]:
print(sum(df2.complete_missing), sum(df2.ongoing_missing))


0 0

In [15]:
# Pass `key` by keyword: newer pandas releases deprecate positional arguments
# to to_hdf other than the path.
df2.to_hdf('MaleMarriageData.hdf', key='MaleMarriageData')

In [16]:
%time nsfg_male = pd.read_hdf('MaleMarriageData.hdf', 'MaleMarriageData')


CPU times: user 14.7 ms, sys: 162 µs, total: 14.9 ms
Wall time: 13.8 ms

In [ ]: