In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
import thinkstats2
import thinkplot
from thinkstats2 import Cdf
import survival
import marriage
Validating the 1982 data:
In [2]:
resp3 = marriage.ReadFemResp1982()
marriage.Validate1982(resp3)
resp3.shape
Out[2]:
In [3]:
resp3.head()
Out[3]:
In [4]:
sum(resp3.fmarno.isnull())
Out[4]:
In [5]:
resp3.fmarno.value_counts().sort_index()
Out[5]:
In [6]:
resp3.widowed.value_counts()
Out[6]:
In [ ]:
In [7]:
sum(resp3.cmdivorcx.isnull()), 6841+29+5+56
Out[7]:
In [8]:
sum((resp3.cmdivorcx >= 400) & (resp3.cmdivorcx <= 948)), 583+128
Out[8]:
In [9]:
sum((resp3.cmdivorcx >= 949) & (resp3.cmdivorcx <= 1000)), 311+16
Out[9]:
In [10]:
sum(resp3.cmstphsbx.isnull()), 7571+1+6+31
Out[10]:
In [11]:
sum((resp3.cmstphsbx >= 400) & (resp3.cmstphsbx <= 948)), 125+33
Out[11]:
In [12]:
sum((resp3.cmstphsbx >= 949) & (resp3.cmstphsbx <= 1000)), 197+5
Out[12]:
In [13]:
sum(resp3.divorced)
Out[13]:
In [14]:
sum(~resp3.cmstphsbx.isnull())
Out[14]:
In [15]:
sum(~resp3.cmdivorcx.isnull())
Out[15]:
Validating the 1988 data:
In [16]:
resp4 = marriage.ReadFemResp1988()
marriage.Validate1988(resp4)
resp4.shape
Out[16]:
In [17]:
resp4.fmarno.describe()
Out[17]:
In [18]:
resp4.fmarno.value_counts().sort_index()
Out[18]:
In [19]:
resp4.evrmarry.value_counts().sort_index()
Out[19]:
In [20]:
resp4.separated.value_counts().sort_index()
Out[20]:
In [21]:
resp4.head()
Out[21]:
In [22]:
sum(resp4.cmdivorcx.isnull()), 6897+77
Out[22]:
In [23]:
sum((resp4.cmdivorcx >= 685) & (resp4.cmdivorcx <= 1008)), 903+152
Out[23]:
In [24]:
sum((resp4.cmdivorcx >= 1009) & (resp4.cmdivorcx <= 1064)), 399+22
Out[24]:
In [25]:
sum(resp4.cmstphsbx.isnull()), 8113+29
Out[25]:
In [26]:
sum((resp4.cmstphsbx >= 685) & (resp4.cmstphsbx <= 1008)), 75+31
Out[26]:
In [27]:
sum((resp4.cmstphsbx >= 1009) & (resp4.cmstphsbx <= 1064)), 193+9
Out[27]:
In [28]:
sum(resp4.divorced)
Out[28]:
In [29]:
sum(~resp4.cmstphsbx.isnull())
Out[29]:
In [30]:
sum(~resp4.cmdivorcx.isnull())
Out[30]:
Validating the 1995 data:
In [31]:
resp5 = marriage.ReadFemResp1995()
marriage.Validate1995(resp5)
resp5.shape
Out[31]:
In [32]:
resp5.head()
Out[32]:
In [33]:
sum(resp5.cmdivorcx.isnull()), 9601+1+1+22
Out[33]:
In [34]:
sum((resp5.cmdivorcx >= 780) & (resp5.cmdivorcx <= 1092)), 1116
Out[34]:
In [35]:
sum((resp5.cmdivorcx >= 1093) & (resp5.cmdivorcx <= 1150)), 106
Out[35]:
In [36]:
sum(resp5.cmstphsbx.isnull()), 9601+1+15
Out[36]:
In [37]:
sum((resp5.cmstphsbx >= 780) & (resp5.cmstphsbx <= 1092)), 1167
Out[37]:
In [38]:
sum((resp5.cmstphsbx >= 1093) & (resp5.cmstphsbx <= 1150)), 63
Out[38]:
In [39]:
resp5.marend01.value_counts().sort_index()
Out[39]:
In [40]:
sum(resp5.divorced)
Out[40]:
In [41]:
sum(~resp5.cmstphsbx.isnull())
Out[41]:
In [42]:
sum(~resp5.cmdivorcx.isnull())
Out[42]:
In [43]:
sum((resp5.marend01==2) & (resp5.cmdivorcx.isnull()))
Out[43]:
In [44]:
sum((resp5.marend01==3) & (resp5.cmdivorcx.isnull()))
Out[44]:
In [45]:
sum((~resp5.divorced) & (~resp5.cmstphsbx.isnull()))
Out[45]:
In [46]:
resp5.marstat.value_counts().sort_index()
Out[46]:
In [47]:
sum((resp5.marstat==4) & (~resp5.divorced)) # separated and not divorced
Out[47]:
In [48]:
sum((resp5.marstat==4) & (~resp5.divorced) & (~resp5.cmstphsbx.isnull()))
Out[48]:
Validating the 2002 data:
In [49]:
resp6 = marriage.ReadFemResp2002()
marriage.Validate2002(resp6)
resp6.shape
Out[49]:
In [50]:
resp6.head()
Out[50]:
In [51]:
sum(resp6.cmdivorcx.isnull()), 6639+2+20
Out[51]:
In [52]:
sum((resp6.cmdivorcx >= 301) & (resp6.cmdivorcx <= 1164)), 607
Out[52]:
In [53]:
sum((resp6.cmdivorcx >= 1165) & (resp6.cmdivorcx <= 1239)), 375
Out[53]:
In [54]:
resp6.marend01.value_counts().sort_index()
Out[54]:
In [55]:
sum(resp6.divorced), sum(resp6.loc[resp6.divorced, 'mar1diss'].isnull())
Out[55]:
In [56]:
sum(resp6.separated), sum(resp6.loc[resp6.separated, 'mar1diss'].isnull())
Out[56]:
In [57]:
sum(resp6.widowed), sum(resp6.loc[resp6.widowed, 'mar1diss'].isnull())
Out[57]:
In [58]:
sum(resp6.stillma), sum(resp6.loc[resp6.stillma, 'mar1diss'].isnull())
Out[58]:
In [59]:
sum(resp6.evrmarry), 1232+260+58+2576
Out[59]:
Validating the 2010 data:
In [60]:
resp7 = marriage.ReadFemResp2010()
marriage.Validate2010(resp7)
resp7.shape
Out[60]:
In [61]:
resp7.head()
Out[61]:
In [62]:
sum(resp7.cmdivorcx.isnull()), 10705+1+19
Out[62]:
In [63]:
sum((resp7.cmdivorcx >= 522) & (resp7.cmdivorcx <= 1278)), 1286
Out[63]:
In [64]:
sum((resp7.cmdivorcx >= 1279) & (resp7.cmdivorcx <= 1290)), 106
Out[64]:
In [65]:
sum((resp7.cmdivorcx >= 1291) & (resp7.cmdivorcx <= 1308)), 112
Out[65]:
In [66]:
sum((resp7.cmdivorcx >= 1309) & (resp7.cmdivorcx <= 1326)), 50
Out[66]:
In [67]:
resp7.marend01.value_counts().sort_index()
Out[67]:
In [68]:
sum(resp7.divorced), sum(resp7.loc[resp7.divorced, 'mar1diss'].isnull())
Out[68]:
In [69]:
sum(resp7.separated), sum(resp7.loc[resp7.separated, 'mar1diss'].isnull())
Out[69]:
In [70]:
sum(resp7.widowed), sum(resp7.loc[resp7.widowed, 'mar1diss'].isnull())
Out[70]:
In [71]:
sum(resp7.stillma), sum(resp7.loc[resp7.stillma, 'mar1diss'].isnull())
Out[71]:
In [72]:
sum(resp7.evrmarry), 1574+405+68+3487
Out[72]:
Validating the 2013 data:
In [73]:
resp8 = marriage.ReadFemResp2013()
marriage.Validate2013(resp8)
resp8.shape
Out[73]:
In [74]:
resp8.head()
Out[74]:
In [75]:
sum(resp8.cmdivorcx.isnull()), 4851+2+24
Out[75]:
In [76]:
sum((resp8.cmdivorcx >= 380) & (resp8.cmdivorcx <= 1340)), 658
Out[76]:
In [77]:
sum((resp8.cmdivorcx >= 1341) & (resp8.cmdivorcx <= 1352)), 48
Out[77]:
In [78]:
sum((resp8.cmdivorcx >= 1353) & (resp8.cmdivorcx <= 1365)), 18
Out[78]:
In [79]:
resp8.marend01.value_counts().sort_index()
Out[79]:
In [80]:
sum(resp8.divorced), sum(resp8.loc[resp8.divorced, 'mar1diss'].isnull())
Out[80]:
In [81]:
sum(resp8.separated), sum(resp8.loc[resp8.separated, 'mar1diss'].isnull())
Out[81]:
In [82]:
sum(resp8.widowed), sum(resp8.loc[resp8.widowed, 'mar1diss'].isnull())
Out[82]:
In [83]:
sum(resp8.stillma), sum(resp8.loc[resp8.stillma, 'mar1diss'].isnull())
Out[83]:
In [84]:
sum(resp8.evrmarry), 755+214+26+1457
Out[84]:
Validating the 2015 data:
In [85]:
resp9 = marriage.ReadFemResp2015()
marriage.Validate2015(resp9)
resp9.shape
Out[85]:
In [86]:
resp9.head()
Out[86]:
In [87]:
sum(resp9.cmdivorcx.isnull()), 4946+3+18
Out[87]:
In [88]:
sum((resp9.cmdivorcx >= 378) & (resp9.cmdivorcx <= 1340)), 563
Out[88]:
In [89]:
sum((resp9.cmdivorcx >= 1341) & (resp9.cmdivorcx <= 1352)), 56
Out[89]:
In [90]:
sum((resp9.cmdivorcx >= 1353) & (resp9.cmdivorcx <= 1364)), 48
Out[90]:
In [91]:
sum((resp9.cmdivorcx >= 1365) & (resp9.cmdivorcx <= 1376)), 46
Out[91]:
In [92]:
sum((resp9.cmdivorcx >= 1377) & (resp9.cmdivorcx <= 1389)), 19
Out[92]:
In [93]:
resp9.marend01.value_counts().sort_index()
Out[93]:
In [94]:
sum(resp9.divorced), sum(resp9.loc[resp9.divorced, 'mar1diss'].isnull())
Out[94]:
In [95]:
sum(resp9.separated), sum(resp9.loc[resp9.separated, 'mar1diss'].isnull())
Out[95]:
In [96]:
sum(resp9.widowed), sum(resp9.loc[resp9.widowed, 'mar1diss'].isnull())
Out[96]:
In [97]:
sum(resp9.stillma), sum(resp9.loc[resp9.stillma, 'mar1diss'].isnull())
Out[97]:
In [98]:
sum(resp9.evrmarry), 756+169+28+1448
Out[98]:
Validating the 2017 data for marriage analysis:
In [99]:
resp10 = marriage.ReadFemResp2017()
marriage.Validate2017(resp10)
In [100]:
resp10.shape
Out[100]:
In [101]:
sum(resp10.evrmarry)
Out[101]:
In [102]:
resp10.agemarry.value_counts().max()
Out[102]:
In [103]:
resp10.head()
Out[103]:
In [104]:
thinkplot.Cdf(Cdf(resp10.age))
thinkplot.Config(xlabel='Age', ylabel='CDF')
In [105]:
thinkplot.Cdf(Cdf(resp10.agemarry))
thinkplot.Config(xlabel='Age at first marriage', ylabel='CDF')
In [106]:
thinkplot.Cdf(Cdf(resp10.finalwgt))
thinkplot.Config(xlabel='Sampling weight', ylabel='CDF')
Make a list of DataFrames, one for each cycle:
In [107]:
# Ordered from the most recent cycle (resp10, 2017) back to the earliest
# (resp3, 1982).
resps = [resp10, resp9, resp8, resp7, resp6, resp5, resp4, resp3]
# For a quick experiment on a single cycle, use: resps = [resp10]
Make a table showing the number of respondents in each cycle:
In [108]:
def SummarizeCycle(df):
    """Print one table row summarizing a survey cycle.

    The row is formatted like a LaTeX tabular line: cells are separated
    by `&` and the line ends with a double backslash. The first cell is a
    literal `#` placeholder, followed by the earliest and latest interview
    years, the number of respondents, and the earliest and latest birth
    years.

    df: DataFrame with `cmintvw` and `cmbirth` columns; each value is
        converted to a year as value / 12 + 1900 and truncated to int

    returns: None (the row is written to stdout)
    """
    # Only the extremes of each column are needed for the table.
    intvws = np.array([df.cmintvw.min(), df.cmintvw.max()]) / 12 + 1900
    births = np.array([df.cmbirth.min(), df.cmbirth.max()]) / 12 + 1900
    print('# & ', intvws.astype(int), '&', len(df), '&', births.astype(int), r'\\')
# `resps` lists the most recent cycle first, so iterate in reverse to print
# the rows from the earliest cycle to the latest.
for resp in reversed(resps):
    SummarizeCycle(resp)
Check for missing values in `agemarry` among ever-married respondents:
In [109]:
def CheckAgeVars(df):
    """Print how many ever-married respondents have a null `agemarry`.

    df: DataFrame with an `evrmarry` column (used as a boolean row mask)
        and an `agemarry` column

    returns: None (the count is written to stdout)
    """
    married_ages = df.loc[df.evrmarry, 'agemarry']
    # Series.sum() counts the nulls in one vectorized pass; the builtin
    # sum() would walk the Series element by element in Python.
    print(int(married_ages.isnull().sum()))
# Print the missing-`agemarry` count once per cycle, in the order of `resps`.
for resp in resps:
    CheckAgeVars(resp)
Combine the DataFrames (but remember that this is not resampled properly):
In [134]:
df = pd.concat(resps, ignore_index=True, sort=False)
len(df)
Out[134]:
Double check missing data:
In [135]:
sum(df.missing)
Out[135]:
Generate a table with the number of respondents in each cohort:
In [136]:
# DigitizeResp is expected to add the `*_index` columns used below
# (defined in marriage.py -- confirm there).
marriage.DigitizeResp(df)
grouped = df.groupby('birth_index')

# One `&`-separated row per birth cohort: cohort, respondents, age range,
# number ever married, number flagged missing.
# NOTE(review): the age range takes its lower bound from `age` but its upper
# bound from `age_index` -- confirm that this mix is intended.
for name, group in grouped:
    row = [
        name, '&', len(group),
        '&', int(group.age.min()), '--', int(group.age_index.max()),
        '&', len(group[group.evrmarry]),
        '&', sum(group.missing),
        r'\\',
    ]
    print(*row)
In [138]:
# Pass the HDF key by keyword: positional arguments after the path are
# deprecated in pandas 2.1+ and become keyword-only in pandas 3.0.
df.to_hdf('FemMarriageData.hdf', key='FemMarriageData')
In [139]:
%time nsfg_female = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')
In [8]:
male2017 = marriage.ReadMaleResp2017()
male2017.head()
Out[8]:
In [3]:
male2017.agemarry.describe()
Out[3]:
In [4]:
male2017.age.describe()
Out[4]:
In [5]:
male2017.missing.describe()
Out[5]:
In [6]:
male2017.year.describe()
Out[6]:
In [ ]:
In [ ]:
In [1]:
male2002 = marriage.ReadMaleResp2002()
male2002.head()
Out[1]:
In [2]:
sum(male2002.divorced) + sum(male2002.separated) + sum(male2002.widowed)
Out[2]:
In [3]:
sum(male2002.evrmarry)
Out[3]:
In [ ]:
In [ ]:
In [4]:
male2010 = marriage.ReadMaleResp2010()
male2010.head()
Out[4]:
In [5]:
male2013 = marriage.ReadMaleResp2013()
male2013.head()
Out[5]:
In [6]:
male2015 = marriage.ReadMaleResp2015()
male2015.head()
Out[6]:
In [10]:
males = [male2002, male2010, male2013, male2015, male2017]
df2 = pd.concat(males, ignore_index=True, sort=False)
len(df2)
Out[10]:
In [11]:
sum(df2.missing)
Out[11]:
In [12]:
# DigitizeResp is expected to add the `*_index` columns used below
# (defined in marriage.py -- confirm there).
marriage.DigitizeResp(df2)
grouped = df2.groupby('birth_index')

# One `&`-separated row per birth cohort: cohort, respondents, age range,
# number ever married, number flagged missing.
# NOTE(review): the age range takes its lower bound from `age` but its upper
# bound from `age_index` -- confirm that this mix is intended.
for name, group in grouped:
    row = [
        name, '&', len(group),
        '&', int(group.age.min()), '--', int(group.age_index.max()),
        '&', len(group[group.evrmarry]),
        '&', sum(group.missing),
        r'\\',
    ]
    print(*row)
In [13]:
# "complete" rows are the ever-married respondents, measured by
# agemarry_index; all other rows are "ongoing", measured by age_index.
df2['complete'] = df2['evrmarry']
df2['complete_var'] = df2['agemarry_index']
df2['ongoing_var'] = df2['age_index']

# Flag the rows whose relevant variable is null.
df2['complete_missing'] = df2['complete'] & df2['complete_var'].isna()
df2['ongoing_missing'] = (~df2['complete']) & df2['ongoing_var'].isna()
In [14]:
print(sum(df2.complete_missing), sum(df2.ongoing_missing))
In [15]:
# Pass the HDF key by keyword: positional arguments after the path are
# deprecated in pandas 2.1+ and become keyword-only in pandas 3.0.
df2.to_hdf('MaleMarriageData.hdf', key='MaleMarriageData')
In [16]:
%time nsfg_male = pd.read_hdf('MaleMarriageData.hdf', 'MaleMarriageData')
In [ ]: