In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
from matplotlib import pylab
from scipy.interpolate import interp1d
from scipy.misc import derivative
import thinkstats2
import thinkplot
from thinkstats2 import Cdf
import survival
import marriage
In [ ]:
%time nsfg_female = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')
In [180]:
resp10 = marriage.ReadFemResp2017()
marriage.Validate2017(resp10)
resp9 = marriage.ReadFemResp2015()
marriage.Validate2015(resp9)
resp8 = marriage.ReadFemResp2013()
marriage.Validate2013(resp8)
resp7 = marriage.ReadFemResp2010()
marriage.Validate2010(resp7)
resp6 = marriage.ReadFemResp2002()
marriage.Validate2002(resp6)
married_resps = [resp[resp.evrmarry] for resp in [resp6, resp7, resp8, resp9, resp10]]
In [181]:
for df in married_resps:
df['complete'] = df.divorced
df['complete_var'] = df.mar1diss / 12
df['ongoing_var'] = df.mar1diss / 12
df['complete_missing'] = df.complete & df.complete_var.isnull()
df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
print(sum(df.complete_missing), sum(df.ongoing_missing))
# combine the 90s and 80s cohorts
# df.loc[df.birth_index==90, 'birth_index'] = 80
In [183]:
df = pd.concat(married_resps, ignore_index=True, sort=False)
len(df)
Out[183]:
In [184]:
complete = df.loc[df.complete, 'complete_var']
complete.describe()
Out[184]:
In [185]:
ongoing = df.loc[~df.complete, 'complete_var']
ongoing.describe()
Out[185]:
In [186]:
thinkplot.Cdf(thinkstats2.Cdf(complete))
thinkplot.Cdf(thinkstats2.Cdf(ongoing))
thinkplot.Config(xlabel='Duration of marriage (years)',
ylabel='CDF')
In [187]:
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
In [188]:
hf, sf = marriage.EstimateSurvival(df)
In [189]:
thinkplot.Plot(hf)
In [190]:
thinkplot.Plot(sf)
In [192]:
#from lifelines import KaplanMeierFitter
#kmf = KaplanMeierFitter()
#kmf.fit(df.complete_var, event_observed=df.complete)
#kmf.survival_function_.plot(color='red')
#thinkplot.Config(xlim=[0, 30])
In [193]:
colors = sns.color_palette("colorblind", 5)
cohorts = [80, 70, 60, 50, 90]
colormap = dict(zip(cohorts, colors))
In [194]:
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
hf, sf = marriage.EstimateSurvival(group)
thinkplot.Plot(sf, label=name, color=colormap[name])
thinkplot.Config(title='Women in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1])
In [195]:
last = grouped.get_group(80)
complete = last[last.complete].complete_var
ongoing = last[~last.complete].ongoing_var
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
So far:
Doesn't take into account sampling weights.
Doesn't take into account agemarry
.
Vulnerable to small errors in tail.
In [196]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=101)
In [197]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Women in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [198]:
for name, group in iter(grouped):
cdf = thinkstats2.Cdf(group.agemarry)
thinkplot.Cdf(cdf, label=name, color=colormap[name])
In [199]:
name = 80
last = grouped.get_group(name)
thinkplot.Cdf(thinkstats2.Cdf(last.agemarry), label=name, color=colormap[name])
for name, group in iter(grouped):
if name != '80':
matched = marriage.PropensityMatch(last, group, colname='agemarry')
thinkplot.Cdf(thinkstats2.Cdf(matched.agemarry), label=name, color=colormap[name])
In [ ]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=21, prop_match=80)
In [ ]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Women in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=21, error_rate=0.1)
In [ ]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Women in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=21, prop_match=80)
In [ ]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Women in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
def MakeTable(sf_map, ages):
t = []
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
t.append((name, vals))
return t
In [ ]:
def MakePercentageTable(sf_map, ages=[6, 16, 26]):
t = MakeTable(sf_map, ages)
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
MakePercentageTable(sf_map)
In [ ]:
male2010 = marriage.ReadMaleResp2010()
male2010.head()
In [ ]:
male2013 = marriage.ReadMaleResp2013()
male2013.head()
In [ ]:
male2015 = marriage.ReadMaleResp2015()
male2015.head()
In [ ]:
males = [male2010, male2013, male2015]
df2 = pd.concat(males, ignore_index=True)
len(df2)
In [ ]:
In [ ]:
married_males = [resp[resp.evrmarry] for resp in [male2010, male2013, male2015]]
In [ ]:
for df in married_males:
df['complete'] = df.divorced
df['complete_var'] = df.mar1diss / 12
df['ongoing_var'] = df.mar1diss / 12
df['complete_missing'] = df.complete & df.complete_var.isnull()
df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
print(sum(df.complete_missing), sum(df.ongoing_missing))
# combine the 90s and 80s cohorts
# df.loc[df.birth_index==90, 'birth_index'] = 80
In [ ]:
df = pd.concat(married_males, ignore_index=True)
len(df)
In [ ]:
complete = df.loc[df.complete, 'complete_var']
complete.describe()
In [ ]:
ongoing = df.loc[~df.complete, 'complete_var']
ongoing.describe()
In [ ]:
thinkplot.Cdf(thinkstats2.Cdf(complete))
thinkplot.Cdf(thinkstats2.Cdf(ongoing))
thinkplot.Config(xlabel='Duration of marriage (years)',
ylabel='CDF')
In [ ]:
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
In [ ]:
hf, sf = marriage.EstimateSurvival(df)
In [ ]:
thinkplot.Plot(hf)
In [ ]:
thinkplot.Plot(sf)
In [ ]:
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(df.complete_var, event_observed=df.complete)
kmf.survival_function_.plot(color='red')
thinkplot.Config(xlim=[0, 30])
In [ ]:
colors = sns.color_palette("colorblind", 5)
cohorts = [80, 70, 60, 50, 90]
colormap = dict(zip(cohorts, colors))
In [ ]:
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
hf, sf = marriage.EstimateSurvival(group)
thinkplot.Plot(sf, label=name, color=colormap[name])
thinkplot.Config(title='Women in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1])
In [ ]:
last = grouped.get_group(80)
complete = last[last.complete].complete_var
ongoing = last[~last.complete].ongoing_var
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
So far:
Doesn't take into account sampling weights.
Doesn't take into account agemarry
.
Vulnerable to small errors in tail.
In [ ]:
reload(marriage)
%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=101)
In [ ]:
reload(marriage)
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Men in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
for name, group in iter(grouped):
cdf = thinkstats2.Cdf(group.agemarry)
thinkplot.Cdf(cdf, label=name, color=colormap[name])
In [ ]:
name = 80
last = grouped.get_group(name)
thinkplot.Cdf(thinkstats2.Cdf(last.agemarry), label=name, color=colormap[name])
for name, group in iter(grouped):
if name != '80':
matched = marriage.PropensityMatch(last, group, colname='agemarry')
thinkplot.Cdf(thinkstats2.Cdf(matched.agemarry), label=name, color=colormap[name])
In [ ]:
reload(marriage)
%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=21, prop_match=80)
In [ ]:
reload(marriage)
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Men in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
reload(marriage)
%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=21, error_rate=0.1)
In [ ]:
reload(marriage)
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Men in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
reload(marriage)
%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=21, prop_match=80)
In [ ]:
reload(marriage)
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Men in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
def MakeTable(sf_map, ages):
t = []
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
t.append((name, vals))
return t
In [ ]:
def MakePercentageTable(sf_map, ages=[6, 16, 26]):
t = MakeTable(sf_map, ages)
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
MakePercentageTable(sf_map)
In [ ]:
resps = [resp6, resp7, resp8, resp9, male2010, male2013, male2015]
In [ ]:
married_resps = [resp[resp.evrmarry] for resp in resps]
In [ ]:
for df in married_resps:
df['complete'] = df.divorced
df['complete_var'] = df.mar1diss / 12
df['ongoing_var'] = df.mar1diss / 12
df['complete_missing'] = df.complete & df.complete_var.isnull()
df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
print(len(df), sum(df.complete_missing), sum(df.ongoing_missing))
# combine the 90s and 80s cohorts
# df.loc[df.birth_index==90, 'birth_index'] = 80
In [ ]:
df = pd.concat(married_resps, ignore_index=True)
len(df)
In [ ]:
complete = df.loc[df.complete, 'complete_var']
complete.describe()
In [ ]:
ongoing = df.loc[~df.complete, 'complete_var']
ongoing.describe()
In [ ]:
thinkplot.Cdf(thinkstats2.Cdf(complete))
thinkplot.Cdf(thinkstats2.Cdf(ongoing))
thinkplot.Config(xlabel='Duration of marriage (years)',
ylabel='CDF')
In [ ]:
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
In [ ]:
hf, sf = marriage.EstimateSurvival(df)
In [ ]:
thinkplot.Plot(hf)
In [ ]:
thinkplot.Plot(sf)
In [ ]:
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(df.complete_var, event_observed=df.complete)
kmf.survival_function_.plot(color='red')
thinkplot.Config(xlim=[0, 30])
In [ ]:
colors = sns.color_palette("colorblind", 5)
cohorts = [80, 70, 60, 50, 90]
colormap = dict(zip(cohorts, colors))
In [ ]:
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
hf, sf = marriage.EstimateSurvival(group)
thinkplot.Plot(sf, label=name, color=colormap[name])
thinkplot.Config(title='Adults in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1])
In [ ]:
last = grouped.get_group(80)
complete = last[last.complete].complete_var
ongoing = last[~last.complete].ongoing_var
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
So far:
Doesn't take into account sampling weights.
Doesn't take into account agemarry
.
Vulnerable to small errors in tail.
In [ ]:
reload(marriage)
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=101)
In [ ]:
reload(marriage)
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Adults in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
for name, group in iter(grouped):
cdf = thinkstats2.Cdf(group.agemarry)
thinkplot.Cdf(cdf, label=name, color=colormap[name])
In [ ]:
name = 80
last = grouped.get_group(name)
thinkplot.Cdf(thinkstats2.Cdf(last.agemarry), label=name, color=colormap[name])
for name, group in iter(grouped):
if name != '80':
matched = marriage.PropensityMatch(last, group, colname='agemarry')
thinkplot.Cdf(thinkstats2.Cdf(matched.agemarry), label=name, color=colormap[name])
In [ ]:
reload(marriage)
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=101, prop_match=80)
In [ ]:
reload(marriage)
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)
thinkplot.Config(title='Adults in the U.S.',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 30], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
thinkplot.Save('divorce1', clf=False, formats=['png'])
In [ ]:
def MakeTable(sf_map, ages):
t = []
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
t.append((name, vals))
return t
In [ ]:
def MakePercentageTable(sf_map, ages=[7, 16, 26]):
t = MakeTable(sf_map, ages)
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
MakePercentageTable(sf_map)
In [ ]:
group = grouped.get_group(80)
In [ ]:
thinkplot.Cdf(thinkstats2.Cdf(group.agemarry))
In [ ]:
group.agemarry.describe()
In [ ]:
youngmarry = group[group.agemarry <= 21]
oldmarry = group[group.agemarry > 21]
In [ ]:
len(youngmarry), len(oldmarry)
In [ ]:
hf, sf = marriage.EstimateSurvival(youngmarry)
thinkplot.Plot(sf, label='Married age <= 21')
hf, sf = marriage.EstimateSurvival(oldmarry)
thinkplot.Plot(sf, label='Married age > 21')
thinkplot.Config(title='Adults in the U.S., born in 1990s',
xlabel='Duration of marriage (years)',
ylabel='Fraction still married',
xlim=[0, 20], ylim=[0.3, 1],
legend=True, loc='upper right', frameon=False)
In [ ]:
In [ ]: