Divorce


In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns

import math

import matplotlib.pyplot as plt
from matplotlib import pylab

from scipy.interpolate import interp1d
from scipy.misc import derivative

import thinkstats2
import thinkplot
from thinkstats2 import Cdf

import survival
import marriage

Divorce


In [ ]:
%time nsfg_female = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')

In [180]:
resp10 = marriage.ReadFemResp2017()
marriage.Validate2017(resp10)

resp9 = marriage.ReadFemResp2015()
marriage.Validate2015(resp9)

resp8 = marriage.ReadFemResp2013()
marriage.Validate2013(resp8)

resp7 = marriage.ReadFemResp2010()
marriage.Validate2010(resp7)

resp6 = marriage.ReadFemResp2002()
marriage.Validate2002(resp6)

married_resps = [resp[resp.evrmarry] for resp in [resp6, resp7, resp8, resp9, resp10]]

In [181]:
for df in married_resps:
    df['complete'] = df.divorced
    df['complete_var'] = df.mar1diss / 12
    df['ongoing_var'] = df.mar1diss / 12
    df['complete_missing'] = df.complete & df.complete_var.isnull()
    df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
    print(sum(df.complete_missing), sum(df.ongoing_missing))
    
    # combine the 90s and 80s cohorts
    # df.loc[df.birth_index==90, 'birth_index'] = 80


/home/downey/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/home/downey/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/home/downey/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
/home/downey/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
/home/downey/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
0 0
0 0
0 0
0 0
0 0

In [183]:
df = pd.concat(married_resps, ignore_index=True, sort=False)
len(df)


Out[183]:
17095

In [184]:
complete = df.loc[df.complete, 'complete_var']
complete.describe()


Out[184]:
count    5135.000000
mean        4.138526
std         4.185355
min         0.000000
25%         0.708333
50%         2.916667
75%         6.166667
max        26.750000
Name: complete_var, dtype: float64

In [185]:
ongoing = df.loc[~df.complete, 'complete_var']
ongoing.describe()


Out[185]:
count    11960.000000
mean         7.169077
std          6.309525
min          0.000000
25%          1.500000
50%          5.750000
75%         11.583333
max         29.000000
Name: complete_var, dtype: float64

In [186]:
thinkplot.Cdf(thinkstats2.Cdf(complete))
thinkplot.Cdf(thinkstats2.Cdf(ongoing))
thinkplot.Config(xlabel='Duration of marriage (years)',
                 ylabel='CDF')



In [187]:
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)


0	17095	50	45	0.0029
0.0833	17000	172	198	0.01
0.167	16630	169	183	0.01
0.25	16278	122	182	0.0075
0.333	15974	117	177	0.0073
0.417	15680	501	1381	0.032
0.5	13798	60	70	0.0043
0.583	13668	53	76	0.0039
0.667	13539	40	54	0.003
0.75	13445	53	60	0.0039
0.833	13332	49	67	0.0037
0.917	13216	59	90	0.0045
1	13067	75	58	0.0057
1.08	12934	54	75	0.0042
1.17	12805	54	62	0.0042
1.25	12689	62	65	0.0049
1.33	12562	52	63	0.0041
1.42	12447	41	70	0.0033
1.5	12336	40	49	0.0032
1.58	12247	48	63	0.0039
1.67	12136	46	74	0.0038
1.75	12016	46	55	0.0038
1.83	11915	53	52	0.0044
1.92	11810	38	63	0.0032
2	11709	57	77	0.0049
2.08	11575	41	80	0.0035
2.17	11454	35	70	0.0031
2.25	11349	40	69	0.0035
2.33	11240	49	86	0.0044
2.42	11105	54	64	0.0049
2.5	10987	37	66	0.0034
2.58	10884	43	69	0.004
2.67	10772	42	45	0.0039
2.75	10685	40	68	0.0037
2.83	10577	43	54	0.0041
2.92	10480	52	57	0.005
3	10371	44	72	0.0042
3.08	10255	48	58	0.0047
3.17	10149	34	73	0.0034
3.25	10042	45	55	0.0045
3.33	9942	46	63	0.0046
3.42	9833	31	65	0.0032
3.5	9737	25	54	0.0026
3.58	9658	32	67	0.0033
3.67	9559	43	59	0.0045
3.75	9457	40	58	0.0042
3.83	9359	37	57	0.004
3.92	9265	49	73	0.0053
4	9143	47	46	0.0051
4.08	9050	40	51	0.0044
4.17	8959	27	54	0.003
4.25	8878	37	49	0.0042
4.33	8792	29	54	0.0033
4.42	8709	30	78	0.0034
4.5	8601	20	46	0.0023
4.58	8535	38	53	0.0045
4.67	8444	30	47	0.0036
4.75	8367	34	49	0.0041
4.83	8284	39	53	0.0047
4.92	8192	29	52	0.0035
5	8111	26	54	0.0032
5.08	8031	34	51	0.0042
5.17	7946	34	37	0.0043
5.25	7875	22	58	0.0028
5.33	7795	32	52	0.0041
5.42	7711	23	52	0.003
5.5	7636	33	49	0.0043
5.58	7554	25	48	0.0033
5.67	7481	24	41	0.0032
5.75	7416	25	48	0.0034
5.83	7343	22	52	0.003
5.92	7269	26	55	0.0036
6	7188	29	50	0.004
6.08	7109	28	64	0.0039
6.17	7017	24	54	0.0034
6.25	6939	29	41	0.0042
6.33	6869	30	55	0.0044
6.42	6784	16	41	0.0024
6.5	6727	24	47	0.0036
6.58	6656	26	39	0.0039
6.67	6591	22	45	0.0033
6.75	6524	20	42	0.0031
6.83	6462	27	58	0.0042
6.92	6377	25	41	0.0039
7	6311	29	43	0.0046
7.08	6239	19	48	0.003
7.17	6172	28	40	0.0045
7.25	6104	22	47	0.0036
7.33	6035	18	37	0.003
7.42	5980	25	54	0.0042
7.5	5901	13	44	0.0022
7.58	5844	17	45	0.0029
7.67	5782	14	43	0.0024
7.75	5725	18	51	0.0031
7.83	5656	8	44	0.0014
7.92	5604	8	41	0.0014
8	5555	24	54	0.0043
8.08	5477	11	41	0.002
8.17	5425	15	46	0.0028
8.25	5364	19	44	0.0035
8.33	5301	16	44	0.003
8.42	5241	16	54	0.0031
8.5	5171	6	43	0.0012
8.58	5122	7	35	0.0014
8.67	5080	6	31	0.0012
8.75	5043	18	49	0.0036
8.83	4976	12	33	0.0024
8.92	4931	19	44	0.0039
9	4868	15	53	0.0031
9.08	4800	12	46	0.0025
9.17	4742	11	36	0.0023
9.25	4695	15	36	0.0032
9.33	4644	16	35	0.0034
9.42	4593	11	49	0.0024
9.5	4533	12	28	0.0026
9.58	4493	13	43	0.0029
9.67	4437	15	47	0.0034
9.75	4375	11	45	0.0025
9.83	4319	6	43	0.0014
9.92	4270	8	44	0.0019
10	4218	11	33	0.0026
10.1	4174	15	51	0.0036
10.2	4108	7	46	0.0017
10.2	4055	9	37	0.0022
10.3	4009	6	42	0.0015
10.4	3961	10	38	0.0025
10.5	3913	14	41	0.0036
10.6	3858	4	33	0.001
10.7	3821	9	34	0.0024
10.8	3778	14	38	0.0037
10.8	3726	7	33	0.0019
10.9	3686	3	39	0.00081
11	3644	16	40	0.0044
11.1	3588	7	37	0.002
11.2	3544	4	39	0.0011
11.2	3501	9	30	0.0026
11.3	3462	10	33	0.0029
11.4	3419	5	28	0.0015
11.5	3386	2	35	0.00059
11.6	3349	5	45	0.0015
11.7	3299	8	29	0.0024
11.8	3262	6	35	0.0018
11.8	3221	6	33	0.0019
11.9	3182	12	46	0.0038
12	3124	7	29	0.0022
12.1	3088	6	44	0.0019
12.2	3038	6	39	0.002
12.2	2993	1	34	0.00033
12.3	2958	6	31	0.002
12.4	2921	7	37	0.0024
12.5	2877	3	48	0.001
12.6	2826	6	36	0.0021
12.7	2784	8	24	0.0029
12.8	2752	3	42	0.0011
12.8	2707	5	26	0.0018
12.9	2676	7	34	0.0026
13	2635	4	37	0.0015
13.1	2594	3	53	0.0012
13.2	2538	5	23	0.002
13.2	2510	8	34	0.0032
13.3	2468	7	46	0.0028
13.4	2415	4	39	0.0017
13.5	2372	3	24	0.0013
13.6	2345	6	37	0.0026
13.7	2302	7	29	0.003
13.8	2266	7	33	0.0031
13.8	2226	5	33	0.0022
13.9	2188	5	33	0.0023
14	2150	5	29	0.0023
14.1	2116	7	25	0.0033
14.2	2084	0	26	0
14.2	2058	6	28	0.0029
14.3	2024	4	27	0.002
14.4	1993	3	29	0.0015
14.5	1961	3	27	0.0015
14.6	1931	7	33	0.0036
14.7	1891	5	36	0.0026
14.8	1850	4	28	0.0022
14.8	1818	3	27	0.0017
14.9	1788	3	30	0.0017
15	1755	1	19	0.00057
15.1	1735	0	31	0
15.2	1704	4	30	0.0023
15.2	1670	4	27	0.0024
15.3	1639	4	25	0.0024
15.4	1610	3	26	0.0019
15.5	1581	1	28	0.00063
15.6	1552	1	29	0.00064
15.7	1522	3	28	0.002
15.8	1491	2	36	0.0013
15.8	1453	2	23	0.0014
15.9	1428	3	26	0.0021
16	1399	7	20	0.005
16.1	1372	1	22	0.00073
16.2	1349	2	24	0.0015
16.2	1323	6	19	0.0045
16.3	1298	2	23	0.0015
16.4	1273	1	20	0.00079
16.5	1252	2	15	0.0016
16.6	1235	2	16	0.0016
16.7	1217	1	27	0.00082
16.8	1189	2	30	0.0017
16.8	1157	2	23	0.0017
16.9	1132	0	29	0
17	1103	3	15	0.0027
17.1	1085	4	21	0.0037
17.2	1060	3	15	0.0028
17.2	1042	2	16	0.0019
17.3	1024	1	11	0.00098
17.4	1012	4	12	0.004
17.5	996	4	20	0.004
17.6	972	1	14	0.001
17.7	957	1	21	0.001
17.8	935	0	13	0
17.8	922	2	17	0.0022
17.9	903	2	20	0.0022
18	881	3	11	0.0034
18.1	867	4	19	0.0046
18.2	844	3	12	0.0036
18.2	829	1	10	0.0012
18.3	818	1	16	0.0012
18.4	801	0	18	0
18.5	783	2	14	0.0026
18.6	767	1	11	0.0013
18.7	755	0	17	0
18.8	738	0	9	0
18.8	729	0	10	0
18.9	719	0	15	0
19	704	1	10	0.0014
19.1	693	0	18	0
19.2	675	0	15	0
19.2	660	2	10	0.003
19.3	648	0	10	0
19.4	638	0	12	0
19.5	626	3	11	0.0048
19.6	612	3	6	0.0049
19.7	603	2	22	0.0033
19.8	579	1	14	0.0017
19.8	564	0	11	0
19.9	553	1	11	0.0018
20	541	0	13	0
20.1	528	1	15	0.0019
20.2	512	4	12	0.0078
20.2	496	1	14	0.002
20.3	481	0	13	0
20.4	468	0	8	0
20.5	460	0	12	0
20.6	448	1	10	0.0022
20.7	437	0	8	0
20.8	429	0	6	0
20.8	423	1	10	0.0024
20.9	412	1	12	0.0024
21	399	2	17	0.005
21.1	380	0	14	0
21.2	366	1	11	0.0027
21.2	354	1	10	0.0028
21.3	343	0	14	0
21.4	329	2	8	0.0061
21.5	319	1	7	0.0031
21.6	311	0	4	0
21.7	307	0	14	0
21.8	293	1	7	0.0034
21.8	285	0	12	0
21.9	273	0	13	0
22	260	0	9	0
22.1	251	0	2	0
22.2	249	1	2	0.004
22.2	246	0	6	0
22.3	240	0	2	0
22.4	238	2	4	0.0084
22.5	232	0	9	0
22.6	223	0	6	0
22.7	217	0	5	0
22.8	212	0	4	0
22.8	208	0	7	0
22.9	201	0	6	0
23	195	0	8	0
23.1	187	0	4	0
23.2	183	0	4	0
23.2	179	1	6	0.0056
23.3	172	0	6	0
23.4	166	1	5	0.006
23.5	160	0	9	0
23.6	151	0	7	0
23.7	144	0	3	0
23.8	141	0	1	0
23.8	140	0	4	0
23.9	136	1	8	0.0074
24	127	0	3	0
24.1	124	0	6	0
24.2	118	0	2	0
24.2	116	1	5	0.0086
24.3	110	0	6	0
24.4	104	0	5	0
24.5	99	0	1	0
24.6	98	0	4	0
24.7	94	0	3	0
24.8	91	0	3	0
24.8	88	0	1	0
24.9	87	0	4	0
25	83	0	4	0
25.1	79	0	3	0
25.2	76	0	3	0
25.2	73	0	5	0
25.3	68	0	3	0
25.4	65	0	3	0
25.5	62	0	3	0
25.6	59	0	1	0
25.7	58	0	2	0
25.8	56	0	2	0
25.9	54	0	6	0
26	48	1	3	0.021
26.1	44	0	5	0
26.2	39	0	4	0
26.2	35	0	4	0
26.3	31	0	4	0
26.4	27	0	2	0
26.5	25	0	1	0
26.8	24	1	2	0.042
26.8	21	0	1	0
26.9	20	0	1	0
27	19	0	2	0
27.1	17	0	2	0
27.2	15	0	2	0
27.2	13	0	2	0
27.3	11	0	2	0
27.6	9	0	1	0
27.8	8	0	1	0
28.1	7	0	1	0
28.2	6	0	1	0
28.3	5	0	1	0
28.6	4	0	1	0
28.7	3	0	1	0
28.9	2	0	1	0
29	1	0	1	0

In [188]:
hf, sf = marriage.EstimateSurvival(df)

In [189]:
thinkplot.Plot(hf)



In [190]:
thinkplot.Plot(sf)



In [192]:
#from lifelines import KaplanMeierFitter

#kmf = KaplanMeierFitter()
#kmf.fit(df.complete_var, event_observed=df.complete)
#kmf.survival_function_.plot(color='red')
#thinkplot.Config(xlim=[0, 30])

In [193]:
colors = sns.color_palette("colorblind", 5)
cohorts = [80, 70, 60, 50, 90]
colormap = dict(zip(cohorts, colors))

In [194]:
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
    hf, sf = marriage.EstimateSurvival(group)
    thinkplot.Plot(sf, label=name, color=colormap[name])

thinkplot.Config(title='Women in the U.S.',
                 xlabel='Duration of marriage (years)',
                 ylabel='Fraction still married',
                 xlim=[0, 30], ylim=[0.3, 1])



In [195]:
last = grouped.get_group(80)
complete = last[last.complete].complete_var
ongoing = last[~last.complete].ongoing_var
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)


0	4208	9	17	0.0021
0.0833	4182	48	76	0.011
0.167	4058	56	92	0.014
0.25	3910	35	98	0.009
0.333	3777	38	92	0.01
0.417	3647	116	576	0.032
0.5	2955	13	33	0.0044
0.583	2909	11	36	0.0038
0.667	2862	10	30	0.0035
0.75	2822	11	29	0.0039
0.833	2782	9	27	0.0032
0.917	2746	18	37	0.0066
1	2691	19	28	0.0071
1.08	2644	11	34	0.0042
1.17	2599	9	31	0.0035
1.25	2559	11	35	0.0043
1.33	2513	8	30	0.0032
1.42	2475	10	29	0.004
1.5	2436	9	24	0.0037
1.58	2403	12	34	0.005
1.67	2357	12	40	0.0051
1.75	2305	10	26	0.0043
1.83	2269	11	16	0.0048
1.92	2242	6	29	0.0027
2	2207	11	33	0.005
2.08	2163	7	32	0.0032
2.17	2124	7	30	0.0033
2.25	2087	6	30	0.0029
2.33	2051	10	43	0.0049
2.42	1998	9	25	0.0045
2.5	1964	7	24	0.0036
2.58	1933	9	34	0.0047
2.67	1890	10	20	0.0053
2.75	1860	7	31	0.0038
2.83	1822	8	20	0.0044
2.92	1794	8	28	0.0045
3	1758	6	34	0.0034
3.08	1718	11	21	0.0064
3.17	1686	5	31	0.003
3.25	1650	11	28	0.0067
3.33	1611	12	24	0.0074
3.42	1575	7	26	0.0044
3.5	1542	8	22	0.0052
3.58	1512	4	26	0.0026
3.67	1482	9	30	0.0061
3.75	1443	6	26	0.0042
3.83	1411	7	27	0.005
3.92	1377	8	32	0.0058
4	1337	5	17	0.0037
4.08	1315	8	14	0.0061
4.17	1293	6	25	0.0046
4.25	1262	3	28	0.0024
4.33	1231	5	23	0.0041
4.42	1203	5	26	0.0042
4.5	1172	4	13	0.0034
4.58	1155	5	21	0.0043
4.67	1129	3	17	0.0027
4.75	1109	9	16	0.0081
4.83	1084	5	23	0.0046
4.92	1056	5	22	0.0047
5	1029	3	16	0.0029
5.08	1010	8	27	0.0079
5.17	975	10	13	0.01
5.25	952	3	21	0.0032
5.33	928	4	20	0.0043
5.42	904	4	20	0.0044
5.5	880	3	18	0.0034
5.58	859	2	17	0.0023
5.67	840	2	18	0.0024
5.75	820	2	15	0.0024
5.83	803	3	13	0.0037
5.92	787	1	17	0.0013
6	769	2	13	0.0026
6.08	754	3	18	0.004
6.17	733	6	15	0.0082
6.25	712	2	22	0.0028
6.33	688	7	16	0.01
6.42	665	2	13	0.003
6.5	650	3	9	0.0046
6.58	638	0	8	0
6.67	630	1	10	0.0016
6.75	619	0	15	0
6.83	604	4	13	0.0066
6.92	587	2	7	0.0034
7	578	2	10	0.0035
7.08	566	2	10	0.0035
7.17	554	8	15	0.014
7.25	531	1	10	0.0019
7.33	520	0	8	0
7.42	512	3	17	0.0059
7.5	492	0	13	0
7.58	479	3	14	0.0063
7.67	462	1	7	0.0022
7.75	454	2	15	0.0044
7.83	437	0	5	0
7.92	432	2	12	0.0046
8	418	1	11	0.0024
8.08	406	1	9	0.0025
8.17	396	3	9	0.0076
8.25	384	1	8	0.0026
8.33	375	0	12	0
8.42	363	0	12	0
8.5	351	0	10	0
8.58	341	1	3	0.0029
8.67	337	0	3	0
8.75	334	1	5	0.003
8.83	328	0	4	0
8.92	324	1	5	0.0031
9	318	1	9	0.0031
9.08	308	1	9	0.0032
9.17	298	0	6	0
9.25	292	0	6	0
9.33	286	2	6	0.007
9.42	278	0	7	0
9.5	271	0	6	0
9.58	265	0	9	0
9.67	256	1	8	0.0039
9.75	247	1	11	0.004
9.83	235	1	7	0.0043
9.92	227	1	10	0.0044
10	216	0	4	0
10.1	212	0	8	0
10.2	204	1	4	0.0049
10.2	199	0	5	0
10.3	194	0	9	0
10.4	185	0	8	0
10.5	177	0	4	0
10.6	173	0	3	0
10.7	170	0	4	0
10.8	166	1	6	0.006
10.8	159	0	4	0
10.9	155	0	7	0
11	148	2	8	0.014
11.1	138	0	6	0
11.2	132	0	4	0
11.2	128	0	2	0
11.3	126	1	2	0.0079
11.4	123	0	2	0
11.5	121	0	4	0
11.6	117	0	4	0
11.7	113	0	5	0
11.8	108	0	5	0
11.8	103	0	5	0
11.9	98	1	1	0.01
12	96	0	5	0
12.1	91	0	4	0
12.2	87	0	2	0
12.2	85	0	2	0
12.3	83	1	6	0.012
12.4	76	2	1	0.026
12.5	73	0	5	0
12.6	68	0	4	0
12.7	64	0	1	0
12.8	63	0	4	0
12.8	59	0	2	0
12.9	57	0	3	0
13	54	0	2	0
13.1	52	0	3	0
13.2	49	0	2	0
13.3	47	0	3	0
13.4	44	0	3	0
13.5	41	0	1	0
13.7	40	0	4	0
13.8	36	0	2	0
13.8	34	0	1	0
13.9	33	0	4	0
14.1	29	1	1	0.034
14.2	27	0	1	0
14.2	26	0	2	0
14.3	24	0	1	0
14.4	23	0	2	0
14.5	21	0	1	0
14.6	20	0	3	0
14.7	17	0	1	0
14.9	16	0	1	0
15	15	0	1	0
15.2	14	0	2	0
15.2	12	0	1	0
15.4	11	0	1	0
15.5	10	0	1	0
15.8	9	0	2	0
16.1	7	0	1	0
16.7	6	0	2	0
16.8	4	0	1	0
16.9	3	0	1	0
17	2	0	1	0
18.8	1	0	1	0

So far:

  1. Doesn't take into account sampling weights.

  2. Doesn't take into account agemarry.

  3. Vulnerable to small errors in tail.


In [196]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=101)


CPU times: user 6.19 s, sys: 7.9 ms, total: 6.19 s
Wall time: 6.19 s

In [197]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Women in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)



In [198]:
for name, group in iter(grouped):
    cdf = thinkstats2.Cdf(group.agemarry)
    thinkplot.Cdf(cdf, label=name, color=colormap[name])



In [199]:
name = 80
last = grouped.get_group(name)
thinkplot.Cdf(thinkstats2.Cdf(last.agemarry), label=name, color=colormap[name])

for name, group in iter(grouped):
    if name != '80':
        matched = marriage.PropensityMatch(last, group, colname='agemarry')
        thinkplot.Cdf(thinkstats2.Cdf(matched.agemarry), label=name, color=colormap[name])


/home/downey/anaconda3/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:876: RuntimeWarning: invalid value encountered in greater_equal
  return (self.a <= x) & (x <= self.b)
/home/downey/anaconda3/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:876: RuntimeWarning: invalid value encountered in less_equal
  return (self.a <= x) & (x <= self.b)
/home/downey/MarriageNSFG/marriage.py:113: RuntimeWarning: invalid value encountered in less
  return np.random.choice(group.index, 1, p=weights)[0]

In [ ]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=21, prop_match=80)

In [ ]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Women in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=21, error_rate=0.1)

In [ ]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Women in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=21, prop_match=80)

In [ ]:
marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Women in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
def MakeTable(sf_map, ages):
    t = []
    for name, sf_seq in sorted(sf_map.items()):
        ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
        ss = ss[0]
        vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
        t.append((name, vals))
    return t

In [ ]:
def MakePercentageTable(sf_map, ages=[6, 16, 26]):
    t = MakeTable(sf_map, ages)
    for name, sf_seq in sorted(sf_map.items()):
        ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
        ss = ss[0]
        vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
        print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
        
MakePercentageTable(sf_map)

Male respondents


In [ ]:
male2010 = marriage.ReadMaleResp2010()
male2010.head()

In [ ]:
male2013 = marriage.ReadMaleResp2013()
male2013.head()

In [ ]:
male2015 = marriage.ReadMaleResp2015()
male2015.head()

In [ ]:
males = [male2010, male2013, male2015]
df2 = pd.concat(males, ignore_index=True)
len(df2)

In [ ]:


In [ ]:
married_males = [resp[resp.evrmarry] for resp in [male2010, male2013, male2015]]

In [ ]:
for df in married_males:
    df['complete'] = df.divorced
    df['complete_var'] = df.mar1diss / 12
    df['ongoing_var'] = df.mar1diss / 12
    df['complete_missing'] = df.complete & df.complete_var.isnull()
    df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
    print(sum(df.complete_missing), sum(df.ongoing_missing))
    
    # combine the 90s and 80s cohorts
    # df.loc[df.birth_index==90, 'birth_index'] = 80

In [ ]:
df = pd.concat(married_males, ignore_index=True)
len(df)

In [ ]:
complete = df.loc[df.complete, 'complete_var']
complete.describe()

In [ ]:
ongoing = df.loc[~df.complete, 'complete_var']
ongoing.describe()

In [ ]:
thinkplot.Cdf(thinkstats2.Cdf(complete))
thinkplot.Cdf(thinkstats2.Cdf(ongoing))
thinkplot.Config(xlabel='Duration of marriage (years)',
                 ylabel='CDF')

In [ ]:
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)

In [ ]:
hf, sf = marriage.EstimateSurvival(df)

In [ ]:
thinkplot.Plot(hf)

In [ ]:
thinkplot.Plot(sf)

In [ ]:
from lifelines import KaplanMeierFitter


kmf = KaplanMeierFitter()
kmf.fit(df.complete_var, event_observed=df.complete)
kmf.survival_function_.plot(color='red')
thinkplot.Config(xlim=[0, 30])

In [ ]:
colors = sns.color_palette("colorblind", 5)
cohorts = [80, 70, 60, 50, 90]
colormap = dict(zip(cohorts, colors))

In [ ]:
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
    hf, sf = marriage.EstimateSurvival(group)
    thinkplot.Plot(sf, label=name, color=colormap[name])

thinkplot.Config(title='Women in the U.S.',
                 xlabel='Duration of marriage (years)',
                 ylabel='Fraction still married',
                 xlim=[0, 30], ylim=[0.3, 1])

In [ ]:
last = grouped.get_group(80)
complete = last[last.complete].complete_var
ongoing = last[~last.complete].ongoing_var
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)

So far:

  1. Doesn't take into account sampling weights.

  2. Doesn't take into account agemarry.

  3. Vulnerable to small errors in tail.


In [ ]:
reload(marriage)

%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=101)

In [ ]:
reload(marriage)

marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Men in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
for name, group in iter(grouped):
    cdf = thinkstats2.Cdf(group.agemarry)
    thinkplot.Cdf(cdf, label=name, color=colormap[name])

In [ ]:
name = 80
last = grouped.get_group(name)
thinkplot.Cdf(thinkstats2.Cdf(last.agemarry), label=name, color=colormap[name])

for name, group in iter(grouped):
    if name != '80':
        matched = marriage.PropensityMatch(last, group, colname='agemarry')
        thinkplot.Cdf(thinkstats2.Cdf(matched.agemarry), label=name, color=colormap[name])

In [ ]:
reload(marriage)

%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=21, prop_match=80)

In [ ]:
reload(marriage)

marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Men in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
reload(marriage)

%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=21, error_rate=0.1)

In [ ]:
reload(marriage)

marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Men in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
reload(marriage)

%time sf_map = marriage.EstimateSurvivalByCohort(married_males, iters=21, prop_match=80)

In [ ]:
reload(marriage)

marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Men in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
def MakeTable(sf_map, ages):
    t = []
    for name, sf_seq in sorted(sf_map.items()):
        ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
        ss = ss[0]
        vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
        t.append((name, vals))
    return t

In [ ]:
def MakePercentageTable(sf_map, ages=[6, 16, 26]):
    t = MakeTable(sf_map, ages)
    for name, sf_seq in sorted(sf_map.items()):
        ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
        ss = ss[0]
        vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
        print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
        
MakePercentageTable(sf_map)

All adults


In [ ]:
resps = [resp6, resp7, resp8, resp9, male2010, male2013, male2015]

In [ ]:
married_resps = [resp[resp.evrmarry] for resp in resps]

In [ ]:
for df in married_resps:
    df['complete'] = df.divorced
    df['complete_var'] = df.mar1diss / 12
    df['ongoing_var'] = df.mar1diss / 12
    df['complete_missing'] = df.complete & df.complete_var.isnull()
    df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
    print(len(df), sum(df.complete_missing), sum(df.ongoing_missing))
    
    # combine the 90s and 80s cohorts
    # df.loc[df.birth_index==90, 'birth_index'] = 80

In [ ]:
df = pd.concat(married_resps, ignore_index=True)
len(df)

In [ ]:
complete = df.loc[df.complete, 'complete_var']
complete.describe()

In [ ]:
ongoing = df.loc[~df.complete, 'complete_var']
ongoing.describe()

In [ ]:
thinkplot.Cdf(thinkstats2.Cdf(complete))
thinkplot.Cdf(thinkstats2.Cdf(ongoing))
thinkplot.Config(xlabel='Duration of marriage (years)',
                 ylabel='CDF')

In [ ]:
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)

In [ ]:
hf, sf = marriage.EstimateSurvival(df)

In [ ]:
thinkplot.Plot(hf)

In [ ]:
thinkplot.Plot(sf)

In [ ]:
from lifelines import KaplanMeierFitter


kmf = KaplanMeierFitter()
kmf.fit(df.complete_var, event_observed=df.complete)
kmf.survival_function_.plot(color='red')
thinkplot.Config(xlim=[0, 30])

In [ ]:
colors = sns.color_palette("colorblind", 5)
cohorts = [80, 70, 60, 50, 90]
colormap = dict(zip(cohorts, colors))

In [ ]:
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
    hf, sf = marriage.EstimateSurvival(group)
    thinkplot.Plot(sf, label=name, color=colormap[name])

thinkplot.Config(title='Adults in the U.S.',
                 xlabel='Duration of marriage (years)',
                 ylabel='Fraction still married',
                 xlim=[0, 30], ylim=[0.3, 1])

In [ ]:
last = grouped.get_group(80)
complete = last[last.complete].complete_var
ongoing = last[~last.complete].ongoing_var
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)

So far:

  1. Doesn't take into account sampling weights.

  2. Doesn't take into account agemarry.

  3. Vulnerable to small errors in tail.


In [ ]:
reload(marriage)

%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=101)

In [ ]:
reload(marriage)

marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Adults in the U.S.',
                     xlabel='Duration of marriage (years)',
                     ylabel='Fraction still married',
                     xlim=[0, 30], ylim=[0.3, 1],
                     legend=True, loc='upper right', frameon=False)

In [ ]:
for name, group in iter(grouped):
    cdf = thinkstats2.Cdf(group.agemarry)
    thinkplot.Cdf(cdf, label=name, color=colormap[name])

In [ ]:
name = 80
last = grouped.get_group(name)
thinkplot.Cdf(thinkstats2.Cdf(last.agemarry), label=name, color=colormap[name])

for name, group in iter(grouped):
    if name != '80':
        matched = marriage.PropensityMatch(last, group, colname='agemarry')
        thinkplot.Cdf(thinkstats2.Cdf(matched.agemarry), label=name, color=colormap[name])

In [ ]:
reload(marriage)

%time sf_map = marriage.EstimateSurvivalByCohort(married_resps, iters=101, prop_match=80)

In [ ]:
reload(marriage)

marriage.PlotSurvivalFunctions(sf_map, colormap=colormap)

thinkplot.Config(title='Adults in the U.S.',
                 xlabel='Duration of marriage (years)',
                 ylabel='Fraction still married',
                 xlim=[0, 30], ylim=[0.3, 1],
                 legend=True, loc='upper right', frameon=False)

thinkplot.Save('divorce1', clf=False, formats=['png'])

In [ ]:
def MakeTable(sf_map, ages):
    t = []
    for name, sf_seq in sorted(sf_map.items()):
        ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
        ss = ss[0]
        vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
        t.append((name, vals))
    return t

In [ ]:
def MakePercentageTable(sf_map, ages=[7, 16, 26]):
    t = MakeTable(sf_map, ages)
    for name, sf_seq in sorted(sf_map.items()):
        ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
        ss = ss[0]
        vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
        print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
        
MakePercentageTable(sf_map)

In [ ]:
group = grouped.get_group(80)

In [ ]:
thinkplot.Cdf(thinkstats2.Cdf(group.agemarry))

In [ ]:
group.agemarry.describe()

In [ ]:
youngmarry = group[group.agemarry <= 21]
oldmarry = group[group.agemarry > 21]

In [ ]:
len(youngmarry), len(oldmarry)

In [ ]:
hf, sf = marriage.EstimateSurvival(youngmarry)
thinkplot.Plot(sf, label='Married age <= 21')
hf, sf = marriage.EstimateSurvival(oldmarry)
thinkplot.Plot(sf, label='Married age > 21')
thinkplot.Config(title='Adults in the U.S., born in 1990s',
                 xlabel='Duration of marriage (years)',
                 ylabel='Fraction still married',
                 xlim=[0, 20], ylim=[0.3, 1],
                 legend=True, loc='upper right', frameon=False)

In [ ]:


In [ ]: