In [11]:
%matplotlib inline
import numpy as np
import pandas
import nsfg
import thinkstats2
import thinkplot
import survival
import chap01soln
Survival Curve, S(t) maps from a duration, t, to the probability of surviving longer than t.
$$ S(t) = 1-\text{CDF}(t) $$where CDF(t) is the probability of a lifetime less than or equal to t
In [2]:
preg = nsfg.ReadFemPreg()
complete = preg.query('outcome in [1,3,4]').prglngth
cdf = thinkstats2.Cdf(complete, label='cdf')
In [3]:
##note: property is a method that can be invoked as if
##it were a variable.
class SurvivalFunction(object):
def __init__(self, cdf, label=''):
self.cdf = cdf
self.label = label or cdf.label
@property
def ts(self):
"""
sequence of lifetimes
"""
return self.cdf.xs
@property
def ss(self):
"""
survival curve
"""
return 1 - self.cdf.ps
def __getitem__(self, t):
return self.Prob(t)
def Prob(self, t):
return 1 - self.cdf.Prob(t)
In [4]:
sf = survival.SurvivalFunction(cdf)
##fraction of pregs that proceed past the first trimester.
sf[13]
Out[4]:
In [7]:
thinkplot.Plot(cdf)
thinkplot.Plot(sf)
thinkplot.Show()
Hazard Function - maps from a time, t, to the fraction of pregnancies that continue until t and then end at t. Numerator is equal to PMF(t)
$$ \lambda(t) = \frac{S(t)-S(t+1)}{S(t)} $$
In [8]:
hf = sf.MakeHazard()
##of all pregnancies that proceed until week 39, hf[39] end
##in week 39
hf[39]
Out[8]:
In [9]:
thinkplot.Plot(hf)
The goal of this chapter is to use NSFG data to quantify how long respondents "survive" until they get married for the first time. The range of respondents is 14 to 44 years. For women who have been married, the date of the first marriage is known. For women not married, all we know is their age at the time of interview.
Kaplan-Meier estimation - we can use the data to estimate the hazard function then convert the hazard function to a survival curve. For each age we consider:
In [13]:
def EstimateHazardFunction(complete, ongoing, label=''):
"""
complete: set of complete observations (marriage age)
ongoing: set of incomplete observations (age of resp)
"""
n = len(complete)
hist_complete = thinkstats2.Hist(complete)
sf_complete = SurvivalFunction(thinkstats2.Cdf(complete))
m = len(ongoing)
sf_ongoing = SurvivalFunction(thinkstats2.Cdf(ongoing))
lams = {}
##at_risk measures number of resps whose outcomes are not known at t
##ended = number of respondents married at age t
##n * sf_complete[t], num respondents married after age t
##m * sf_ongoing[t], num of unmarried resps inverviewed after t
for t, ended in sorted(hist_complete.Items()):
at_risk = ended + n * sf_complete[t] + m * sf_ongoing[t]
lams[t] = ended / at_risk
return survival.HazardFunction(lams, label=label)
In [14]:
resp = chap01soln.ReadFemResp()
resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)
resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0
complete = resp[resp.evrmarry==1].agemarry
ongoing = resp[resp.evrmarry==0].age
hf = EstimateHazardFunction(complete, ongoing)
thinkplot.Plot(hf)
In [15]:
# HazardFunction:
def MakeSurvival(self):
"""
ts: sequence of times where hazard function is estimated
ss: cumulative product of comp hazard function
"""
ts = self.series.index
ss = (1 - self.series).cumprod()
cdf = thinkstats2.Cdf(ts, 1 - ss)
sf = SurvivalFunction(cdf)
return sf
In [16]:
sf = hf.MakeSurvival()
thinkplot.Plot(sf)
In [25]:
##Sampling Error:
def ResampleSurvival(resp, iters=101):
low, high = resp.agemarry.min(), resp.agemarry.max()
ts = np.arange(low, high, 1/12.0)
ss_seq = []
for i in range(iters):
sample = thinkstats2.ResampleRowsWeighted(resp)
hf, sf = survival.EstimateSurvival(sample)
ss_seq.append(sf.Probs(ts))
low, high = thinkstats2.PercentileRows(ss_seq, [5,95])
thinkplot.FillBetween(ts, low, high)
In [26]:
sf = hf.MakeSurvival()
ResampleSurvival(resp)
thinkplot.Plot(sf)
##discrepancy indicates that sample weights
##have a substantial effect
left part of the graph has data for all respondents, but right part of the graph only has the oldest respondents. If the relevant characterisitics of respondents are not changing over time, that's fine, but in this case there are probably generational shifts.
We can investigate effect by grouping respondents according to their decade of birth. These groups are called cohorts
In [31]:
month0 = pandas.to_datetime('1899-12-15')
dates = [month0 + pandas.DateOffset(months=cm)
for cm in resp.cmbirth]
resp['decade'] = (pandas.DatetimeIndex(dates).year - 1900) // 10
In [35]:
def EstimateSurvivalByDecade(groups):
for name, group, in groups:
hf, sf = survival.EstimateSurvival(group)
thinkplot.Plot(sf)
for i in range(1):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
groups = sample.groupby('decade')
EstimateSurvivalByDecade(groups)
In [36]:
survival.PlotResampledByDecade(resps)
To extrapolate, we can "borrow" data from a pervious cohort...
In [37]:
# class HazardFunction
def Extend(self, other):
last = self.series.index[-1]
more = other.series[other.series.index > last]
serlf.series = pandas.concat([self.series, more])
In [38]:
survival.PlotResampledByDecade(resps, predict_flag=True)
To plot expected remaining lifetime, we make a pmf from the survival function and then for each value of t, we cut off previous values and get the mean of the remaining values in the PMF.
memoryless - when the remaining lifetime function levels out completely, so past has no effect on the previous predictions. "Any day now." If you're still in the game now, anything can happen.
NBUE - "New better than used in expectation." Young women have decreasing remaining "lifetimes." New parts expected to last longer
UBNE - "Used better than new in expectaton" The older the part, the longer it is expected to last. Also newborns and cancer patients.
In [39]:
resp5 = survival.ReadFemResp1995()
resp6 = survival.ReadFemResp2002()
resp7 = survival.ReadFemResp2010()
resps = [resp5, resp6, resp7]
In [120]:
def EstimateDivorceSS(resp, cleanData=False):
if cleanData:
resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)
resp.cmdivorcx.replace([9998,9999], np.nan, inplace=True)
resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0
resp['durmarr'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
resp['sincemarr'] = (resp.age - resp.agemarry)
complete = resp[resp.durmarr.notnull()].durmarr
ongoing = resp[resp.durmarr.isnull()][resp.evrmarry==1].sincemarr
hf = EstimateHazardFunction(complete, ongoing)
ss = hf.MakeSurvival()
return hf, ss
In [121]:
resp = chap01soln.ReadFemResp()
hf, ss = EstimateDivorceSS(resp, cleanData=True)
thinkplot.Plot(ss)
In [125]:
def PlotConfidenceIntervalSS(resp, iters=101, func=EstimateDivorceSS):
low = 0
high = resp[resp.durmarr.isnull()].sincemarr.max()
print 'high',high
ts = np.arange(low, high, 1/12.0)
ss_seq = []
for i in range(iters):
sample = thinkstats2.ResampleRowsWeighted(resp)
sample = sample.reset_index()
hf, sf = func(sample)
ss_seq.append(sf.Probs(ts))
low, high = thinkstats2.PercentileRows(ss_seq, [5,95])
thinkplot.FillBetween(ts, low, high)
thinkplot.Plot(ss)
PlotConfidenceIntervalSS(resp)
In [126]:
resp5 = survival.ReadFemResp1995()
resp6 = survival.ReadFemResp2002()
resp7 = survival.ReadFemResp2010()
resps = [resp5, resp6, resp7]
In [147]:
def EstimateSurvivalByDecade(groups, **options):
thinkplot.PrePlot(len(groups))
for _, group in groups:
_, sf = EstimateDivorceSS(group, cleanData=True)
thinkplot.Plot(sf, **options)
def PlotPredictionsByDecade(groups, **options):
hfs = []
for _, group in groups:
hf, sf = EstimateDivorceSS(group, cleanData=True)
hfs.append(hf)
thinkplot.PrePlot(len(hfs))
for i, hf in enumerate(hfs):
if i > 0:
hf.Extend(hfs[i-1])
sf = hf.MakeSurvival()
thinkplot.Plot(sf, **options)
def PlotResampledDivorceByDecade(resps, iters=11, predict_flag=False, omit=None):
for i in range(iters):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
groups = sample.groupby('decade')
if omit:
groups = [(name, group) for name, group in groups
if name not in omit]
if i == 0:
survival.AddLabelsByDecade(groups, alpha=0.7)
if predict_flag:
EstimateSurvivalByDecade(groups, alpha=0.2)
try:
PlotPredictionsByDecade(groups, alpha=0.2)
except IndexError:
pass
else:
print "not predicting"
EstimateSurvivalByDecade(groups, alpha=0.2)
In [148]:
thinkplot.Config(title="without predictions")
PlotResampledDivorceByDecade(resps, predict_flag=False)
thinkplot.Show()
In [144]:
thinkplot.Config(title='with predictions')
PlotResampledDivorceByDecade(resps, predict_flag=True)
thinkplot.Show()
In [160]:
def CleanData2(resps):
for r in resps:
r['marrgroup'] = r.agemarry // 5
In [167]:
def marr_AddLabelsByDecade(groups, **options):
"""Draws fake points in order to add labels to the legend.
groups: GroupBy object
"""
thinkplot.PrePlot(len(groups))
for name, _ in groups:
label = '%d' % ((name + 1) * 5)
thinkplot.Plot([15], [1], label=label, **options)
def Marr_PlotResampledDivorceByDecade(resps, iters=11, predict_flag=False, omit=None):
CleanData2(resps)
for i in range(iters):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
groups = sample.groupby('marrgroup')
if omit:
groups = [(name, group) for name, group in groups
if name not in omit]
if i == 0:
marr_AddLabelsByDecade(groups, alpha=0.7)
if predict_flag:
EstimateSurvivalByDecade(groups, alpha=0.2)
try:
PlotPredictionsByDecade(groups, alpha=0.2)
except IndexError:
pass
else:
EstimateSurvivalByDecade(groups, alpha=0.2)
In [169]:
Marr_PlotResampledDivorceByDecade(resps)
thinkplot.Config(title="without prediction")
thinkplot.Show()
In [171]:
Marr_PlotResampledDivorceByDecade([resp7], omit=[0,1,2,3,4,5])
thinkplot.Config(title="without prediction")
thinkplot.Show()
In [189]:
hf, sf = EstimateDivorceSS(resp, cleanData=True)
func = lambda pmf: pmf.Percentile(50)
rem_life = sf.RemainingLifetime(func=func)
thinkplot.Plot(rem_life)
Note that this whole study suffers from the assumption that all people who are currently married will get divorced. I might have been better off using the total duration of the marriage for everybody without assuming there will be a divorce.
In [ ]: