통계적 사고 (2판) 연습문제 (thinkstats2.com, think-stat.xwmooc.org)
Allen Downey / 이광춘(xwMOOC)
In [3]:
%matplotlib inline
from __future__ import print_function
import pandas
import numpy as np
import thinkplot
import thinkstats2
import survival
def CleanData(resp):
"""Cleans respondent data.
resp: DataFrame
"""
resp.cmdivorcx.replace([9998, 9999], np.nan, inplace=True)
resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0
month0 = pandas.to_datetime('1899-12-15')
dates = [month0 + pandas.DateOffset(months=cm)
for cm in resp.cmbirth]
resp['decade'] = (pandas.DatetimeIndex(dates).year - 1900) // 10
def ResampleDivorceCurve(resps):
"""Plots divorce curves based on resampled data.
resps: list of respondent DataFrames
"""
for _ in range(41):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
PlotDivorceCurveByDecade(sample, color='#225EA8', alpha=0.1)
thinkplot.Show(xlabel='years',
axis=[0, 28, 0, 1])
def ResampleDivorceCurveByDecade(resps):
"""Plots divorce curves for each birth cohort.
resps: list of respondent DataFrames
"""
for i in range(41):
samples = [thinkstats2.ResampleRowsWeighted(resp)
for resp in resps]
sample = pandas.concat(samples, ignore_index=True)
groups = sample.groupby('decade')
if i == 0:
survival.AddLabelsByDecade(groups, alpha=0.7)
EstimateSurvivalByDecade(groups, alpha=0.1)
thinkplot.Show(xlabel='years',
axis=[0, 28, 0, 1])
def EstimateSurvivalByDecade(groups, **options):
"""Groups respondents by decade and plots survival curves.
groups: GroupBy object
"""
thinkplot.PrePlot(len(groups))
for name, group in groups:
print(name, len(group))
_, sf = EstimateSurvival(group)
thinkplot.Plot(sf, **options)
def EstimateSurvival(resp):
"""Estimates the survival curve.
resp: DataFrame of respondents
returns: pair of HazardFunction, SurvivalFunction
"""
complete = resp[resp.notdivorced == 0].duration
ongoing = resp[resp.notdivorced == 1].durationsofar
hf = survival.EstimateHazardFunction(complete, ongoing)
sf = hf.MakeSurvival()
return hf, sf
def main():
resp6 = survival.ReadFemResp2002()
CleanData(resp6)
married6 = resp6[resp6.evrmarry==1]
resp7 = survival.ReadFemResp2010()
CleanData(resp7)
married7 = resp7[resp7.evrmarry==1]
ResampleDivorceCurveByDecade([married6, married7])
if __name__ == '__main__':
main()
In [ ]: