In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
from datetime import date, datetime
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter

matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
plt.style.use('seaborn-deep')

Definition of censoring and death

Quitting is death, all else is censoring. This is different than the original article's author's rules, who stated that switching roles within a cabinent is an "event".


In [2]:
raw_df = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/cabinet-turnover/cabinet-turnover.csv",
                    na_values=['Still in office', '#VALUE!']
                    )
TODAY = datetime.today().date()

INAUG_DATES = {
    'Trump': date(2017, 1, 20),
    'Obama': date(2009, 1, 20),
    'Bush 43': date(2001, 1, 20),
    'Clinton': date(1993, 1, 20),
    'Bush 41': date(1989, 1, 20),
    'Reagan': date(1981, 1, 20),
    'Carter': date(1977, 1, 20)
}

presidential_terms = pd.DataFrame(list(INAUG_DATES.items()))
presidential_terms.columns = ['president', 'president_start_date']
presidential_terms['president_end_date'] = presidential_terms['president_start_date'].shift(1).fillna(TODAY)
presidential_terms


Out[2]:
president president_start_date president_end_date
0 Trump 2017-01-20 2019-03-05
1 Obama 2009-01-20 2017-01-20
2 Bush 43 2001-01-20 2009-01-20
3 Clinton 1993-01-20 2001-01-20
4 Bush 41 1989-01-20 1993-01-20
5 Reagan 1981-01-20 1989-01-20
6 Carter 1977-01-20 1981-01-20

In [3]:
def fill_end(series):
    end, president = series
    if pd.notnull(end) and end.endswith('admin'):
        next_pres ,_ = end.split(' ')
        if next_pres == 'Bush':
            next_pres = next_pres + ' 43' if president == 'Clinton' else next_pres + ' 41'
        return INAUG_DATES[next_pres].strftime('%m/%d/%y')
    else:
        return end
    
def fill_start(series):
    end, president = series
    if pd.notnull(end) and end.endswith('admin'):
        prev_pres ,_ = end.split(' ')
        if prev_pres == 'Bush':
            prev_pres = prev_pres + ' 43' if president == 'Obama' else prev_pres + ' 41'
        return INAUG_DATES[president].strftime('%m/%d/%y')
    else:
        return end
    
    
raw_df['end'] = raw_df[['end', 'president']].apply(fill_end, axis=1)
raw_df['start'] = raw_df[['start', 'president']].apply(fill_start, axis=1)

raw_df['end'] = pd.to_datetime(raw_df['end']).dt.date
raw_df['end'] = raw_df['end'].fillna(TODAY)
raw_df['start'] = pd.to_datetime(raw_df['start']).dt.date

In [4]:
raw_df = raw_df.merge(presidential_terms, left_on='president', right_on='president')
raw_df['event'] = (raw_df['end'] < raw_df['president_end_date']) & pd.notnull(raw_df['end'])

In [5]:
# we need to "collapse" individuals into rows, because they may change positions, but that's not quitting...
def collapse(df):
    return df.groupby('appointee', as_index=False).aggregate({
        'start': 'min', 'end': 'max', 'event': 'all', 'president': 'last', 'president_end_date': 'last'
    })

raw_df = raw_df.groupby('president', as_index=False).apply(collapse).reset_index(drop=True)
raw_df['T'] = (raw_df['end'] - raw_df['start']).dt.days

In [6]:
raw_df.tail(20)


Out[6]:
appointee start end event president president_end_date T
267 Jeff Sessions 2017-02-09 2018-11-07 True Trump 2019-03-05 636
268 Jim Mattis 2017-01-20 2018-12-31 True Trump 2019-03-05 710
269 John Kelly 2017-01-20 2018-12-31 True Trump 2019-03-05 710
270 Kirstjen Nielsen 2017-12-06 2019-03-05 False Trump 2019-03-05 454
271 Linda McMahon 2017-02-14 2019-03-05 False Trump 2019-03-05 749
272 Mick Mulvaney 2017-02-16 2019-03-05 False Trump 2019-03-05 747
273 Mike Pence 2017-01-20 2019-03-05 False Trump 2019-03-05 774
274 Mike Pompeo 2017-01-23 2019-03-05 False Trump 2019-03-05 771
275 Nikki Haley 2017-01-27 2018-12-31 True Trump 2019-03-05 703
276 Reince Priebus 2017-01-20 2017-07-28 True Trump 2019-03-05 189
277 Rex Tillerson 2017-02-01 2018-03-31 True Trump 2019-03-05 423
278 Rick Perry 2017-03-02 2019-03-05 False Trump 2019-03-05 733
279 Robert Lighthizer 2017-05-15 2019-03-05 False Trump 2019-03-05 659
280 Robert Wilkie 2018-07-30 2019-03-05 False Trump 2019-03-05 218
281 Ryan Zinke 2017-03-01 2019-01-02 True Trump 2019-03-05 672
282 Scott Pruitt 2017-02-17 2018-07-06 True Trump 2019-03-05 504
283 Sonny Perdue 2017-04-25 2019-03-05 False Trump 2019-03-05 679
284 Steve Mnuchin 2017-02-13 2019-03-05 False Trump 2019-03-05 750
285 Tom Price 2017-02-10 2017-09-29 True Trump 2019-03-05 231
286 Wilbur Ross 2017-02-28 2019-03-05 False Trump 2019-03-05 735

In [7]:
naf = NelsonAalenFitter()
ax = naf.fit(raw_df['T'],raw_df['event']).plot()

from lifelines import PiecewiseExponentialFitter
pf = PiecewiseExponentialFitter(breakpoints=[1440, 1500])
pf.fit(raw_df['T'], raw_df['event'])
pf.plot(ax=ax)
pf.print_summary(4)


<lifelines.PiecewiseExponentialFitter: fitted with 287 observations, 129 censored>
number of subjects = 287
  number of events = 158
    log-likelihood = -1310.480
        hypothesis = lambda_0_ != 1, lambda_1_ != 1, lambda_2_ != 1

---
               coef  se(coef)  lower 0.95  upper 0.95      p  -log2(p)
lambda_0_ 2872.6704  280.8217   2322.2701   3423.0708 <5e-05   79.1248
lambda_1_  154.6505   30.4338     95.0013    214.2998 <5e-05   21.1000
lambda_2_ 1101.0836  189.0864    730.4810   1471.6862 <5e-05   27.3221

In [8]:
kmf = KaplanMeierFitter()

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    kmf.fit(df_['T'], df_['event'], label=name)
    ax = kmf.plot(ax=ax, ci_show=False)



In [9]:
ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    kmf.fit(df_['T'], df_['event'], label=name)
    if name == 'Trump':
        ax = kmf.plot(ax=ax, c='r')
    else:
        ax = kmf.plot(ax=ax, c='grey', alpha=0.5, ci_show=False)



In [10]:
raw_df[['president','event', 'T']]

naf = NelsonAalenFitter()

ax = plt.subplot()

for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
    if name in ['Trump', 'Carter']:
        naf.fit(df_['T'], df_['event'], label=name)
        ax = naf.plot(ax=ax)



In [11]:
raw_df['year'] = raw_df['start'].apply(lambda d: int(d.year))
raw_df['year'] -= raw_df['year'].mean()
raw_df['year**2'] = raw_df['year']**2



regression_df = raw_df[['president', 'T', 'event', 'year', 'year**2']]
regression_df = pd.get_dummies(regression_df, columns=['president'])
del regression_df['president_Clinton']

In [12]:
cph = CoxPHFitter()
cph.fit(regression_df, 'T', 'event')
cph.print_summary(3)


<lifelines.CoxPHFitter: fitted with 287 observations, 129 censored>
      duration col = 'T'
         event col = 'event'
number of subjects = 287
  number of events = 158
    log-likelihood = -724.975
  time fit was run = 2019-03-05 20:03:33 UTC

---
                    coef  exp(coef)  se(coef)      z     p  -log2(p)  lower 0.95  upper 0.95
year               0.110      1.116     0.059  1.870 0.061     4.024      -0.005       0.225
year**2           -0.005      0.995     0.002 -2.092 0.036     4.779      -0.010      -0.000
president_Bush 41  1.449      4.257     0.489  2.962 0.003     8.355       0.490       2.407
president_Bush 43 -0.625      0.535     0.539 -1.160 0.246     2.023      -1.681       0.431
president_Carter   4.600     99.502     1.431  3.215 0.001     9.580       1.795       7.405
president_Obama   -1.074      0.342     1.042 -1.030 0.303     1.723      -3.117       0.969
president_Reagan   2.366     10.658     0.886  2.672 0.008     7.050       0.630       4.102
president_Trump    0.763      2.145     1.674  0.456 0.648     0.625      -2.518       4.044
---
Concordance = 0.603
Log-likelihood ratio test = 22.407 on 8 df, -log2(p)=7.890

In [13]:
cph.check_assumptions(regression_df)


Proportional hazard assumption looks okay.

In [ ]:


In [ ]:


In [15]:
from lifelines import *

wf = WeibullAFTFitter(penalizer=0.0)
wf.fit(regression_df, 'T', 'event')
wf.print_summary(3)


<lifelines.WeibullAFTFitter: fitted with 287 observations, 129 censored>
      duration col = 'T'
         event col = 'event'
number of subjects = 287
  number of events = 158
    log-likelihood = -1319.942
  time fit was run = 2019-03-05 20:03:44 UTC

---
                            coef  exp(coef)  se(coef)      z       p  -log2(p)  lower 0.95  upper 0.95
lambda_ year              -0.031      0.970     0.026 -1.186   0.236     2.085      -0.081       0.020
        year**2            0.003      1.003     0.001  2.190   0.029     5.132       0.000       0.005
        president_Bush 41 -0.525      0.591     0.209 -2.512   0.012     6.379      -0.935      -0.115
        president_Bush 43  0.077      1.080     0.245  0.313   0.754     0.407      -0.404       0.557
        president_Carter  -1.837      0.159     0.629 -2.922   0.003     8.166      -3.069      -0.605
        president_Obama    0.109      1.115     0.474  0.229   0.819     0.289      -0.821       1.038
        president_Reagan  -0.930      0.395     0.393 -2.366   0.018     5.798      -1.701      -0.160
        president_Trump   -0.956      0.384     0.790 -1.211   0.226     2.145      -2.503       0.592
        _intercept         7.463   1742.774     0.116 64.480 <0.0005       inf       7.236       7.690
rho_    _intercept         0.724      2.064     0.068 10.672 <0.0005    85.911       0.591       0.857
---
Concordance = 0.599
Log-likelihood ratio test = 19.811 on 8 df, -log2(p)=6.496

In [ ]:


In [16]:
lnf = LogNormalAFTFitter(penalizer=0.0000)
lnf.fit(regression_df, 'T', 'event')
lnf.print_summary(3)


<lifelines.LogNormalAFTFitter: fitted with 287 observations, 129 censored>
      duration col = 'T'
         event col = 'event'
number of subjects = 287
  number of events = 158
    log-likelihood = -1328.054
  time fit was run = 2019-03-05 20:03:46 UTC

---
                           coef  exp(coef)  se(coef)      z       p  -log2(p)  lower 0.95  upper 0.95
mu_    year              -0.034      0.966     0.027 -1.268   0.205     2.288      -0.087       0.019
       year**2            0.003      1.003     0.001  1.995   0.046     4.441       0.000       0.005
       president_Bush 41 -0.456      0.634     0.241 -1.891   0.059     4.092      -0.929       0.017
       president_Bush 43  0.221      1.247     0.276  0.801   0.423     1.240      -0.320       0.762
       president_Carter  -1.768      0.171     0.676 -2.616   0.009     6.811      -3.093      -0.443
       president_Obama    0.231      1.260     0.502  0.461   0.645     0.632      -0.753       1.215
       president_Reagan  -0.887      0.412     0.445 -1.996   0.046     4.445      -1.759      -0.016
       president_Trump   -0.680      0.507     0.802 -0.848   0.396     1.335      -2.252       0.892
       _intercept         7.171   1301.074     0.132 54.238 <0.0005       inf       6.912       7.430
sigma_ _intercept        -0.343      0.710     0.060 -5.742 <0.0005    26.671      -0.459      -0.226
---
Concordance = 0.611
Log-likelihood ratio test = 15.857 on 8 df, -log2(p)=4.491

In [17]:
llf = LogLogisticAFTFitter(penalizer=0.000)
llf.fit(regression_df, 'T', 'event')
llf.print_summary(3)


<lifelines.LogLogisticAFTFitter: fitted with 287 observations, 129 censored>
      duration col = 'T'
         event col = 'event'
number of subjects = 287
  number of events = 158
    log-likelihood = -1321.774
  time fit was run = 2019-03-05 20:03:46 UTC

---
                           coef  exp(coef)  se(coef)      z       p  -log2(p)  lower 0.95  upper 0.95
alpha_ year              -0.032      0.969     0.026 -1.234   0.217     2.203      -0.083       0.019
       year**2            0.002      1.002     0.001  2.005   0.045     4.474       0.000       0.005
       president_Bush 41 -0.412      0.662     0.228 -1.804   0.071     3.810      -0.860       0.036
       president_Bush 43  0.183      1.201     0.255  0.719   0.472     1.083      -0.316       0.683
       president_Carter  -1.741      0.175     0.660 -2.636   0.008     6.898      -3.035      -0.447
       president_Obama    0.225      1.252     0.471  0.478   0.633     0.661      -0.697       1.147
       president_Reagan  -0.836      0.433     0.423 -1.977   0.048     4.381      -1.666      -0.007
       president_Trump   -0.745      0.475     0.763 -0.976   0.329     1.604      -2.242       0.751
       _intercept         7.188   1324.064     0.125 57.503 <0.0005       inf       6.943       7.433
beta_  _intercept         0.966      2.627     0.068 14.229 <0.0005   150.202       0.833       1.099
---
Concordance = 0.610
Log-likelihood ratio test = 18.367 on 8 df, -log2(p)=5.746

In [18]:
wf.plot_covariate_groups(['year', 'year**2'], values=[[x, x**2] for x in linspace(-21, 21, 10)], cmap='coolwarm')


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x11809a2b0>

In [19]:
wf.plot_covariate_groups(covariates=regression_df.filter(like='president').columns.tolist(), 
                         values=np.eye(6))


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bae45f8>

In [ ]:


In [ ]: