notebook.community

Edit and run



In [1]:

    
import lifelines
import pymc as pm
import pyBMA
import matplotlib.pyplot as plt
import numpy as np
from math import log
from datetime import datetime
import pandas as pd
%matplotlib inline

The first step in any data analysis is acquiring and munging the data

An example data set can be found at: https://jakecoltman.gitlab.io/website/post/pydata/

Download the file output.txt and transform it into a format like below where the event column should be 0 if there's only one entry for an id, and 1 if there are two entries:

End date = datetime.datetime(2016, 5, 3, 20, 36, 8, 92165)

id,time_to_convert,age,male,event,search,brand



In [10]:

    
running_id = 0
output = [[0]]
with open("E:/output.txt") as file_open:
    for row in file_open.read().split("\n"):
        cols = row.split(",")
        if cols[0] == output[-1][0]:
            output[-1].append(cols[1])
            output[-1].append(True)
        else:
            output.append(cols)
    output = output[1:]
    
for row in output:
    if len(row) == 6:
        row += [datetime(2016, 5, 3, 20, 36, 8, 92165), False]
output = output[1:-1]



In [13]:

    
def convert_to_days(dt):
    day_diff = dt / np.timedelta64(1, 'D')
    if day_diff == 0:
        return 23.0
    else: 
        return day_diff

df = pd.DataFrame(output, columns=["id", "advert_time", "male","age","search","brand","conversion_time","event"])
df["lifetime"] = pd.to_datetime(df["conversion_time"]) - pd.to_datetime(df["advert_time"])
df["lifetime"] = df["lifetime"].apply(convert_to_days)
df["male"] = df["male"].astype(int)
df["search"] = df["search"].astype(int)
df["brand"] = df["brand"].astype(int)
df["age"] = df["age"].astype(int)
df["event"] = df["event"].astype(int)
df = df.drop('advert_time', 1)
df = df.drop('conversion_time', 1)
df = df.set_index("id")
df = df.dropna(thresh=2)
df.median()
df









    Out[13]:






  
    
      
      male
      age
      search
      brand
      event
      lifetime
    
    
      id
      
      
      
      
      
      
    
  
  
    
      1
      0
      59
      1
      1
      0
      18.358052
    
    
      2
      0
      41
      0
      0
      0
      19.613458
    
    
      3
      0
      56
      0
      1
      1
      10.007883
    
    
      4
      0
      47
      0
      0
      1
      11.601139
    
    
      5
      1
      44
      1
      0
      1
      11.815086
    
    
      6
      1
      29
      1
      0
      0
      21.031488
    
    
      7
      0
      22
      0
      0
      1
      13.039954
    
    
      8
      0
      23
      0
      1
      1
      8.593106
    
    
      9
      1
      19
      0
      0
      1
      1.965708
    
    
      10
      0
      54
      0
      0
      1
      12.745363
    
    
      11
      0
      20
      1
      0
      1
      14.968616
    
    
      12
      0
      56
      0
      0
      1
      7.581745
    
    
      13
      0
      53
      0
      0
      1
      5.788385
    
    
      14
      0
      49
      0
      0
      1
      14.253003
    
    
      15
      0
      48
      1
      0
      0
      18.254036
    
    
      16
      0
      19
      1
      1
      1
      21.719913
    
    
      17
      0
      34
      1
      0
      0
      18.651031
    
    
      18
      0
      22
      1
      0
      0
      20.116080
    
    
      19
      0
      38
      1
      0
      0
      20.770570
    
    
      20
      1
      32
      1
      0
      1
      12.220795
    
    
      21
      0
      42
      0
      0
      1
      10.867250
    
    
      22
      1
      26
      1
      1
      1
      19.342920
    
    
      23
      0
      57
      1
      0
      1
      17.028378
    
    
      24
      1
      46
      1
      1
      1
      14.920933
    
    
      25
      0
      58
      1
      0
      0
      20.671843
    
    
      26
      1
      50
      1
      1
      1
      15.711533
    
    
      27
      0
      49
      1
      0
      0
      18.648141
    
    
      28
      0
      27
      1
      0
      0
      21.813143
    
    
      29
      0
      38
      0
      1
      1
      17.475041
    
    
      30
      0
      37
      1
      0
      1
      15.558864
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      2971
      1
      33
      1
      0
      1
      12.139852
    
    
      2972
      1
      56
      0
      0
      1
      1.915499
    
    
      2973
      1
      27
      0
      1
      1
      1.668352
    
    
      2974
      0
      37
      1
      0
      1
      14.942464
    
    
      2975
      1
      28
      1
      1
      1
      17.154657
    
    
      2976
      1
      52
      1
      1
      1
      11.467616
    
    
      2977
      0
      43
      1
      1
      0
      22.375457
    
    
      2978
      0
      43
      0
      0
      1
      3.552823
    
    
      2979
      0
      25
      0
      0
      1
      11.401859
    
    
      2980
      0
      38
      1
      0
      0
      18.574942
    
    
      2981
      0
      55
      1
      0
      0
      21.206353
    
    
      2982
      0
      58
      1
      0
      1
      13.556694
    
    
      2983
      0
      56
      0
      1
      1
      8.015187
    
    
      2984
      1
      36
      0
      0
      1
      6.175789
    
    
      2985
      0
      58
      0
      1
      1
      21.832655
    
    
      2986
      0
      54
      1
      0
      1
      19.588053
    
    
      2987
      1
      24
      1
      0
      1
      16.310147
    
    
      2988
      0
      35
      0
      0
      1
      13.900499
    
    
      2989
      1
      47
      0
      0
      1
      5.327094
    
    
      2990
      0
      32
      1
      1
      0
      19.743018
    
    
      2991
      0
      50
      1
      0
      0
      18.470577
    
    
      2992
      0
      29
      0
      1
      0
      19.807723
    
    
      2993
      0
      58
      1
      0
      1
      16.500628
    
    
      2994
      0
      26
      1
      1
      1
      17.933190
    
    
      2995
      0
      37
      0
      1
      1
      18.706615
    
    
      2996
      0
      50
      0
      0
      1
      5.912346
    
    
      2997
      0
      51
      0
      1
      1
      14.606427
    
    
      2998
      0
      49
      0
      0
      1
      10.257404
    
    
      2999
      1
      34
      1
      0
      1
      12.975489
    
    
      3000
      0
      37
      1
      0
      0
      18.119690
    
  

3000 rows × 6 columns



In [ ]:

    
###Parametric Bayes
#Shout out to Cam Davidson-Pilon



In [16]:

    
## Example fully worked model using toy data
## Adapted from http://blog.yhat.com/posts/estimating-user-lifetimes-with-pymc.html
## Note that we've made some corrections
censor = np.array(df["event"].apply(lambda x: 0 if x else 1).tolist())
alpha = pm.Uniform("alpha", 0,50) 
beta = pm.Uniform("beta", 0,50) 

@pm.observed
def survival(value=df["lifetime"], alpha = alpha, beta = beta ):
    return sum( (1-censor)*(np.log( alpha/beta) + (alpha-1)*np.log(value/beta)) - (value/beta)**(alpha))


mcmc = pm.MCMC([alpha, beta, survival ] )
mcmc.sample(10000)









    



 [-----------------100%-----------------] 10000 of 10000 complete in 24.9 sec



In [17]:

    
pm.Matplot.plot(mcmc)
mcmc.trace("alpha")[:]









    



Plotting alpha
Plotting beta






    Out[17]:





array([ 8.97671912,  8.97671912,  8.97671912, ...,  1.79408852,
        1.79408852,  1.79408852])

Problems:

2 - Try to fit your data from section 1 
3 - Use the results to plot the distribution of the median
--------
4 - Try adjusting the number of samples, the burn parameter and the amount of thinning to correct get good answers
5 - Try adjusting the prior and see how it affects the estimate
--------
6 - Try to fit a different distribution to the data
7 - Compare answers
Bonus - test the hypothesis that the true median is greater than a certain amount

For question 2, note that the median of a Weibull is:

$$β(log 2)^{1/α}$$



In [18]:

    
#Solution to question 4:
def weibull_median(alpha, beta):
    return beta * ((log(2)) ** ( 1 / alpha))
plt.hist([weibull_median(x[0], x[1]) for x in zip(mcmc.trace("alpha"), mcmc.trace("beta"))])









    Out[18]:





(array([  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   9.81600000e+03,   1.80000000e+02,
          3.00000000e+00]),
 array([  4.88512654,   6.28886044,   7.69259433,   9.09632823,
         10.50006213,  11.90379603,  13.30752992,  14.71126382,
         16.11499772,  17.51873162,  18.92246552]),
 <a list of 10 Patch objects>)



In [20]:

    
#Solution to question 4:
### Increasing the burn parameter allows us to discard results before convergence
### Thinning the results removes autocorrelation

mcmc = pm.MCMC([alpha, beta, survival ] )
mcmc.sample(10000, burn = 3000, thin = 20)

pm.Matplot.plot(mcmc)









    



 [-----------------100%-----------------] 10000 of 10000 complete in 22.0 secPlotting alpha
Plotting beta



In [22]:

    
#Solution to Q5
## Adjusting the priors impacts the overall result
## If we give a looser, less informative prior then we end up with a broader, shorter distribution
## If we give much more informative priors, then we get a tighter, taller distribution

censor = np.array(df["event"].apply(lambda x: 0 if x else 1).tolist())

## Note the narrowing of the prior
alpha = pm.Normal("alpha", 1.7, 10000) 
beta = pm.Normal("beta", 18.5, 10000) 

####Uncomment this to see the result of looser priors
## Note this ends up pretty much the same as we're already very loose
#alpha = pm.Uniform("alpha", 0, 30) 
#beta = pm.Uniform("beta", 0, 30) 

@pm.observed
def survival(value=df["lifetime"], alpha = alpha, beta = beta ):
    return sum( (1-censor)*(np.log( alpha/beta) + (alpha-1)*np.log(value/beta)) - (value/beta)**(alpha))

mcmc = pm.MCMC([alpha, beta, survival ] )
mcmc.sample(10000, burn = 5000, thin = 20)
pm.Matplot.plot(mcmc)
#plt.hist([weibull_median(x[0], x[1]) for x in zip(mcmc.trace("alpha"), mcmc.trace("beta"))])









    



 [-----------------100%-----------------] 10000 of 10000 complete in 18.7 secPlotting alpha
Plotting beta



In [143]:

    
## Solution to bonus
## Super easy to do in the Bayesian framework, all we need to do is look at what % of samples
## meet our criteria
medians = [weibull_median(x[0], x[1]) for x in zip(mcmc.trace("alpha"), mcmc.trace("beta"))]
testing_value = 15.6
number_of_greater_samples = sum([x >= testing_value for x in medians])
100 * (number_of_greater_samples / len(medians))









    Out[143]:





64.0



In [ ]:

    
#Cox model

If we want to look at covariates, we need a new approach. We'll use Cox proprtional hazards. More information here.



In [23]:

    
#Fitting solution
cf = lifelines.CoxPHFitter()
cf.fit(df, 'lifetime', event_col = 'event')
cf.summary









    



C:\Users\j.coltman\AppData\Local\Continuum\Anaconda3\lib\site-packages\lifelines\fitters\coxph_fitter.py:285: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df.sort(duration_col, inplace=True)






    Out[23]:






  
    
      
      coef
      exp(coef)
      se(coef)
      z
      p
      lower 0.95
      upper 0.95
    
  
  
    
      male
      0.918199
      2.504776
      0.023830
      38.531786
      0.000000
      0.871484
      0.964915
    
    
      age
      -0.007011
      0.993013
      0.022145
      -0.316614
      0.751537
      -0.050425
      0.036402
    
    
      search
      -1.051111
      0.349549
      0.024398
      -43.082327
      0.000000
      -1.098940
      -1.003281
    
    
      brand
      0.011492
      1.011558
      0.022047
      0.521233
      0.602204
      -0.031730
      0.054713

Once we've fit the data, we need to do something useful with it. Try to do the following things:

1 - Plot the baseline survival function

2 - Predict the functions for a particular set of features

3 - Plot the survival function for two different set of features

4 - For your results in part 3 caculate how much more likely a death event is for one than the other for a given period of time



In [24]:

    
#Solution to 1
fig, axis = plt.subplots(nrows=1, ncols=1)
cf.baseline_survival_.plot(ax = axis, title = "Baseline Survival")









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x38ecf49da0>



In [26]:

    
# Solution to prediction
regressors = np.array([[1,45,0,0]])
survival = cf.predict_survival_function(regressors)
survival









    Out[26]:






  
    
      
      0
    
    
      event_at
      
    
  
  
    
      0.000000
      1.000000e+00
    
    
      1.007159
      9.986030e-01
    
    
      1.011755
      9.972059e-01
    
    
      1.026149
      9.958088e-01
    
    
      1.032037
      9.944117e-01
    
    
      1.032459
      9.930145e-01
    
    
      1.035578
      9.916174e-01
    
    
      1.046138
      9.902203e-01
    
    
      1.048734
      9.888231e-01
    
    
      1.048797
      9.874260e-01
    
    
      1.052254
      9.860288e-01
    
    
      1.052932
      9.846316e-01
    
    
      1.056191
      9.832344e-01
    
    
      1.056964
      9.818372e-01
    
    
      1.059231
      9.804399e-01
    
    
      1.073340
      9.790427e-01
    
    
      1.078840
      9.776454e-01
    
    
      1.086120
      9.762481e-01
    
    
      1.090214
      9.748508e-01
    
    
      1.099706
      9.734534e-01
    
    
      1.106758
      9.720560e-01
    
    
      1.112486
      9.706586e-01
    
    
      1.115967
      9.692611e-01
    
    
      1.116742
      9.678636e-01
    
    
      1.126141
      9.664661e-01
    
    
      1.130302
      9.650685e-01
    
    
      1.132234
      9.636710e-01
    
    
      1.136273
      9.622735e-01
    
    
      1.138430
      9.608758e-01
    
    
      1.142263
      9.594782e-01
    
    
      ...
      ...
    
    
      22.683012
      1.578424e-12
    
    
      22.695353
      1.578424e-12
    
    
      22.695885
      1.578424e-12
    
    
      22.729392
      1.578424e-12
    
    
      22.738744
      1.578424e-12
    
    
      22.740731
      1.578424e-12
    
    
      22.759964
      1.578424e-12
    
    
      22.760298
      1.578424e-12
    
    
      22.764757
      1.578424e-12
    
    
      22.765239
      1.578424e-12
    
    
      22.767487
      1.578424e-12
    
    
      22.774943
      1.578424e-12
    
    
      22.775257
      1.578424e-12
    
    
      22.784470
      1.578424e-12
    
    
      22.789286
      1.578424e-12
    
    
      22.800327
      1.578424e-12
    
    
      22.812494
      1.578424e-12
    
    
      22.826084
      1.578424e-12
    
    
      22.844867
      1.578424e-12
    
    
      22.884070
      1.578424e-12
    
    
      22.890574
      1.578424e-12
    
    
      22.946520
      1.578424e-12
    
    
      22.953504
      1.578424e-12
    
    
      22.955722
      1.578424e-12
    
    
      22.960908
      1.578424e-12
    
    
      22.965550
      1.578424e-12
    
    
      22.968054
      1.578424e-12
    
    
      22.975204
      1.578424e-12
    
    
      22.990884
      1.578424e-12
    
    
      22.997638
      1.578424e-12
    
  

3001 rows × 1 columns



In [32]:

    
#Solution to plotting multiple regressors
fig, axis = plt.subplots(nrows=1, ncols=1, sharex=True)
regressor1 = np.array([[1,45,0,1]])
regressor2 = np.array([[1,23,1,1]])
survival_1 = cf.predict_survival_function(regressor1)
survival_2 = cf.predict_survival_function(regressor2)
plt.plot(survival_1,label = "32 year old male")
plt.plot(survival_2,label = "46 year old female")
plt.legend(loc = "lower left")









    Out[32]:





<matplotlib.legend.Legend at 0x38edcce390>



In [34]:

    
#Difference in survival 
odds = survival_1 / survival_2
plt.plot(odds, c = "red")









    Out[34]:





[<matplotlib.lines.Line2D at 0x38ecad6860>]

Model selection

Difficult to do with classic tools (here)

Problem:

1 - Calculate the BMA coefficient values

2 - Compare these results to past the lifelines results

3 - Try running with different priors



In [40]:

    
##Solution to 1
from pyBMA import CoxPHFitter
bmaCox = pyBMA.CoxPHFitter.CoxPHFitter()
bmaCox.fit(df, "lifetime", event_col= "event", priors= [0.5]*4)









    



C:\Users\j.coltman\AppData\Local\Continuum\Anaconda3\lib\site-packages\lifelines\fitters\coxph_fitter.py:285: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df.sort(duration_col, inplace=True)






    Out[40]:





<pyBMA.CoxPHFitter.CoxPHFitter at 0x38edadaeb8>



In [41]:

    
print(bmaCox.summary)









    



            coef  exp(coef)
age    -0.000122   0.999878
brand   0.000225   1.000225
male    0.917887   2.503993
search -1.050679   0.349700



In [42]:

    
#Low probability for everything favours parsimonious models
bmaCox = pyBMA.CoxPHFitter.CoxPHFitter()
bmaCox.fit(df, "lifetime", event_col= "event", priors= [0.1]*4)
print(bmaCox.summary)









    



C:\Users\j.coltman\AppData\Local\Continuum\Anaconda3\lib\site-packages\lifelines\fitters\coxph_fitter.py:285: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df.sort(duration_col, inplace=True)






    Out[42]:





<pyBMA.CoxPHFitter.CoxPHFitter at 0x38eee54ef0>



In [43]:

    
#Low probability for everything favours parsimonious models
bmaCox = pyBMA.CoxPHFitter.CoxPHFitter()
bmaCox.fit(df, "lifetime", event_col= "event", priors= [0.9]*4)
print(bmaCox.summary)









    



            coef  exp(coef)
age    -0.000014   0.999986
brand   0.000026   1.000026
male    0.917881   2.503978
search -1.050672   0.349703



In [45]:

    
#Low probability for everything favours parsimonious models
bmaCox = pyBMA.CoxPHFitter.CoxPHFitter()
bmaCox.fit(df, "lifetime", event_col= "event", priors= [0.3, 0.9, 0.001, 0.3])
print(bmaCox.summary)









    



            coef  exp(coef)
age    -0.000967   0.999033
brand   0.000085   1.000085
male    0.917849   2.503900
search -1.050716   0.349687






    



C:\Users\j.coltman\AppData\Local\Continuum\Anaconda3\lib\site-packages\lifelines\fitters\coxph_fitter.py:285: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df.sort(duration_col, inplace=True)



In [ ]:

	male	age	search	brand	event	lifetime
id
1	0	59	1	1	0	18.358052
2	0	41	0	0	0	19.613458
3	0	56	0	1	1	10.007883
4	0	47	0	0	1	11.601139
5	1	44	1	0	1	11.815086
6	1	29	1	0	0	21.031488
7	0	22	0	0	1	13.039954
8	0	23	0	1	1	8.593106
9	1	19	0	0	1	1.965708
10	0	54	0	0	1	12.745363
11	0	20	1	0	1	14.968616
12	0	56	0	0	1	7.581745
13	0	53	0	0	1	5.788385
14	0	49	0	0	1	14.253003
15	0	48	1	0	0	18.254036
16	0	19	1	1	1	21.719913
17	0	34	1	0	0	18.651031
18	0	22	1	0	0	20.116080
19	0	38	1	0	0	20.770570
20	1	32	1	0	1	12.220795
21	0	42	0	0	1	10.867250
22	1	26	1	1	1	19.342920
23	0	57	1	0	1	17.028378
24	1	46	1	1	1	14.920933
25	0	58	1	0	0	20.671843
26	1	50	1	1	1	15.711533
27	0	49	1	0	0	18.648141
28	0	27	1	0	0	21.813143
29	0	38	0	1	1	17.475041
30	0	37	1	0	1	15.558864
...	...	...	...	...	...	...
2971	1	33	1	0	1	12.139852
2972	1	56	0	0	1	1.915499
2973	1	27	0	1	1	1.668352
2974	0	37	1	0	1	14.942464
2975	1	28	1	1	1	17.154657
2976	1	52	1	1	1	11.467616
2977	0	43	1	1	0	22.375457
2978	0	43	0	0	1	3.552823
2979	0	25	0	0	1	11.401859
2980	0	38	1	0	0	18.574942
2981	0	55	1	0	0	21.206353
2982	0	58	1	0	1	13.556694
2983	0	56	0	1	1	8.015187
2984	1	36	0	0	1	6.175789
2985	0	58	0	1	1	21.832655
2986	0	54	1	0	1	19.588053
2987	1	24	1	0	1	16.310147
2988	0	35	0	0	1	13.900499
2989	1	47	0	0	1	5.327094
2990	0	32	1	1	0	19.743018
2991	0	50	1	0	0	18.470577
2992	0	29	0	1	0	19.807723
2993	0	58	1	0	1	16.500628
2994	0	26	1	1	1	17.933190
2995	0	37	0	1	1	18.706615
2996	0	50	0	0	1	5.912346
2997	0	51	0	1	1	14.606427
2998	0	49	0	0	1	10.257404
2999	1	34	1	0	1	12.975489
3000	0	37	1	0	0	18.119690

	coef	exp(coef)	se(coef)	z	p	lower 0.95	upper 0.95
male	0.918199	2.504776	0.023830	38.531786	0.000000	0.871484	0.964915
age	-0.007011	0.993013	0.022145	-0.316614	0.751537	-0.050425	0.036402
search	-1.051111	0.349549	0.024398	-43.082327	0.000000	-1.098940	-1.003281
brand	0.011492	1.011558	0.022047	0.521233	0.602204	-0.031730	0.054713

	0
event_at
0.000000	1.000000e+00
1.007159	9.986030e-01
1.011755	9.972059e-01
1.026149	9.958088e-01
1.032037	9.944117e-01
1.032459	9.930145e-01
1.035578	9.916174e-01
1.046138	9.902203e-01
1.048734	9.888231e-01
1.048797	9.874260e-01
1.052254	9.860288e-01
1.052932	9.846316e-01
1.056191	9.832344e-01
1.056964	9.818372e-01
1.059231	9.804399e-01
1.073340	9.790427e-01
1.078840	9.776454e-01
1.086120	9.762481e-01
1.090214	9.748508e-01
1.099706	9.734534e-01
1.106758	9.720560e-01
1.112486	9.706586e-01
1.115967	9.692611e-01
1.116742	9.678636e-01
1.126141	9.664661e-01
1.130302	9.650685e-01
1.132234	9.636710e-01
1.136273	9.622735e-01
1.138430	9.608758e-01
1.142263	9.594782e-01
...	...
22.683012	1.578424e-12
22.695353	1.578424e-12
22.695885	1.578424e-12
22.729392	1.578424e-12
22.738744	1.578424e-12
22.740731	1.578424e-12
22.759964	1.578424e-12
22.760298	1.578424e-12
22.764757	1.578424e-12
22.765239	1.578424e-12
22.767487	1.578424e-12
22.774943	1.578424e-12
22.775257	1.578424e-12
22.784470	1.578424e-12
22.789286	1.578424e-12
22.800327	1.578424e-12
22.812494	1.578424e-12
22.826084	1.578424e-12
22.844867	1.578424e-12
22.884070	1.578424e-12
22.890574	1.578424e-12
22.946520	1.578424e-12
22.953504	1.578424e-12
22.955722	1.578424e-12
22.960908	1.578424e-12
22.965550	1.578424e-12
22.968054	1.578424e-12
22.975204	1.578424e-12
22.990884	1.578424e-12
22.997638	1.578424e-12