In [1]:
%pylab inline
import pandas as pd
import pygraphviz as pgz
from IPython.display import Image
from IPython.display import HTML
from scipy import stats
from lifelines.estimation import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines import CoxPHFitter
import statsmodels.api as sm
from myfisher import *
from utilHelpers import lineHist
from scipy import integrate
from functools import partial
from myboxplot import *
from arsampling import rejectionSampling
pylab.rc('font', family='sans-serif', size=15)
pd.set_option('display.max_colwidth', -1)
In [2]:
Image('images/enrollment_diagram-01.png',width = 700)
Out[2]:
In [83]:
"""Define parameter values"""
tmp = [('N',10000,"Total participants enrolled"),
('lostRate', 0.1,"Rate of participants lost to followup (per year)"),
('K', 2, "Randomization ratio of COR+ to treatment vs. placebo"),
('pi0', 0.2, "Prevalence of COR+"),
('Q', 0.9, "Prevalence of QFT+ (independent of COR for now...)"),
('F', 0.5, "Fraction of COR- that are followed to the endpoint"),
('tbRate', 0.008, "Rate of developing TB disease (TB per year in COR-)"),
('RRcor', 4.0, "Max RR for COR+ at time of diagnostic"),
('RRtmt', 2.0, "Reduction in RR for COR+ treated vs. untreated"),
('discretize',False,'Indicator of whether event times should be discretized by visits'),
('visits', 'monthly', "Vector of times in years of follow-up visits (for discretizing endpoints)"),
('RRdecay', 1, "Decay time constant for RRcor")]
paramsDf = pd.DataFrame(tmp,columns = ['Var','Value','Description'])
html = paramsDf.to_html(index=False)
params = paramsDf.set_index('Var').Value
if params['visits'] == 'monthly':
params['visits'] = (1/12)*arange(24)
HTML(html)
Out[83]:
In [4]:
def initializeDf(params):
"""Generate a full cohort of participants based on randomization parameters and COR+ prevelence,
in preparation for a single simulation."""
N,K,pi0,Q = params['N'],params['K'],params['pi0'],params['Q']
df = pd.DataFrame({'ptid':arange(N),
'QFT':zeros(N),
'COR':zeros(N),
'follow':ones(N),
'treated':zeros(N)}).set_index('ptid')
"""Set COR positivity for whole cohort"""
df['COR'] = (rand(N)<pi0).astype(int)
"""Set QFT positivity for whole cohort"""
"""For now it is independent of COR and is not used"""
df['QFT'] = (rand(N)<Q).astype(int)
"""Only a fraction of the COR- will be followed."""
df.loc[df.COR==0,'follow'] = (rand((df.COR==0).sum()) < params['F']).astype(int)
"""COR+ arm is randomized to treatment/placebo in K:1 ratio"""
ind = find(df.COR==1)
nTreated = int(round(len(ind)*(1-1/(K+1))))
df.loc[ind[:nTreated],'treated'] = 1
return df
In [5]:
df = initializeDf(params)
cols = ['COR','follow','treated']
print df[cols].groupby(cols).agg(len)
In [6]:
"""Time-varying (tv) function for RR parameter (RR has exp decay)"""
RRCORFunc = lambda RR,decay,t: (RR-1)*exp(-t/decay) + 1
"""PDF for exponential distribution"""
expPDF = lambda lam,t: lam*exp(-lam*t)
"""RV for exponential PDF"""
expRV = lambda lam0,n: -log(rand(n)) / lam0
"""Density function (not PDF) with time-varying lambda
For AR sampling to work with the exponential envelope function:
(1) RRmax >= 1
(2) RRdecay << 1/lam0
(I think. Could probably eliminate this by forcing envelope
to have longer/slower decay than target)"""
tvDF = lambda lam0,RR,decay,t: expPDF(lam0*RRCORFunc(RR,decay,t),t)
def endpointRV(QFT,COR,treated,n,params):
"""Simulates a group of participants time-to-TB endpoint based on model parameters and participant traits.
Parameters
----------
QFT : int
Indicator of QuantiFERON positivity
(not used currently)
COR : int
Indicator of COR positivity
treated : int
Indicator of treatment assignment
n : int
Number of endpoints to generate.
params : dict
Contains relevant model parameters including: tbRate, visits, RRcor, RRdecay, RRtmt, lostRate
Returns
-------
censored : array, dtype=int
Indicator of right censoring from lost to follow-up
endpoint : array, dtype=float64
Time in years of endpoints, possibly discretized by visits"""
if COR==1:
"""All COR+ will have a time-varying RR determined by RRmax and RRdecay"""
if treated==1:
"""RRmax will be less by a factor of RRtmt for treated COR+ participants"""
RRmax = params['RRcor'] * (1/params['RRtmt'])
elif treated==0:
RRmax = params['RRcor']
"""Numerical integration to compute normalizing constant for these parameters"""
ic,err = integrate.quad(partial(tvDF,params['tbRate'],RRmax,params['RRdecay']),0,inf)
#print "Normalizing constant (upper bound on error): %1.3f (%1.3g)" % (ic,err)
"""Normalized PDF with a time-varying lambda parameter"""
tvPDF = lambda t: partial(tvDF,params['tbRate'],RRmax,params['RRdecay'])(t)/ic
"""Scaled envelope function such that envPDF >= targetPDF"""
envPDF = lambda t: (tvPDF(0)/expPDF(params['tbRate'],0)) * expPDF(params['tbRate'],t)
T,acc = rejectionSampling(envPDF = envPDF,
envRV = partial(expRV,params['tbRate']),
targetPDF = tvPDF,
n = n)
elif COR==0:
"""Generate random exponential variate based on rate parameter"""
T = expRV(params['tbRate'],n)
"""Generate lost-to-followup indicators"""
Tlost = expRV(params['lostRate'],n)
censored = zeros(n)
censored[Tlost<T] = 1
T[Tlost<T] = Tlost[Tlost<T]
if params['discretize']:
"""Discretize T based on visits:"""
"""Last visit before lost"""
T[censored==1] = [argmax(find(params['visits'] < t)) for t in T[censored==1]]
"""First visit after endpoint"""
T[censored==0] = [argmax(find(params['visits'] > t)) for t in T[censored==0]]
return censored, T
In [7]:
def simulateDisease(df,followup1,followup2,params):
"""Simulate endpoints for all participants in the cohort,
during the specified followup periods and according to params"""
"""QFT parameter is not used currently"""
QFT = nan
"""Numpy arrays for temporarily storing all generated data."""
disease = nan*ones((df.shape[0],2))
dx = nan*zeros((df.shape[0],2))
for COR,treated in [(0,0), (1,0), (1,1)]:
ind = ((df.COR==COR) & (df.treated==treated) & (df.follow==1)).values
censored, T = endpointRV(QFT,COR,treated,ind.sum(),params)
"""Some data will be censored due to lost-to-followup"""
disease[ind,0] = 1 - censored
disease[ind,1] = 1 - censored
dx[ind,0] = T
dx[ind,1] = T
"""Additionally, right-censor dx based on length of follow-up"""
disease[ind & (dx[:,0] > followup1), 0] = 0
disease[ind & (dx[:,1] > followup2), 1] = 0
dx[ind & (dx[:,0] > followup1), 0] = followup1
dx[ind & (dx[:,1] > followup2), 1] = followup2
"""Copy data into the DataFrame"""
df['disease1'] = disease[:,0]
df['disease2'] = disease[:,1]
df['dx1'] = dx[:,0]
df['dx2'] = dx[:,1]
return df
In [8]:
%time df = simulateDisease(df, followup1=1, followup2=2, params=params)
cols = ['COR','follow','treated']
print df[cols+['disease1','disease2']].groupby(cols).agg(sum)
In [84]:
"""After the first year, compare the TB rates in COR+ treated vs. non-treated"""
kmf = KaplanMeierFitter()
figure(figsize=(12,5))
ax = subplot(111)
for tmti,tmt in enumerate(['Non-treated','Treated']):
ind = (df.COR==1) & (df.treated==tmti)
kmf.fit(durations = df.dx1[ind], event_observed = df.disease1[ind],label=tmt)
kmf.plot(ax=ax, show_censors=True)
ylabel('Probability of being TB disease free\nduring the follow-up period')
xlabel('Duration (years)')
summary, pvalue, test_results = logrank_test(df.dx1[(df.COR==1) & (df.treated==0)],
df.dx1[(df.COR==1) & (df.treated==1)],
df.disease1[(df.COR==1) & (df.treated==0)],
df.disease1[(df.COR==1) & (df.treated==1)],
suppress_print = True)
annotate(xy = (0.5,1), s='logrank p = %1.2g' % pvalue, xycoords = 'axes fraction',size ='x-large', ha = 'center',va='top')
Out[84]:
In [85]:
"""After the second year, compare the TB rates in COR+ treated vs. non-treated"""
kmf = KaplanMeierFitter()
figure(figsize=(12,5))
ax = subplot(111)
for tmti,tmt in enumerate(['Non-treated','Treated']):
ind = (df.COR==1) & (df.treated==tmti)
kmf.fit(durations = df.dx1[ind], event_observed = df.disease1[ind],label=tmt)
kmf.plot(ax=ax, show_censors=True)
ylabel('Probability of being TB disease free\nduring the follow-up period')
xlabel('Duration (years)')
summary, pvalue, test_results = logrank_test(df.dx2[(df.COR==1) & (df.treated==0)],
df.dx2[(df.COR==1) & (df.treated==1)],
df.disease2[(df.COR==1) & (df.treated==0)],
df.disease2[(df.COR==1) & (df.treated==1)],
suppress_print = True)
annotate(xy = (0.5,1), s='logrank p = %1.2g' % pvalue, xycoords = 'axes fraction',size ='x-large', ha = 'center',va='top')
Out[85]:
In [48]:
def estTECI(df, treatment_col='treated', event_col='disease2'):
"""Estimates treatment efficacy using cumulative incidence/attack rate.
TE = 1 - RR
RR = (c1/N1) / (c0/N0)
Parameters
----------
df : pandas.DataFrame
treatment_col : string
Column in df indicating treatment.
event_col : string
Column in df indicating events (censored data are 0)
covars : list
List of other columns to include in Cox model as covariates.
Returns
-------
est : float
Estimate of vaccine efficacy
ci : vector, length 2
95% confidence interval, [LL, UL]
pvalue : float
P-value for H0: TE=0 from Fisher's Exact test"""
a = ((df[treatment_col]==1) & (df[event_col]==1)).sum()
b = ((df[treatment_col]==1) & (df[event_col]==0)).sum()
c = ((df[treatment_col]==0) & (df[event_col]==1)).sum()
d = ((df[treatment_col]==0) & (df[event_col]==0)).sum()
rr = (a/(a+b)) / (c/(c+d))
te = 1 - rr
se = sqrt(b/(a*(a+b))+ d/(c*(c+d)))
ci = 1 - exp(array([log(rr)+se*1.96, log(rr)-se*1.96]))
"""Use the two-sided p-value from a Fisher's Exact test for now, although I think there are better ways..."""
pvalue = fisherTest([[a,b],[c,d]])[1]
return te,ci,pvalue
def estTEPH(df, treatment_col='treated', duration_col='dx2', event_col='disease2',covars=[]):
"""Estimates treatment efficacy using proportional hazards (Cox model).
Parameters
----------
df : pandas.DataFrame
treatment_col : string
Column in df indicating treatment.
duration_col : string
Column in df indicating survival times.
event_col : string
Column in df indicating events (censored data are 0)
covars : list
List of other columns to include in Cox model as covariates.
Returns
-------
est : float
Estimate of vaccine efficacy
ci : vector, length 2
95% confidence interval, [LL, UL]
pvalue : float
P-value for H0: VE=0"""
coxphf = CoxPHFitter()
coxphf.fit(df[[treatment_col, duration_col, event_col]+covars],duration_col = duration_col,event_col = event_col)
te = 1-exp(coxphf.hazards_.loc['coef',treatment_col])
ci = 1-exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound','lower-bound']])
pvalue = coxphf._compute_p_values()[0]
return te,ci,pvalue
In [91]:
def runKSimulations(k=100, duration_col='dx2', event_col='disease2'):
"""Run K simulations and return in one big df
Returns
-------
allDf : pd.DataFrame
Contains all data sets with added column simi."""
allDf = None
for simi in range(k):
df = initializeDf(params)
df = simulateDisease(df, followup1=1, followup2=2, params=params)
df['simi'] = simi
if allDf is None:
allDf = df
else:
allDf = pd.concat((allDf,df))
return allDf
def analyzeSimulations(allDf,duration_col='dx2', event_col='disease2'):
"""Compute summary stats for all simulations in allDf
Returns
-------
resDf : pd.DataFrame
Contains results of each simulation."""
summD = dict(CI_est = [],
CI_lower = [],
CI_upper = [],
CI_pvalue = [],
PH_est = [],
PH_lower = [],
PH_upper = [],
PH_pvalue = [],
logrank_pvalue = [],
pi0 = [])
summD['COR+, Treated Rate'] = []
summD['COR- Rate'] = []
summD['COR+, Non-treated Rate'] = []
for simi in range(allDf.simi.max()):
"""Estimate pi0 based on all screened participants"""
summD['pi0'] = ((allDf.simi==simi) & (allDf.COR==1)).sum()/(allDf.simi==simi).sum()
ind = (allDf.simi==simi) & (allDf.follow==1) & (allDf.COR==1)
df = allDf.loc[ind]
summary, pvalue, test_results = logrank_test(df.dx2[df.treated==0],
df.dx2[df.treated==1],
df.disease2[df.treated==0],
df.disease2[df.treated==1],
suppress_print = True)
summD['logrank_pvalue'].append(pvalue)
te,ci,pvalue = estTEPH(df, treatment_col='treated', duration_col=duration_col, event_col=event_col)
summD['PH_est'].append(te)
summD['PH_lower'].append(ci[0])
summD['PH_upper'].append(ci[1])
summD['PH_pvalue'].append(pvalue)
te,ci,pvalue = estTECI(df, treatment_col='treated', event_col=event_col)
summD['CI_est'].append(te)
summD['CI_lower'].append(ci[0])
summD['CI_upper'].append(ci[1])
summD['CI_pvalue'].append(pvalue)
a = ((df.treated==1) & (df[event_col]==1)).sum()
b = ((df.treated==1) & (df[event_col]==0)).sum()
c = ((df.treated==0) & (df[event_col]==1)).sum()
d = ((df.treated==0) & (df[event_col]==0)).sum()
summD['COR+, Treated Rate'].append(a/(a+b))
summD['COR+, Non-treated Rate'].append(c/(c+d))
ind = (allDf.simi==simi) & (allDf.follow==1) & (allDf.COR==0)
df = allDf.loc[ind]
a = (df[event_col]==1).sum()
b = (df[event_col]==0).sum()
summD['COR- Rate'].append(a/(a+b))
summDf = pd.DataFrame(summD)
summDf['RR COR+/- SOC'] = summDf['COR+, Non-treated Rate']/summD['COR- Rate']
summDf['theta'] = nan #TODO
summDf['Rate diff'] = nan #TODO
summDf['Rate ratio'] = nan #TODO
return summDf
In [92]:
#%time allDf, resDf = runKSimulations(100)
#allDf.to_csv(DATA_PATH + 'TB_ST_COR/sim_100_Dec16.csv')
#allDf = pd.read_csv(DATA_PATH + 'TB_ST_COR/sim_100_Dec16.csv')
resDf = analyzeSimulations(allDf)
In [86]:
subsets = {'COR+, Treated' : (allDf.COR==1) & (allDf.follow==1) & (allDf.treated==1),
'COR+, Non-treated' : (allDf.COR==1) & (allDf.follow==1) & (allDf.treated==0),
'COR-' : (allDf.COR==0) & (allDf.follow==1)}
colors = {'COR+, Treated' : 'tomato',
'COR+, Non-treated' : 'slateblue',
'COR-' : 'seagreen'}
kmf = KaplanMeierFitter()
figure(figsize=(17,8))
axi = 1
for label,ind in subsets.items():
axh = subplot(1,3,axi)
for simi in range(allDf.simi.max()):
kmf.fit(durations = allDf.dx2[(allDf.simi==simi) & ind], event_observed = allDf.disease2[(allDf.simi==simi) & ind])
kmf.plot(ax=axh, ci_show=False, show_censors=False,color = colors[label],alpha=0.3)
xlabel('Years post-enrollment')
if axi==1:
ylabel('Probability of being TB disease free\nduring the follow-up period')
title(label)
ylim((0.9,1))
xlim((0,2))
axh.legend_.remove()
axi += 1
In [96]:
figure(figsize=(17,5))
myboxplot(resDf['COR- Rate'],x=0)
myboxplot(resDf['COR+, Non-treated Rate'],x=1)
myboxplot(resDf['COR+, Treated Rate'],x=2)
xlim((-0.5,2.5))
xticks([0,1,2],['COR-','COR+, Non-treated','COR+, Treated'])
xlabel('TB disease rate\n(cumulative incidence, ignoring censoring)')
figure(figsize=(17,5))
subplot(1,3,1)
myboxplot((resDf['COR+, Treated Rate']-resDf['COR+, Non-treated Rate'])*params['pi0'],x=0)
xlabel('SnT - SOC\nRate Difference')
subplot(1,3,2)
myboxplot(resDf['RR COR+/- SOC'],x=0)
xticks()
xlabel('RRCOR+;COR-(t|SOC)')
subplot(1,3,3)
ratioNumer = resDf['RR COR+/- SOC']*resDf['COR+, Treated Rate']/resDf['COR+, Non-treated Rate'] + (1-params['pi0'])/params['pi0']
ratioDenom = resDf['RR COR+/- SOC'] + (1-params['pi0'])/params['pi0']
myboxplot(ratioNumer/ratioDenom,x=0)
xticks()
xlabel('SnT : SOC\nRate Ratio')
Out[96]:
In [81]:
figure(figsize=(17,5))
for x,(col,label) in enumerate([('PH','Cox PH'),('CI','Cumulative incidence')]):
subplot(1,2,x+1)
plot([-1,7],[0,0],'--',color='gray',lw=2)
myboxplot(resDf[col+'_lower'],x=0)
myboxplot(resDf[col+'_est'],x=1)
myboxplot(resDf[col+'_upper'],x=2)
ylim((-0.6,1))
xlim((-1,3))
title(label)
xticks([0,1,2],['Lower\n95% CI','Est.','Upper\n95% CI'])
if x==0:
ylabel('Treatment efficacy\nin COR+ group')
In [80]:
pFunc = lambda p: -10*log10(p)
figure(figsize=(12,8))
plot([-1,3],[pFunc(0.05)]*2,'--',color='gray',lw=2)
for x,col in enumerate(['logrank','PH','CI']):
myboxplot(resDf[col+'_pvalue'].map(pFunc),x=x)
xlim((-1,3))
ylim((-5,60))
xticks([0,1,2],['Log-rank','Cox PH',"Fisher's"])
ylabel('P-value [-10 * log10(p)]')
Out[80]:
In [87]:
"""Look at the distributions of dx"""
edges = linspace(0,2,50)
figure(figsize=(12,5))
for label,ind in subsets.items():
lineHist(allDf.dx2.loc[ind & (allDf.disease2==1)],
edges=edges,
density=True,
color=colors[label],
label=label+' (N = %d)' % (ind & (allDf.disease2==1)).sum())
legend(loc=0)
Out[87]: