In [1]:
%pylab inline
import pandas as pd
import pygraphviz as pgz
from IPython.display import Image
from IPython.display import HTML
from scipy import stats
from lifelines.estimation import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines import CoxPHFitter
import statsmodels.api as sm
from myfisher import *
from utilHelpers import lineHist
from scipy import integrate
from functools import partial
from myboxplot import *

from arsampling import rejectionSampling

pylab.rc('font', family='sans-serif', size=15)
pd.set_option('display.max_colwidth', -1)


Populating the interactive namespace from numpy and matplotlib
Using Cython-powered Fisher's exact test

In [2]:
Image('images/enrollment_diagram-01.png',width = 700)


Out[2]:

In [83]:
"""Define parameter values"""
tmp =    [('N',10000,"Total participants enrolled"),
          ('lostRate', 0.1,"Rate of participants lost to followup (per year)"),
          ('K', 2, "Randomization ratio of COR+ to treatment vs. placebo"),
          ('pi0', 0.2, "Prevalence of COR+"),
          ('Q', 0.9, "Prevalence of QFT+ (independent of COR for now...)"),
          ('F', 0.5, "Fraction of COR- that are followed to the endpoint"),
          ('tbRate', 0.008, "Rate of developing TB disease (TB per year in COR-)"),
          ('RRcor', 4.0, "Max RR for COR+ at time of diagnostic"),
          ('RRtmt', 2.0, "Reduction in RR for COR+ treated vs. untreated"),
          ('discretize',False,'Indicator of whether event times should be discretized by visits'),
          ('visits', 'monthly', "Vector of times in years of follow-up visits (for discretizing endpoints)"),
          ('RRdecay', 1, "Decay time constant for RRcor")]
paramsDf = pd.DataFrame(tmp,columns = ['Var','Value','Description'])
html = paramsDf.to_html(index=False)
params = paramsDf.set_index('Var').Value
if params['visits'] == 'monthly':
    params['visits'] = (1/12)*arange(24)
HTML(html)


Out[83]:
Var Value Description
N 10000 Total participants enrolled
lostRate 0.1 Rate of participants lost to followup (per year)
K 2 Randomization ratio of COR+ to treatment vs. placebo
pi0 0.2 Prevalence of COR+
Q 0.9 Prevalence of QFT+ (independent of COR for now...)
F 0.5 Fraction of COR- that are followed to the endpoint
tbRate 0.008 Rate of developing TB disease (TB per year in COR-)
RRcor 4 Max RR for COR+ at time of diagnostic
RRtmt 2 Reduction in RR for COR+ treated vs. untreated
discretize False Indicator of whether event times should be discretized by visits
visits monthly Vector of times in years of follow-up visits (for discretizing endpoints)
RRdecay 1 Decay time constant for RRcor

In [4]:
def initializeDf(params):
    """Generate a full cohort of participants based on randomization parameters and COR+ prevelence,
    in preparation for a single simulation."""
    
    N,K,pi0,Q = params['N'],params['K'],params['pi0'],params['Q']
    
    df = pd.DataFrame({'ptid':arange(N),
                       'QFT':zeros(N),
                       'COR':zeros(N),
                       'follow':ones(N),
                       'treated':zeros(N)}).set_index('ptid')
    
    """Set COR positivity for whole cohort"""    
    df['COR'] = (rand(N)<pi0).astype(int)
    
    """Set QFT positivity for whole cohort"""
    """For now it is independent of COR and is not used"""
    df['QFT'] = (rand(N)<Q).astype(int)
    
    """Only a fraction of the COR- will be followed."""
    df.loc[df.COR==0,'follow'] = (rand((df.COR==0).sum()) < params['F']).astype(int)
    
    """COR+ arm is randomized to treatment/placebo in K:1 ratio"""
    ind = find(df.COR==1)
    nTreated = int(round(len(ind)*(1-1/(K+1))))
    df.loc[ind[:nTreated],'treated'] = 1
    
    return df

In [5]:
df = initializeDf(params)
cols = ['COR','follow','treated']
print df[cols].groupby(cols).agg(len)


COR  follow  treated
0    0       0          3987
     1       0          3987
1    1       0          675
             1          1351
dtype: int64

In [6]:
"""Time-varying (tv) function for RR parameter (RR has exp decay)"""
RRCORFunc = lambda RR,decay,t: (RR-1)*exp(-t/decay) + 1

"""PDF for exponential distribution"""
expPDF = lambda lam,t: lam*exp(-lam*t)

"""RV for exponential PDF"""
expRV = lambda lam0,n: -log(rand(n)) / lam0

"""Density function (not PDF) with time-varying lambda

    For AR sampling to work with the exponential envelope function:
        (1) RRmax >= 1
        (2) RRdecay << 1/lam0
            (I think. Could probably eliminate this by forcing envelope
            to have longer/slower decay than target)"""
   
tvDF = lambda lam0,RR,decay,t: expPDF(lam0*RRCORFunc(RR,decay,t),t)

def endpointRV(QFT,COR,treated,n,params):
    """Simulates a group of participants time-to-TB endpoint based on model parameters and participant traits.
    
    Parameters
    ----------
    QFT : int
        Indicator of QuantiFERON positivity
        (not used currently)
    COR : int
        Indicator of COR positivity
    treated : int
        Indicator of treatment assignment
    n : int
        Number of endpoints to generate.
    params : dict
        Contains relevant model parameters including: tbRate, visits, RRcor, RRdecay, RRtmt, lostRate
    
        
    Returns
    -------
    censored : array, dtype=int
        Indicator of right censoring from lost to follow-up
    endpoint : array, dtype=float64
        Time in years of endpoints, possibly discretized by visits"""
 
    if COR==1:
        """All COR+ will have a time-varying RR determined by RRmax and RRdecay"""
        if treated==1:
            """RRmax will be less by a factor of RRtmt for treated COR+ participants"""
            RRmax =  params['RRcor'] * (1/params['RRtmt'])
        elif treated==0:
            RRmax =  params['RRcor']
        
        """Numerical integration to compute normalizing constant for these parameters"""
        ic,err = integrate.quad(partial(tvDF,params['tbRate'],RRmax,params['RRdecay']),0,inf)
        #print "Normalizing constant (upper bound on error): %1.3f (%1.3g)" % (ic,err)

        """Normalized PDF with a time-varying lambda parameter"""
        tvPDF = lambda t: partial(tvDF,params['tbRate'],RRmax,params['RRdecay'])(t)/ic

        """Scaled envelope function such that envPDF >= targetPDF"""
        envPDF = lambda t: (tvPDF(0)/expPDF(params['tbRate'],0)) * expPDF(params['tbRate'],t)

        T,acc = rejectionSampling(envPDF = envPDF,
                                        envRV = partial(expRV,params['tbRate']),
                                        targetPDF = tvPDF,
                                        n = n)
    elif COR==0:
        """Generate random exponential variate based on rate parameter"""
        T = expRV(params['tbRate'],n)
    
    """Generate lost-to-followup indicators"""
    Tlost = expRV(params['lostRate'],n)
    
    censored = zeros(n)
    censored[Tlost<T] = 1
    T[Tlost<T] = Tlost[Tlost<T]
    
    if params['discretize']:
        """Discretize T based on visits:"""
        
        """Last visit before lost"""
        T[censored==1] = [argmax(find(params['visits'] < t)) for t in T[censored==1]]
        
        """First visit after endpoint"""
        T[censored==0] = [argmax(find(params['visits'] > t)) for t in T[censored==0]]

    return censored, T

In [7]:
def simulateDisease(df,followup1,followup2,params):
    """Simulate endpoints for all participants in the cohort,
    during the specified followup periods and according to params"""
    
    """QFT parameter is not used currently"""
    QFT = nan
    
    """Numpy arrays for temporarily storing all generated data."""
    disease = nan*ones((df.shape[0],2))
    dx = nan*zeros((df.shape[0],2))
    
    for COR,treated in [(0,0), (1,0), (1,1)]:
        ind = ((df.COR==COR) & (df.treated==treated) & (df.follow==1)).values

        censored, T = endpointRV(QFT,COR,treated,ind.sum(),params)
        
        """Some data will be censored due to lost-to-followup"""
        disease[ind,0] = 1 - censored
        disease[ind,1] = 1 - censored
        
        dx[ind,0] = T
        dx[ind,1] = T
        
        """Additionally, right-censor dx based on length of follow-up""" 
        disease[ind & (dx[:,0] > followup1), 0] = 0
        disease[ind & (dx[:,1] > followup2), 1] = 0
        
        dx[ind & (dx[:,0] > followup1), 0] = followup1
        dx[ind & (dx[:,1] > followup2), 1] = followup2

    
    """Copy data into the DataFrame"""
    df['disease1'] = disease[:,0]
    df['disease2'] = disease[:,1]
    df['dx1'] = dx[:,0]
    df['dx2'] = dx[:,1]
    return df

In [8]:
%time df = simulateDisease(df, followup1=1, followup2=2, params=params)

cols = ['COR','follow','treated']
print df[cols+['disease1','disease2']].groupby(cols).agg(sum)


Wall time: 12 ms
                    disease1  disease2
COR follow treated                    
0   0      0       NaN       NaN      
    1      0        38        67      
1   1      0        8         12      
           1        13        21      

Figure 1

Plot of KM curves for treated and non-treated COR+ participants, after one year of follow-up.


In [84]:
"""After the first year, compare the TB rates in COR+ treated vs. non-treated"""
kmf = KaplanMeierFitter()

figure(figsize=(12,5))
ax = subplot(111)

for tmti,tmt in enumerate(['Non-treated','Treated']):
    ind = (df.COR==1) & (df.treated==tmti)
    kmf.fit(durations = df.dx1[ind], event_observed = df.disease1[ind],label=tmt)
    kmf.plot(ax=ax, show_censors=True)
ylabel('Probability of being TB disease free\nduring the follow-up period')
xlabel('Duration (years)')

summary, pvalue, test_results = logrank_test(df.dx1[(df.COR==1) & (df.treated==0)],
                                             df.dx1[(df.COR==1) & (df.treated==1)],
                                             df.disease1[(df.COR==1) & (df.treated==0)],
                                             df.disease1[(df.COR==1) & (df.treated==1)],
                                             suppress_print = True)
annotate(xy = (0.5,1), s='logrank p = %1.2g' % pvalue, xycoords = 'axes fraction',size ='x-large', ha = 'center',va='top')


Out[84]:
<matplotlib.text.Annotation at 0x30eaa710>

Figure 2

Plot of KM curves for treated and non-treated COR+ participants, after two years of follow-up.


In [85]:
"""After the second year, compare the TB rates in COR+ treated vs. non-treated"""
kmf = KaplanMeierFitter()

figure(figsize=(12,5))
ax = subplot(111)

for tmti,tmt in enumerate(['Non-treated','Treated']):
    ind = (df.COR==1) & (df.treated==tmti)
    kmf.fit(durations = df.dx1[ind], event_observed = df.disease1[ind],label=tmt)
    kmf.plot(ax=ax, show_censors=True)
ylabel('Probability of being TB disease free\nduring the follow-up period')
xlabel('Duration (years)')

summary, pvalue, test_results = logrank_test(df.dx2[(df.COR==1) & (df.treated==0)],
                                             df.dx2[(df.COR==1) & (df.treated==1)],
                                             df.disease2[(df.COR==1) & (df.treated==0)],
                                             df.disease2[(df.COR==1) & (df.treated==1)],
                                             suppress_print = True)
                                             
annotate(xy = (0.5,1), s='logrank p = %1.2g' % pvalue, xycoords = 'axes fraction',size ='x-large', ha = 'center',va='top')


Out[85]:
<matplotlib.text.Annotation at 0x31d64828>

In [48]:
def estTECI(df, treatment_col='treated', event_col='disease2'):
    """Estimates treatment efficacy using cumulative incidence/attack rate.
    
    TE = 1 - RR
    
    RR = (c1/N1) / (c0/N0)
    
    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.
    
    Returns
    -------
    est : float
        Estimate of vaccine efficacy
    ci : vector, length 2
        95% confidence interval, [LL, UL]
    pvalue : float
        P-value for H0: TE=0 from Fisher's Exact test"""
    
    a = ((df[treatment_col]==1) & (df[event_col]==1)).sum()
    b = ((df[treatment_col]==1) & (df[event_col]==0)).sum()
    c = ((df[treatment_col]==0) & (df[event_col]==1)).sum()
    d = ((df[treatment_col]==0) & (df[event_col]==0)).sum()
    rr = (a/(a+b)) / (c/(c+d))

    te = 1 - rr
    se = sqrt(b/(a*(a+b))+ d/(c*(c+d)))
    ci = 1 - exp(array([log(rr)+se*1.96, log(rr)-se*1.96]))
    """Use the two-sided p-value from a Fisher's Exact test for now, although I think there are better ways..."""
    pvalue = fisherTest([[a,b],[c,d]])[1]
    
    return te,ci,pvalue

def estTEPH(df, treatment_col='treated', duration_col='dx2', event_col='disease2',covars=[]):
    """Estimates treatment efficacy using proportional hazards (Cox model).
    
    Parameters
    ----------
    df : pandas.DataFrame
    
    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.
    
    Returns
    -------
    est : float
        Estimate of vaccine efficacy
    ci : vector, length 2
        95% confidence interval, [LL, UL]
    pvalue : float
        P-value for H0: VE=0"""
    
    coxphf = CoxPHFitter()
    
    coxphf.fit(df[[treatment_col, duration_col, event_col]+covars],duration_col = duration_col,event_col = event_col)
    
    te = 1-exp(coxphf.hazards_.loc['coef',treatment_col])
    ci = 1-exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound','lower-bound']])
    pvalue = coxphf._compute_p_values()[0]
    return te,ci,pvalue

Run simulation 100 times


In [91]:
def runKSimulations(k=100, duration_col='dx2', event_col='disease2'):
    """Run K simulations and return in one big df
    
    Returns
    -------
    allDf : pd.DataFrame 
        Contains all data sets with added column simi."""
    
    allDf = None
    for simi in range(k):
        df = initializeDf(params)
        df = simulateDisease(df, followup1=1, followup2=2, params=params)
        df['simi'] = simi
        if allDf is None:
            allDf = df
        else:
            allDf = pd.concat((allDf,df))

    return allDf

def analyzeSimulations(allDf,duration_col='dx2', event_col='disease2'):
    """Compute summary stats for all simulations in allDf
    
    Returns
    -------
    resDf : pd.DataFrame
        Contains results of each simulation."""
    
    summD = dict(CI_est = [],
                 CI_lower = [],
                 CI_upper = [],
                 CI_pvalue = [],
                 PH_est = [],
                 PH_lower = [],
                 PH_upper = [],
                 PH_pvalue = [],
                 logrank_pvalue = [],
                 pi0 = [])
    summD['COR+, Treated Rate'] = []
    summD['COR- Rate'] = []
    summD['COR+, Non-treated Rate'] = []

    for simi in range(allDf.simi.max()):
        """Estimate pi0 based on all screened participants"""
        summD['pi0'] = ((allDf.simi==simi) & (allDf.COR==1)).sum()/(allDf.simi==simi).sum()
        
        ind = (allDf.simi==simi) & (allDf.follow==1) & (allDf.COR==1)
        df = allDf.loc[ind]
        
        summary, pvalue, test_results = logrank_test(df.dx2[df.treated==0],
                                                     df.dx2[df.treated==1],
                                                     df.disease2[df.treated==0],
                                                     df.disease2[df.treated==1],
                                                     suppress_print = True)
        summD['logrank_pvalue'].append(pvalue)

        te,ci,pvalue = estTEPH(df, treatment_col='treated', duration_col=duration_col, event_col=event_col)
        summD['PH_est'].append(te)
        summD['PH_lower'].append(ci[0])
        summD['PH_upper'].append(ci[1])
        summD['PH_pvalue'].append(pvalue)

        te,ci,pvalue = estTECI(df, treatment_col='treated', event_col=event_col)
        summD['CI_est'].append(te)
        summD['CI_lower'].append(ci[0])
        summD['CI_upper'].append(ci[1])
        summD['CI_pvalue'].append(pvalue)
        
        a = ((df.treated==1) & (df[event_col]==1)).sum()
        b = ((df.treated==1) & (df[event_col]==0)).sum()
        c = ((df.treated==0) & (df[event_col]==1)).sum()
        d = ((df.treated==0) & (df[event_col]==0)).sum()
        summD['COR+, Treated Rate'].append(a/(a+b))
        summD['COR+, Non-treated Rate'].append(c/(c+d))

        ind = (allDf.simi==simi) & (allDf.follow==1) & (allDf.COR==0)
        df = allDf.loc[ind]
        a = (df[event_col]==1).sum()
        b = (df[event_col]==0).sum()
        summD['COR- Rate'].append(a/(a+b))
    summDf = pd.DataFrame(summD)
    summDf['RR COR+/- SOC'] = summDf['COR+, Non-treated Rate']/summD['COR- Rate']
     
    summDf['theta'] = nan #TODO
    summDf['Rate diff'] = nan #TODO
    summDf['Rate ratio'] = nan #TODO
    return summDf

In [92]:
#%time allDf, resDf = runKSimulations(100)
#allDf.to_csv(DATA_PATH + 'TB_ST_COR/sim_100_Dec16.csv')
#allDf = pd.read_csv(DATA_PATH + 'TB_ST_COR/sim_100_Dec16.csv')
resDf = analyzeSimulations(allDf)

Figure 3

Accumulation of endpoints, over K simulations


In [86]:
subsets = {'COR+, Treated' : (allDf.COR==1) & (allDf.follow==1) & (allDf.treated==1),
           'COR+, Non-treated' : (allDf.COR==1) & (allDf.follow==1) & (allDf.treated==0),
           'COR-' : (allDf.COR==0) & (allDf.follow==1)}
colors = {'COR+, Treated' : 'tomato',
           'COR+, Non-treated' : 'slateblue',
           'COR-' : 'seagreen'}

kmf = KaplanMeierFitter()

figure(figsize=(17,8))
axi = 1
for label,ind in subsets.items():
    axh = subplot(1,3,axi)
    for simi in range(allDf.simi.max()):
        kmf.fit(durations = allDf.dx2[(allDf.simi==simi) & ind], event_observed = allDf.disease2[(allDf.simi==simi) & ind])
        kmf.plot(ax=axh, ci_show=False, show_censors=False,color = colors[label],alpha=0.3)
    xlabel('Years post-enrollment')
    if axi==1:
        ylabel('Probability of being TB disease free\nduring the follow-up period')
    title(label)
    ylim((0.9,1))
    xlim((0,2))
    axh.legend_.remove()
    axi += 1



In [96]:
figure(figsize=(17,5))
myboxplot(resDf['COR- Rate'],x=0)
myboxplot(resDf['COR+, Non-treated Rate'],x=1)
myboxplot(resDf['COR+, Treated Rate'],x=2)
xlim((-0.5,2.5))
xticks([0,1,2],['COR-','COR+, Non-treated','COR+, Treated'])
xlabel('TB disease rate\n(cumulative incidence, ignoring censoring)')

figure(figsize=(17,5))
subplot(1,3,1)
myboxplot((resDf['COR+, Treated Rate']-resDf['COR+, Non-treated Rate'])*params['pi0'],x=0)
xlabel('SnT - SOC\nRate Difference')
subplot(1,3,2)
myboxplot(resDf['RR COR+/- SOC'],x=0)
xticks()
xlabel('RRCOR+;COR-(t|SOC)')
subplot(1,3,3)
ratioNumer = resDf['RR COR+/- SOC']*resDf['COR+, Treated Rate']/resDf['COR+, Non-treated Rate'] + (1-params['pi0'])/params['pi0']
ratioDenom = resDf['RR COR+/- SOC'] + (1-params['pi0'])/params['pi0']
myboxplot(ratioNumer/ratioDenom,x=0)
xticks()
xlabel('SnT : SOC\nRate Ratio')


Out[96]:
<matplotlib.text.Text at 0x19ab4668>

In [81]:
figure(figsize=(17,5))
for x,(col,label) in enumerate([('PH','Cox PH'),('CI','Cumulative incidence')]):
    subplot(1,2,x+1)
    plot([-1,7],[0,0],'--',color='gray',lw=2)
    myboxplot(resDf[col+'_lower'],x=0)
    myboxplot(resDf[col+'_est'],x=1)
    myboxplot(resDf[col+'_upper'],x=2)
    ylim((-0.6,1))
    xlim((-1,3))
    title(label)
    xticks([0,1,2],['Lower\n95% CI','Est.','Upper\n95% CI'])
    if x==0:
        ylabel('Treatment efficacy\nin COR+ group')



In [80]:
pFunc = lambda p: -10*log10(p)

figure(figsize=(12,8))
plot([-1,3],[pFunc(0.05)]*2,'--',color='gray',lw=2)
for x,col in enumerate(['logrank','PH','CI']):
    myboxplot(resDf[col+'_pvalue'].map(pFunc),x=x)
xlim((-1,3))
ylim((-5,60))
xticks([0,1,2],['Log-rank','Cox PH',"Fisher's"])
ylabel('P-value [-10 * log10(p)]')


Out[80]:
<matplotlib.text.Text at 0x2a38b780>

In [87]:
"""Look at the distributions of dx"""
edges = linspace(0,2,50)
figure(figsize=(12,5))
for label,ind in subsets.items():
    lineHist(allDf.dx2.loc[ind & (allDf.disease2==1)],
             edges=edges,
             density=True,
             color=colors[label],
             label=label+' (N = %d)' % (ind & (allDf.disease2==1)).sum())
legend(loc=0)


Out[87]:
<matplotlib.legend.Legend at 0x2ef3d1d0>