notebook.community

Edit and run



In [1]:

    
%pylab inline
import pandas as pd
import pygraphviz as pgz
from IPython.display import Image
from IPython.display import HTML
from scipy import stats
from lifelines.estimation import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines import CoxPHFitter
import statsmodels.api as sm
from myfisher import *
from utilHelpers import lineHist
from scipy import integrate
from functools import partial
from myboxplot import *

from arsampling import rejectionSampling

pylab.rc('font', family='sans-serif', size=15)
pd.set_option('display.max_colwidth', -1)









    



Populating the interactive namespace from numpy and matplotlib
Using Cython-powered Fisher's exact test



In [2]:

    
Image('images/enrollment_diagram-01.png',width = 700)









    Out[2]:



In [83]:

    
"""Define parameter values"""
tmp =    [('N',10000,"Total participants enrolled"),
          ('lostRate', 0.1,"Rate of participants lost to followup (per year)"),
          ('K', 2, "Randomization ratio of COR+ to treatment vs. placebo"),
          ('pi0', 0.2, "Prevalence of COR+"),
          ('Q', 0.9, "Prevalence of QFT+ (independent of COR for now...)"),
          ('F', 0.5, "Fraction of COR- that are followed to the endpoint"),
          ('tbRate', 0.008, "Rate of developing TB disease (TB per year in COR-)"),
          ('RRcor', 4.0, "Max RR for COR+ at time of diagnostic"),
          ('RRtmt', 2.0, "Reduction in RR for COR+ treated vs. untreated"),
          ('discretize',False,'Indicator of whether event times should be discretized by visits'),
          ('visits', 'monthly', "Vector of times in years of follow-up visits (for discretizing endpoints)"),
          ('RRdecay', 1, "Decay time constant for RRcor")]
paramsDf = pd.DataFrame(tmp,columns = ['Var','Value','Description'])
html = paramsDf.to_html(index=False)
params = paramsDf.set_index('Var').Value
if params['visits'] == 'monthly':
    params['visits'] = (1/12)*arange(24)
HTML(html)









    Out[83]:





  
    
      Var
      Value
      Description
    
  
  
    
       N
       10000
       Total participants enrolled
    
    
       lostRate
       0.1
       Rate of participants lost to followup (per year)
    
    
       K
       2
       Randomization ratio of COR+ to treatment vs. placebo
    
    
       pi0
       0.2
       Prevalence of COR+
    
    
       Q
       0.9
       Prevalence of QFT+ (independent of COR for now...)
    
    
       F
       0.5
       Fraction of COR- that are followed to the endpoint
    
    
       tbRate
       0.008
       Rate of developing TB disease (TB per year in COR-)
    
    
       RRcor
       4
       Max RR for COR+ at time of diagnostic
    
    
       RRtmt
       2
       Reduction in RR for COR+ treated vs. untreated
    
    
       discretize
       False
       Indicator of whether event times should be discretized by visits
    
    
       visits
       monthly
       Vector of times in years of follow-up visits (for discretizing endpoints)
    
    
       RRdecay
       1
       Decay time constant for RRcor



In [4]:

    
def initializeDf(params):
    """Generate a full cohort of participants based on randomization parameters and COR+ prevelence,
    in preparation for a single simulation."""
    
    N,K,pi0,Q = params['N'],params['K'],params['pi0'],params['Q']
    
    df = pd.DataFrame({'ptid':arange(N),
                       'QFT':zeros(N),
                       'COR':zeros(N),
                       'follow':ones(N),
                       'treated':zeros(N)}).set_index('ptid')
    
    """Set COR positivity for whole cohort"""    
    df['COR'] = (rand(N)<pi0).astype(int)
    
    """Set QFT positivity for whole cohort"""
    """For now it is independent of COR and is not used"""
    df['QFT'] = (rand(N)<Q).astype(int)
    
    """Only a fraction of the COR- will be followed."""
    df.loc[df.COR==0,'follow'] = (rand((df.COR==0).sum()) < params['F']).astype(int)
    
    """COR+ arm is randomized to treatment/placebo in K:1 ratio"""
    ind = find(df.COR==1)
    nTreated = int(round(len(ind)*(1-1/(K+1))))
    df.loc[ind[:nTreated],'treated'] = 1
    
    return df



In [5]:

    
df = initializeDf(params)
cols = ['COR','follow','treated']
print df[cols].groupby(cols).agg(len)









    



COR  follow  treated
0    0       0          3987
     1       0          3987
1    1       0          675
             1          1351
dtype: int64



In [6]:

    
"""Time-varying (tv) function for RR parameter (RR has exp decay)"""
RRCORFunc = lambda RR,decay,t: (RR-1)*exp(-t/decay) + 1

"""PDF for exponential distribution"""
expPDF = lambda lam,t: lam*exp(-lam*t)

"""RV for exponential PDF"""
expRV = lambda lam0,n: -log(rand(n)) / lam0

"""Density function (not PDF) with time-varying lambda

    For AR sampling to work with the exponential envelope function:
        (1) RRmax >= 1
        (2) RRdecay << 1/lam0
            (I think. Could probably eliminate this by forcing envelope
            to have longer/slower decay than target)"""
   
tvDF = lambda lam0,RR,decay,t: expPDF(lam0*RRCORFunc(RR,decay,t),t)

def endpointRV(QFT,COR,treated,n,params):
    """Simulates a group of participants time-to-TB endpoint based on model parameters and participant traits.
    
    Parameters
    ----------
    QFT : int
        Indicator of QuantiFERON positivity
        (not used currently)
    COR : int
        Indicator of COR positivity
    treated : int
        Indicator of treatment assignment
    n : int
        Number of endpoints to generate.
    params : dict
        Contains relevant model parameters including: tbRate, visits, RRcor, RRdecay, RRtmt, lostRate
    
        
    Returns
    -------
    censored : array, dtype=int
        Indicator of right censoring from lost to follow-up
    endpoint : array, dtype=float64
        Time in years of endpoints, possibly discretized by visits"""
 
    if COR==1:
        """All COR+ will have a time-varying RR determined by RRmax and RRdecay"""
        if treated==1:
            """RRmax will be less by a factor of RRtmt for treated COR+ participants"""
            RRmax =  params['RRcor'] * (1/params['RRtmt'])
        elif treated==0:
            RRmax =  params['RRcor']
        
        """Numerical integration to compute normalizing constant for these parameters"""
        ic,err = integrate.quad(partial(tvDF,params['tbRate'],RRmax,params['RRdecay']),0,inf)
        #print "Normalizing constant (upper bound on error): %1.3f (%1.3g)" % (ic,err)

        """Normalized PDF with a time-varying lambda parameter"""
        tvPDF = lambda t: partial(tvDF,params['tbRate'],RRmax,params['RRdecay'])(t)/ic

        """Scaled envelope function such that envPDF >= targetPDF"""
        envPDF = lambda t: (tvPDF(0)/expPDF(params['tbRate'],0)) * expPDF(params['tbRate'],t)

        T,acc = rejectionSampling(envPDF = envPDF,
                                        envRV = partial(expRV,params['tbRate']),
                                        targetPDF = tvPDF,
                                        n = n)
    elif COR==0:
        """Generate random exponential variate based on rate parameter"""
        T = expRV(params['tbRate'],n)
    
    """Generate lost-to-followup indicators"""
    Tlost = expRV(params['lostRate'],n)
    
    censored = zeros(n)
    censored[Tlost<T] = 1
    T[Tlost<T] = Tlost[Tlost<T]
    
    if params['discretize']:
        """Discretize T based on visits:"""
        
        """Last visit before lost"""
        T[censored==1] = [argmax(find(params['visits'] < t)) for t in T[censored==1]]
        
        """First visit after endpoint"""
        T[censored==0] = [argmax(find(params['visits'] > t)) for t in T[censored==0]]

    return censored, T



In [7]:

    
def simulateDisease(df,followup1,followup2,params):
    """Simulate endpoints for all participants in the cohort,
    during the specified followup periods and according to params"""
    
    """QFT parameter is not used currently"""
    QFT = nan
    
    """Numpy arrays for temporarily storing all generated data."""
    disease = nan*ones((df.shape[0],2))
    dx = nan*zeros((df.shape[0],2))
    
    for COR,treated in [(0,0), (1,0), (1,1)]:
        ind = ((df.COR==COR) & (df.treated==treated) & (df.follow==1)).values

        censored, T = endpointRV(QFT,COR,treated,ind.sum(),params)
        
        """Some data will be censored due to lost-to-followup"""
        disease[ind,0] = 1 - censored
        disease[ind,1] = 1 - censored
        
        dx[ind,0] = T
        dx[ind,1] = T
        
        """Additionally, right-censor dx based on length of follow-up""" 
        disease[ind & (dx[:,0] > followup1), 0] = 0
        disease[ind & (dx[:,1] > followup2), 1] = 0
        
        dx[ind & (dx[:,0] > followup1), 0] = followup1
        dx[ind & (dx[:,1] > followup2), 1] = followup2

    
    """Copy data into the DataFrame"""
    df['disease1'] = disease[:,0]
    df['disease2'] = disease[:,1]
    df['dx1'] = dx[:,0]
    df['dx2'] = dx[:,1]
    return df



In [8]:

    
%time df = simulateDisease(df, followup1=1, followup2=2, params=params)

cols = ['COR','follow','treated']
print df[cols+['disease1','disease2']].groupby(cols).agg(sum)









    



Wall time: 12 ms
                    disease1  disease2
COR follow treated                    
0   0      0       NaN       NaN      
    1      0        38        67      
1   1      0        8         12      
           1        13        21

Figure 1

Plot of KM curves for treated and non-treated COR+ participants, after one year of follow-up.



In [84]:

    
"""After the first year, compare the TB rates in COR+ treated vs. non-treated"""
kmf = KaplanMeierFitter()

figure(figsize=(12,5))
ax = subplot(111)

for tmti,tmt in enumerate(['Non-treated','Treated']):
    ind = (df.COR==1) & (df.treated==tmti)
    kmf.fit(durations = df.dx1[ind], event_observed = df.disease1[ind],label=tmt)
    kmf.plot(ax=ax, show_censors=True)
ylabel('Probability of being TB disease free\nduring the follow-up period')
xlabel('Duration (years)')

summary, pvalue, test_results = logrank_test(df.dx1[(df.COR==1) & (df.treated==0)],
                                             df.dx1[(df.COR==1) & (df.treated==1)],
                                             df.disease1[(df.COR==1) & (df.treated==0)],
                                             df.disease1[(df.COR==1) & (df.treated==1)],
                                             suppress_print = True)
annotate(xy = (0.5,1), s='logrank p = %1.2g' % pvalue, xycoords = 'axes fraction',size ='x-large', ha = 'center',va='top')









    Out[84]:





<matplotlib.text.Annotation at 0x30eaa710>

Figure 2

Plot of KM curves for treated and non-treated COR+ participants, after two years of follow-up.



In [85]:

    
"""After the second year, compare the TB rates in COR+ treated vs. non-treated"""
kmf = KaplanMeierFitter()

figure(figsize=(12,5))
ax = subplot(111)

for tmti,tmt in enumerate(['Non-treated','Treated']):
    ind = (df.COR==1) & (df.treated==tmti)
    kmf.fit(durations = df.dx1[ind], event_observed = df.disease1[ind],label=tmt)
    kmf.plot(ax=ax, show_censors=True)
ylabel('Probability of being TB disease free\nduring the follow-up period')
xlabel('Duration (years)')

summary, pvalue, test_results = logrank_test(df.dx2[(df.COR==1) & (df.treated==0)],
                                             df.dx2[(df.COR==1) & (df.treated==1)],
                                             df.disease2[(df.COR==1) & (df.treated==0)],
                                             df.disease2[(df.COR==1) & (df.treated==1)],
                                             suppress_print = True)
                                             
annotate(xy = (0.5,1), s='logrank p = %1.2g' % pvalue, xycoords = 'axes fraction',size ='x-large', ha = 'center',va='top')









    Out[85]:





<matplotlib.text.Annotation at 0x31d64828>



In [48]:

    
def estTECI(df, treatment_col='treated', event_col='disease2'):
    """Estimates treatment efficacy using cumulative incidence/attack rate.
    
    TE = 1 - RR
    
    RR = (c1/N1) / (c0/N0)
    
    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.
    
    Returns
    -------
    est : float
        Estimate of vaccine efficacy
    ci : vector, length 2
        95% confidence interval, [LL, UL]
    pvalue : float
        P-value for H0: TE=0 from Fisher's Exact test"""
    
    a = ((df[treatment_col]==1) & (df[event_col]==1)).sum()
    b = ((df[treatment_col]==1) & (df[event_col]==0)).sum()
    c = ((df[treatment_col]==0) & (df[event_col]==1)).sum()
    d = ((df[treatment_col]==0) & (df[event_col]==0)).sum()
    rr = (a/(a+b)) / (c/(c+d))

    te = 1 - rr
    se = sqrt(b/(a*(a+b))+ d/(c*(c+d)))
    ci = 1 - exp(array([log(rr)+se*1.96, log(rr)-se*1.96]))
    """Use the two-sided p-value from a Fisher's Exact test for now, although I think there are better ways..."""
    pvalue = fisherTest([[a,b],[c,d]])[1]
    
    return te,ci,pvalue

def estTEPH(df, treatment_col='treated', duration_col='dx2', event_col='disease2',covars=[]):
    """Estimates treatment efficacy using proportional hazards (Cox model).
    
    Parameters
    ----------
    df : pandas.DataFrame
    
    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.
    
    Returns
    -------
    est : float
        Estimate of vaccine efficacy
    ci : vector, length 2
        95% confidence interval, [LL, UL]
    pvalue : float
        P-value for H0: VE=0"""
    
    coxphf = CoxPHFitter()
    
    coxphf.fit(df[[treatment_col, duration_col, event_col]+covars],duration_col = duration_col,event_col = event_col)
    
    te = 1-exp(coxphf.hazards_.loc['coef',treatment_col])
    ci = 1-exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound','lower-bound']])
    pvalue = coxphf._compute_p_values()[0]
    return te,ci,pvalue

Run simulation 100 times



In [91]:

    
def runKSimulations(k=100, duration_col='dx2', event_col='disease2'):
    """Run K simulations and return in one big df
    
    Returns
    -------
    allDf : pd.DataFrame 
        Contains all data sets with added column simi."""
    
    allDf = None
    for simi in range(k):
        df = initializeDf(params)
        df = simulateDisease(df, followup1=1, followup2=2, params=params)
        df['simi'] = simi
        if allDf is None:
            allDf = df
        else:
            allDf = pd.concat((allDf,df))

    return allDf

def analyzeSimulations(allDf,duration_col='dx2', event_col='disease2'):
    """Compute summary stats for all simulations in allDf
    
    Returns
    -------
    resDf : pd.DataFrame
        Contains results of each simulation."""
    
    summD = dict(CI_est = [],
                 CI_lower = [],
                 CI_upper = [],
                 CI_pvalue = [],
                 PH_est = [],
                 PH_lower = [],
                 PH_upper = [],
                 PH_pvalue = [],
                 logrank_pvalue = [],
                 pi0 = [])
    summD['COR+, Treated Rate'] = []
    summD['COR- Rate'] = []
    summD['COR+, Non-treated Rate'] = []

    for simi in range(allDf.simi.max()):
        """Estimate pi0 based on all screened participants"""
        summD['pi0'] = ((allDf.simi==simi) & (allDf.COR==1)).sum()/(allDf.simi==simi).sum()
        
        ind = (allDf.simi==simi) & (allDf.follow==1) & (allDf.COR==1)
        df = allDf.loc[ind]
        
        summary, pvalue, test_results = logrank_test(df.dx2[df.treated==0],
                                                     df.dx2[df.treated==1],
                                                     df.disease2[df.treated==0],
                                                     df.disease2[df.treated==1],
                                                     suppress_print = True)
        summD['logrank_pvalue'].append(pvalue)

        te,ci,pvalue = estTEPH(df, treatment_col='treated', duration_col=duration_col, event_col=event_col)
        summD['PH_est'].append(te)
        summD['PH_lower'].append(ci[0])
        summD['PH_upper'].append(ci[1])
        summD['PH_pvalue'].append(pvalue)

        te,ci,pvalue = estTECI(df, treatment_col='treated', event_col=event_col)
        summD['CI_est'].append(te)
        summD['CI_lower'].append(ci[0])
        summD['CI_upper'].append(ci[1])
        summD['CI_pvalue'].append(pvalue)
        
        a = ((df.treated==1) & (df[event_col]==1)).sum()
        b = ((df.treated==1) & (df[event_col]==0)).sum()
        c = ((df.treated==0) & (df[event_col]==1)).sum()
        d = ((df.treated==0) & (df[event_col]==0)).sum()
        summD['COR+, Treated Rate'].append(a/(a+b))
        summD['COR+, Non-treated Rate'].append(c/(c+d))

        ind = (allDf.simi==simi) & (allDf.follow==1) & (allDf.COR==0)
        df = allDf.loc[ind]
        a = (df[event_col]==1).sum()
        b = (df[event_col]==0).sum()
        summD['COR- Rate'].append(a/(a+b))
    summDf = pd.DataFrame(summD)
    summDf['RR COR+/- SOC'] = summDf['COR+, Non-treated Rate']/summD['COR- Rate']
     
    summDf['theta'] = nan #TODO
    summDf['Rate diff'] = nan #TODO
    summDf['Rate ratio'] = nan #TODO
    return summDf



In [92]:

    
#%time allDf, resDf = runKSimulations(100)
#allDf.to_csv(DATA_PATH + 'TB_ST_COR/sim_100_Dec16.csv')
#allDf = pd.read_csv(DATA_PATH + 'TB_ST_COR/sim_100_Dec16.csv')
resDf = analyzeSimulations(allDf)

Figure 3

Accumulation of endpoints, over K simulations



In [86]:

    
subsets = {'COR+, Treated' : (allDf.COR==1) & (allDf.follow==1) & (allDf.treated==1),
           'COR+, Non-treated' : (allDf.COR==1) & (allDf.follow==1) & (allDf.treated==0),
           'COR-' : (allDf.COR==0) & (allDf.follow==1)}
colors = {'COR+, Treated' : 'tomato',
           'COR+, Non-treated' : 'slateblue',
           'COR-' : 'seagreen'}

kmf = KaplanMeierFitter()

figure(figsize=(17,8))
axi = 1
for label,ind in subsets.items():
    axh = subplot(1,3,axi)
    for simi in range(allDf.simi.max()):
        kmf.fit(durations = allDf.dx2[(allDf.simi==simi) & ind], event_observed = allDf.disease2[(allDf.simi==simi) & ind])
        kmf.plot(ax=axh, ci_show=False, show_censors=False,color = colors[label],alpha=0.3)
    xlabel('Years post-enrollment')
    if axi==1:
        ylabel('Probability of being TB disease free\nduring the follow-up period')
    title(label)
    ylim((0.9,1))
    xlim((0,2))
    axh.legend_.remove()
    axi += 1



In [96]:

    
figure(figsize=(17,5))
myboxplot(resDf['COR- Rate'],x=0)
myboxplot(resDf['COR+, Non-treated Rate'],x=1)
myboxplot(resDf['COR+, Treated Rate'],x=2)
xlim((-0.5,2.5))
xticks([0,1,2],['COR-','COR+, Non-treated','COR+, Treated'])
xlabel('TB disease rate\n(cumulative incidence, ignoring censoring)')

figure(figsize=(17,5))
subplot(1,3,1)
myboxplot((resDf['COR+, Treated Rate']-resDf['COR+, Non-treated Rate'])*params['pi0'],x=0)
xlabel('SnT - SOC\nRate Difference')
subplot(1,3,2)
myboxplot(resDf['RR COR+/- SOC'],x=0)
xticks()
xlabel('RRCOR+;COR-(t|SOC)')
subplot(1,3,3)
ratioNumer = resDf['RR COR+/- SOC']*resDf['COR+, Treated Rate']/resDf['COR+, Non-treated Rate'] + (1-params['pi0'])/params['pi0']
ratioDenom = resDf['RR COR+/- SOC'] + (1-params['pi0'])/params['pi0']
myboxplot(ratioNumer/ratioDenom,x=0)
xticks()
xlabel('SnT : SOC\nRate Ratio')









    Out[96]:





<matplotlib.text.Text at 0x19ab4668>



In [81]:

    
figure(figsize=(17,5))
for x,(col,label) in enumerate([('PH','Cox PH'),('CI','Cumulative incidence')]):
    subplot(1,2,x+1)
    plot([-1,7],[0,0],'--',color='gray',lw=2)
    myboxplot(resDf[col+'_lower'],x=0)
    myboxplot(resDf[col+'_est'],x=1)
    myboxplot(resDf[col+'_upper'],x=2)
    ylim((-0.6,1))
    xlim((-1,3))
    title(label)
    xticks([0,1,2],['Lower\n95% CI','Est.','Upper\n95% CI'])
    if x==0:
        ylabel('Treatment efficacy\nin COR+ group')



In [80]:

    
pFunc = lambda p: -10*log10(p)

figure(figsize=(12,8))
plot([-1,3],[pFunc(0.05)]*2,'--',color='gray',lw=2)
for x,col in enumerate(['logrank','PH','CI']):
    myboxplot(resDf[col+'_pvalue'].map(pFunc),x=x)
xlim((-1,3))
ylim((-5,60))
xticks([0,1,2],['Log-rank','Cox PH',"Fisher's"])
ylabel('P-value [-10 * log10(p)]')









    Out[80]:





<matplotlib.text.Text at 0x2a38b780>



In [87]:

    
"""Look at the distributions of dx"""
edges = linspace(0,2,50)
figure(figsize=(12,5))
for label,ind in subsets.items():
    lineHist(allDf.dx2.loc[ind & (allDf.disease2==1)],
             edges=edges,
             density=True,
             color=colors[label],
             label=label+' (N = %d)' % (ind & (allDf.disease2==1)).sum())
legend(loc=0)









    Out[87]:





<matplotlib.legend.Legend at 0x2ef3d1d0>

Var	Value	Description
N	10000	Total participants enrolled
lostRate	0.1	Rate of participants lost to followup (per year)
K	2	Randomization ratio of COR+ to treatment vs. placebo
pi0	0.2	Prevalence of COR+
Q	0.9	Prevalence of QFT+ (independent of COR for now...)
F	0.5	Fraction of COR- that are followed to the endpoint
tbRate	0.008	Rate of developing TB disease (TB per year in COR-)
RRcor	4	Max RR for COR+ at time of diagnostic
RRtmt	2	Reduction in RR for COR+ treated vs. untreated
discretize	False	Indicator of whether event times should be discretized by visits
visits	monthly	Vector of times in years of follow-up visits (for discretizing endpoints)
RRdecay	1	Decay time constant for RRcor