In [137]:

    
%pylab inline
from generate_corr import *
from corrplot import *
from lifelines.estimation import KaplanMeierFitter
from scipy import stats
from IPython.display import HTML
pylab.rc('font', family='sans-serif', size=15)









    



Populating the interactive namespace from numpy and matplotlib






    



WARNING: pylab import has clobbered these variables: ['power', 'f', 'linalg', 'random', 'logistic', 'fft', 'info']
`%matplotlib` prevents importing * from pylab and numpy

Generate fake data for analysis

Biomarker $BM$ is a continuous, normally distributed marker associated with a normally distributed disease rate $\lambda_0$
Association is a positive correlation $\rho$ (with a low threshold on $BM$?)
Readout of $BM$ is binary based on a threshold



In [210]:

    
N = 1000
rho = 0.8
T = 1.0 #follow-up duration

raw = generateNormalCorr(N,2,array([[1., rho],[rho, 1.]]))
#raw = zeros((N,2))

"""Biomarker will be centered on 0.5"""
#raw[:,0] += 0.5
raw[:,0] = randn(N) + 0.5

"""Units of rate parameter will be disease endpoints per year"""
#raw[:,1] = 5 * (raw[:,1]+1) ** 2.0
#raw[:,1] = abs(0.02*(raw[:,0]+3.5)**2 + randn(N)/10)
logistic = lambda L,k,x0,x: L /(1+exp(-k*(x-x0)))
raw[:,1] = logistic(4,5,0.7,raw[:,0] + 0.1*randn(N))
#raw[:,1] = (15-exp(-raw[:,0] + 0.2*randn(N)+0.3))/5

df = pd.DataFrame(data=raw, columns=['bm','rate'])

"""RV for exponential PDF"""
expRV = lambda lam0,n: -log(rand(n)) / lam0

df['dx'] = expRV(raw[:,1],N)

"""Left-censor diagnose times at T"""
df['disease'] = (df.dx<T).astype(int)
df.loc[df.dx>T,'dx'] = T

HTML(df.describe().to_html())









    Out[210]:





  
    
      
      bm
      rate
      dx
      disease
    
  
  
    
      count
       1000.000000
       1000.000000
       1000.000000
       1000.000000
    
    
      mean
          0.488160
          1.701162
          0.607287
          0.546000
    
    
      std
          1.032245
          1.669579
          0.399821
          0.498129
    
    
      min
         -2.477480
          0.000001
          0.000179
          0.000000
    
    
      25%
         -0.265719
          0.032175
          0.198425
          0.000000
    
    
      50%
          0.482435
          1.014863
          0.670743
          1.000000
    
    
      75%
          1.222156
          3.710118
          1.000000
          1.000000
    
    
      max
          3.738415
          3.999999
          1.000000
          1.000000



In [211]:

    
scatterfit(df.bm,df.rate,method='spearman')



In [212]:

    
tertiles = df.bm.quantile([1/3.,2/3.]).values
kmf = KaplanMeierFitter()

figure(figsize=(15,5))
axh = subplot(111)
kmf.fit(durations = df.dx, event_observed = df.disease)
kmf.plot_survival_function_(ax=axh,color = 'black')

ind = df.bm>tertiles[1]
kmf.fit(durations = df.dx.loc[ind], event_observed = df.disease.loc[ind])
kmf.plot_survival_function_(ax=axh,color = 'red')

ind = df.bm<tertiles[0]
kmf.fit(durations = df.dx.loc[ind], event_observed = df.disease.loc[ind])
kmf.plot_survival_function_(ax=axh,color = 'blue')

axh.legend_.remove()
ylabel('Fraction of disease free\nparticipants')
xlabel('Time since (years)')
ylim((0.0,1))









    Out[212]:





(0.0, 1)

Analyze performance of biomarker as a function of the threshold

Relative risk (RR) of disease for $BM_+$ vs. $BM_-$ as a function of threshold
Sensitivity and specificity
Number needed to treat to prevent one case (NNT)



In [213]:

    
"""Define functions for evaluating performance"""
def computeContig(df,bm_col,disease_col):
    tab = zeros((2,2))
    tab[0,0] = ((df[bm_col]==1) & (df[disease_col]==1)).sum()
    tab[0,1] = ((df[bm_col]==1) & (df[disease_col]==0)).sum()
    tab[1,0] = ((df[bm_col]==0) & (df[disease_col]==1)).sum()
    tab[1,1] = ((df[bm_col]==0) & (df[disease_col]==0)).sum()
    return tab

def RRFunc(tab):
    a,b,c,d = tab.flatten()
    return (a/(a+b)) / (c/(c+d))
def sensitivityFunc(tab):
    a,b,c,d = tab.flatten()
    return a/(a+c)
def specificityFunc(tab):
    a,b,c,d = tab.flatten()
    return d/(b+d)
def NNTFunc(tab):
    a,b,c,d = tab.flatten()
    return 1/((a/(a+b))-(c/(c+d)))
def NPVFunc(tab):
    a,b,c,d = tab.flatten()
    return d/(d+c)
def PPVFunc(tab):
    a,b,c,d = tab.flatten()
    return a/(a+b)



In [214]:

    
m = 20
thresholds = df.bm.quantile((arange(m-1)+1)/m)
perf = pd.DataFrame({k:zeros(m-1) for k in ['RR','Sensitivity','Specificity','NNT','PPV','NPV']})
for threshi,thresh in enumerate(thresholds):
    df['bin'] = (df.bm>thresh).astype(int)
    tab = computeContig(df,'bin','disease')
    perf.loc[threshi,'RR'] = RRFunc(tab)
    perf.loc[threshi,'Sensitivity'] = sensitivityFunc(tab)
    perf.loc[threshi,'Specificity'] = specificityFunc(tab)
    perf.loc[threshi,'NNT'] = NNTFunc(tab)
    perf.loc[threshi,'PPV'] = PPVFunc(tab)
    perf.loc[threshi,'NPV'] = NPVFunc(tab)



In [215]:

    
figure(figsize=(10,4))
plot(thresholds,perf.RR)









    Out[215]:





[<matplotlib.lines.Line2D at 0x21f04898>]



In [206]:

    
figure(figsize=(15,4))
subplot(1,2,1)
plot(thresholds,perf.Sensitivity)
subplot(1,2,2)
plot(thresholds,perf.Specificity)









    Out[206]:





[<matplotlib.lines.Line2D at 0x240882e8>]



In [207]:

    
figure(figsize=(6,6))
plot(1-perf.Specificity,perf.Sensitivity)
plot([0,1],[0,1],'--')
grid(which='both')



In [208]:

    
figure(figsize=(10,4))
plot(thresholds,perf.NNT)









    Out[208]:





[<matplotlib.lines.Line2D at 0x2468d550>]



In [209]:

    
figure(figsize=(15,4))
subplot(1,2,1)
plot(thresholds,perf.PPV)
subplot(1,2,2)
plot(thresholds,perf.NPV)









    Out[209]:





[<matplotlib.lines.Line2D at 0x23ae22b0>]



In [196]:



In [ ]:

	bm	rate	dx	disease
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.488160	1.701162	0.607287	0.546000
std	1.032245	1.669579	0.399821	0.498129
min	-2.477480	0.000001	0.000179	0.000000
25%	-0.265719	0.032175	0.198425	0.000000
50%	0.482435	1.014863	0.670743	1.000000
75%	1.222156	3.710118	1.000000	1.000000
max	3.738415	3.999999	1.000000	1.000000