In [137]:
%pylab inline
from generate_corr import *
from corrplot import *
from lifelines.estimation import KaplanMeierFitter
from scipy import stats
from IPython.display import HTML
pylab.rc('font', family='sans-serif', size=15)


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['power', 'f', 'linalg', 'random', 'logistic', 'fft', 'info']
`%matplotlib` prevents importing * from pylab and numpy

Generate fake data for analysis

  • Biomarker $BM$ is a continuous, normally distributed marker associated with a normally distributed disease rate $\lambda_0$
  • Association is a positive correlation $\rho$ (with a low threshold on $BM$?)
  • Readout of $BM$ is binary based on a threshold

In [210]:
N = 1000
rho = 0.8
T = 1.0 #follow-up duration

raw = generateNormalCorr(N,2,array([[1., rho],[rho, 1.]]))
#raw = zeros((N,2))

"""Biomarker will be centered on 0.5"""
#raw[:,0] += 0.5
raw[:,0] = randn(N) + 0.5

"""Units of rate parameter will be disease endpoints per year"""
#raw[:,1] = 5 * (raw[:,1]+1) ** 2.0
#raw[:,1] = abs(0.02*(raw[:,0]+3.5)**2 + randn(N)/10)
logistic = lambda L,k,x0,x: L /(1+exp(-k*(x-x0)))
raw[:,1] = logistic(4,5,0.7,raw[:,0] + 0.1*randn(N))
#raw[:,1] = (15-exp(-raw[:,0] + 0.2*randn(N)+0.3))/5

df = pd.DataFrame(data=raw, columns=['bm','rate'])

"""RV for exponential PDF"""
expRV = lambda lam0,n: -log(rand(n)) / lam0

df['dx'] = expRV(raw[:,1],N)

"""Left-censor diagnose times at T"""
df['disease'] = (df.dx<T).astype(int)
df.loc[df.dx>T,'dx'] = T

HTML(df.describe().to_html())


Out[210]:
bm rate dx disease
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.488160 1.701162 0.607287 0.546000
std 1.032245 1.669579 0.399821 0.498129
min -2.477480 0.000001 0.000179 0.000000
25% -0.265719 0.032175 0.198425 0.000000
50% 0.482435 1.014863 0.670743 1.000000
75% 1.222156 3.710118 1.000000 1.000000
max 3.738415 3.999999 1.000000 1.000000

In [211]:
scatterfit(df.bm,df.rate,method='spearman')



In [212]:
tertiles = df.bm.quantile([1/3.,2/3.]).values
kmf = KaplanMeierFitter()

figure(figsize=(15,5))
axh = subplot(111)
kmf.fit(durations = df.dx, event_observed = df.disease)
kmf.plot_survival_function_(ax=axh,color = 'black')

ind = df.bm>tertiles[1]
kmf.fit(durations = df.dx.loc[ind], event_observed = df.disease.loc[ind])
kmf.plot_survival_function_(ax=axh,color = 'red')

ind = df.bm<tertiles[0]
kmf.fit(durations = df.dx.loc[ind], event_observed = df.disease.loc[ind])
kmf.plot_survival_function_(ax=axh,color = 'blue')

axh.legend_.remove()
ylabel('Fraction of disease free\nparticipants')
xlabel('Time since (years)')
ylim((0.0,1))


Out[212]:
(0.0, 1)

Analyze performance of biomarker as a function of the threshold

  • Relative risk (RR) of disease for $BM_+$ vs. $BM_-$ as a function of threshold
  • Sensitivity and specificity
  • Number needed to treat to prevent one case (NNT)

In [213]:
"""Define functions for evaluating performance"""
def computeContig(df,bm_col,disease_col):
    tab = zeros((2,2))
    tab[0,0] = ((df[bm_col]==1) & (df[disease_col]==1)).sum()
    tab[0,1] = ((df[bm_col]==1) & (df[disease_col]==0)).sum()
    tab[1,0] = ((df[bm_col]==0) & (df[disease_col]==1)).sum()
    tab[1,1] = ((df[bm_col]==0) & (df[disease_col]==0)).sum()
    return tab

def RRFunc(tab):
    a,b,c,d = tab.flatten()
    return (a/(a+b)) / (c/(c+d))
def sensitivityFunc(tab):
    a,b,c,d = tab.flatten()
    return a/(a+c)
def specificityFunc(tab):
    a,b,c,d = tab.flatten()
    return d/(b+d)
def NNTFunc(tab):
    a,b,c,d = tab.flatten()
    return 1/((a/(a+b))-(c/(c+d)))
def NPVFunc(tab):
    a,b,c,d = tab.flatten()
    return d/(d+c)
def PPVFunc(tab):
    a,b,c,d = tab.flatten()
    return a/(a+b)

In [214]:
m = 20
thresholds = df.bm.quantile((arange(m-1)+1)/m)
perf = pd.DataFrame({k:zeros(m-1) for k in ['RR','Sensitivity','Specificity','NNT','PPV','NPV']})
for threshi,thresh in enumerate(thresholds):
    df['bin'] = (df.bm>thresh).astype(int)
    tab = computeContig(df,'bin','disease')
    perf.loc[threshi,'RR'] = RRFunc(tab)
    perf.loc[threshi,'Sensitivity'] = sensitivityFunc(tab)
    perf.loc[threshi,'Specificity'] = specificityFunc(tab)
    perf.loc[threshi,'NNT'] = NNTFunc(tab)
    perf.loc[threshi,'PPV'] = PPVFunc(tab)
    perf.loc[threshi,'NPV'] = NPVFunc(tab)

In [215]:
figure(figsize=(10,4))
plot(thresholds,perf.RR)


Out[215]:
[<matplotlib.lines.Line2D at 0x21f04898>]

In [206]:
figure(figsize=(15,4))
subplot(1,2,1)
plot(thresholds,perf.Sensitivity)
subplot(1,2,2)
plot(thresholds,perf.Specificity)


Out[206]:
[<matplotlib.lines.Line2D at 0x240882e8>]

In [207]:
figure(figsize=(6,6))
plot(1-perf.Specificity,perf.Sensitivity)
plot([0,1],[0,1],'--')
grid(which='both')



In [208]:
figure(figsize=(10,4))
plot(thresholds,perf.NNT)


Out[208]:
[<matplotlib.lines.Line2D at 0x2468d550>]

In [209]:
figure(figsize=(15,4))
subplot(1,2,1)
plot(thresholds,perf.PPV)
subplot(1,2,2)
plot(thresholds,perf.NPV)


Out[209]:
[<matplotlib.lines.Line2D at 0x23ae22b0>]

In [196]:


In [ ]: