In [137]:
%pylab inline
from generate_corr import *
from corrplot import *
from lifelines.estimation import KaplanMeierFitter
from scipy import stats
from IPython.display import HTML
pylab.rc('font', family='sans-serif', size=15)
In [210]:
N = 1000
rho = 0.8
T = 1.0 #follow-up duration
raw = generateNormalCorr(N,2,array([[1., rho],[rho, 1.]]))
#raw = zeros((N,2))
"""Biomarker will be centered on 0.5"""
#raw[:,0] += 0.5
raw[:,0] = randn(N) + 0.5
"""Units of rate parameter will be disease endpoints per year"""
#raw[:,1] = 5 * (raw[:,1]+1) ** 2.0
#raw[:,1] = abs(0.02*(raw[:,0]+3.5)**2 + randn(N)/10)
logistic = lambda L,k,x0,x: L /(1+exp(-k*(x-x0)))
raw[:,1] = logistic(4,5,0.7,raw[:,0] + 0.1*randn(N))
#raw[:,1] = (15-exp(-raw[:,0] + 0.2*randn(N)+0.3))/5
df = pd.DataFrame(data=raw, columns=['bm','rate'])
"""RV for exponential PDF"""
expRV = lambda lam0,n: -log(rand(n)) / lam0
df['dx'] = expRV(raw[:,1],N)
"""Left-censor diagnose times at T"""
df['disease'] = (df.dx<T).astype(int)
df.loc[df.dx>T,'dx'] = T
HTML(df.describe().to_html())
Out[210]:
In [211]:
scatterfit(df.bm,df.rate,method='spearman')
In [212]:
tertiles = df.bm.quantile([1/3.,2/3.]).values
kmf = KaplanMeierFitter()
figure(figsize=(15,5))
axh = subplot(111)
kmf.fit(durations = df.dx, event_observed = df.disease)
kmf.plot_survival_function_(ax=axh,color = 'black')
ind = df.bm>tertiles[1]
kmf.fit(durations = df.dx.loc[ind], event_observed = df.disease.loc[ind])
kmf.plot_survival_function_(ax=axh,color = 'red')
ind = df.bm<tertiles[0]
kmf.fit(durations = df.dx.loc[ind], event_observed = df.disease.loc[ind])
kmf.plot_survival_function_(ax=axh,color = 'blue')
axh.legend_.remove()
ylabel('Fraction of disease free\nparticipants')
xlabel('Time since (years)')
ylim((0.0,1))
Out[212]:
In [213]:
"""Define functions for evaluating performance"""
def computeContig(df,bm_col,disease_col):
tab = zeros((2,2))
tab[0,0] = ((df[bm_col]==1) & (df[disease_col]==1)).sum()
tab[0,1] = ((df[bm_col]==1) & (df[disease_col]==0)).sum()
tab[1,0] = ((df[bm_col]==0) & (df[disease_col]==1)).sum()
tab[1,1] = ((df[bm_col]==0) & (df[disease_col]==0)).sum()
return tab
def RRFunc(tab):
a,b,c,d = tab.flatten()
return (a/(a+b)) / (c/(c+d))
def sensitivityFunc(tab):
a,b,c,d = tab.flatten()
return a/(a+c)
def specificityFunc(tab):
a,b,c,d = tab.flatten()
return d/(b+d)
def NNTFunc(tab):
a,b,c,d = tab.flatten()
return 1/((a/(a+b))-(c/(c+d)))
def NPVFunc(tab):
a,b,c,d = tab.flatten()
return d/(d+c)
def PPVFunc(tab):
a,b,c,d = tab.flatten()
return a/(a+b)
In [214]:
m = 20
thresholds = df.bm.quantile((arange(m-1)+1)/m)
perf = pd.DataFrame({k:zeros(m-1) for k in ['RR','Sensitivity','Specificity','NNT','PPV','NPV']})
for threshi,thresh in enumerate(thresholds):
df['bin'] = (df.bm>thresh).astype(int)
tab = computeContig(df,'bin','disease')
perf.loc[threshi,'RR'] = RRFunc(tab)
perf.loc[threshi,'Sensitivity'] = sensitivityFunc(tab)
perf.loc[threshi,'Specificity'] = specificityFunc(tab)
perf.loc[threshi,'NNT'] = NNTFunc(tab)
perf.loc[threshi,'PPV'] = PPVFunc(tab)
perf.loc[threshi,'NPV'] = NPVFunc(tab)
In [215]:
figure(figsize=(10,4))
plot(thresholds,perf.RR)
Out[215]:
In [206]:
figure(figsize=(15,4))
subplot(1,2,1)
plot(thresholds,perf.Sensitivity)
subplot(1,2,2)
plot(thresholds,perf.Specificity)
Out[206]:
In [207]:
figure(figsize=(6,6))
plot(1-perf.Specificity,perf.Sensitivity)
plot([0,1],[0,1],'--')
grid(which='both')
In [208]:
figure(figsize=(10,4))
plot(thresholds,perf.NNT)
Out[208]:
In [209]:
figure(figsize=(15,4))
subplot(1,2,1)
plot(thresholds,perf.PPV)
subplot(1,2,2)
plot(thresholds,perf.NPV)
Out[209]:
In [196]:
In [ ]: