In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
font = {'family' : 'DejaVu Sans',
'size' : 20}
matplotlib.rc('font', **font)
In [85]:
# Load the two result tables; a '-' entry in either CSV is read as missing (NaN).
df1 = pd.read_csv('results_hospmort.csv', na_values=['-'])
df2 = pd.read_csv('results_othermort.csv', na_values=['-'])
# Stack both tables into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it each file contributes its own labels starting at 0,
# and the resulting duplicate row labels can break label-based alignment
# or assignment in later cells.
df = pd.concat([df1, df2], ignore_index=True)
In [6]:
# Histogram of the reproduced / original sample size ratio.
xi = np.arange(0.5, 2.6, 0.1)
hist_kwargs = dict(bins=xi, align='left')

# studies with a fixed sample size are left out of the main histogram
studies_ignore = ['caballero2015dynamically']
idxIgnore = np.in1d(df['Cohort'], studies_ignore)

# studies drawn a second time as the PhysioNet 2012 overlay
studies_pn2012 = [
    'che2016recurrent',
    'ding2016mortality',
    'johnson2012patient',
    'johnson2014data',
]
idxShow = np.in1d(df['Cohort'], studies_pn2012)

kept = df.loc[~idxIgnore]
shown = df.loc[idxShow]

fig, ax = plt.subplots(figsize=[10, 6])
ax.hist(kept['N_Repro'] / kept['N_Study'], label='All studies', **hist_kwargs)
ax.hist(shown['N_Repro'] / shown['N_Study'], label='PhysioNet 2012', **hist_kwargs)
ax.set_ylabel('Number of studies')
ax.set_xlabel('Fold increase in sample size')
ax.legend(loc='upper right')
plt.show()
In [7]:
# Scatter of reproduced vs. original sample size. Studies with a fixed
# sample size (the PhysioNet 2012 ones included) are drawn as a separate series.
studies_ignore = [
    'caballero2015dynamically',
    'che2016recurrent', 'ding2016mortality',
    'johnson2012patient', 'johnson2014data',
]
idxIgnore = np.in1d(df['Cohort'], studies_ignore)
# not read by this plot; kept so the cell leaves the same variables defined
xi = np.arange(0.5, 2.6, 0.1)

free = df.loc[~idxIgnore]
fixed = df.loc[idxIgnore]
point_kwargs = dict(markersize=12, linestyle='none')

fig, ax = plt.subplots(figsize=[12, 8])
ax.plot(free['N_Repro'], free['N_Study'],
        marker='o', label='All studies', **point_kwargs)
ax.plot(fixed['N_Repro'], fixed['N_Study'],
        marker='s', label='Fixed sample size', **point_kwargs)
# dashed identity line: points on it have equal sample sizes
ax.plot([0, 45000], [0, 45000], 'k--', linewidth=2)
ax.set_xlabel('Reproduced sample size')
ax.set_ylabel('Original sample size')
ax.legend(loc='lower right')
plt.show()
In [112]:
# Bland-Altman style plot of sample size: mean of the reproduced and
# original sizes on x, reproduction minus original on y.
# Studies with a fixed sample size (PhysioNet 2012 ones included) are
# drawn as their own series and excluded from the bias line and summary.
studies_ignore = ['caballero2015dynamically',
                  'che2016recurrent', 'ding2016mortality',
                  'johnson2012patient', 'johnson2014data']
idxIgnore = np.in1d(df['Cohort'], studies_ignore)

n_mean = (df['N_Repro'] + df['N_Study']) / 2.0
n_delta = df['N_Repro'] - df['N_Study']

plt.figure(figsize=[12, 8])
plt.plot(n_mean.loc[~idxIgnore], n_delta.loc[~idxIgnore],
         marker='o', markersize=12, linestyle='none',
         label='All studies')
# overlay the fixed sample size studies
plt.plot(n_mean.loc[idxIgnore], n_delta.loc[idxIgnore],
         marker='s', markersize=12, linestyle='none',
         label='Fixed sample size')
# plot average bias (mean difference over the non-fixed studies)
plt.plot([0, 45000],
         np.ones(2) * np.mean(n_delta.loc[~idxIgnore]),
         'k--', linewidth=2)
# The title has no placeholder, so the former .format(...) call was a no-op
# and has been dropped; the rendered text is unchanged.
plt.title('All experiments - sample size difference')
plt.xlabel('Average sample size')
plt.ylabel('Reproduction - original')
plt.legend(loc='upper right')
plt.show()

print('Isolating to studies with non-fixed sample size...')
# A study missing either sample size yields NaN here. Drop those rows so
# they are not counted in the denominator as "difference <= 1000".
N = n_delta.loc[~idxIgnore].abs().dropna()
n_large = int((N > 1000).sum())
pct_large = 100.0 * n_large / N.shape[0] if N.shape[0] else float('nan')
print('{}/{} ({:2.2f}%) had abs(repro-study) > 1000.'.format(
    n_large, N.shape[0], pct_large
))
In [60]:
# Preview the first rows of the combined results table.
df.head()
Out[60]:
In [62]:
import tableone
# Build a TableOne summary of the results; these columns are passed as continuous.
continuous_cols = ['N_Study', 'N_Repro', 'Y_Study', 'Y_Repro',
                   'Study', 'GB', 'LR']
t1 = tableone.TableOne(df, continuous=continuous_cols)
In [110]:
# IPython help lookup for plt.boxplot (shows its docstring).
# NOTE(review): looks like a leftover interactive query - consider removing before sharing.
plt.boxplot?
In [113]:
# Box plots of the original sample size, the reproduced sample size and
# their difference, using one row per distinct (cohort, sizes) combination.
df['N_Diff'] = df['N_Repro'] - df['N_Study']
cols = ['N_Study', 'N_Repro', 'N_Diff']
cols_dict = {'N_Study': 'Original', 'N_Repro': 'Reproduction', 'N_Diff': 'Difference'}

studies_ignore = [
    'caballero2015dynamically',
    'che2016recurrent', 'ding2016mortality',
    'johnson2012patient', 'johnson2014data',
]
# Drop the fixed-size studies and every row with mimic == 3; per the original
# note this keeps only MIMIC-II studies so the sample sizes are comparable.
idxIgnore = np.in1d(df['Cohort'], studies_ignore) | (df['mimic'] == 3)
df_uniq = df.loc[~idxIgnore, ['Cohort'] + cols].drop_duplicates()

fig, ax = plt.subplots(figsize=[12, 8])
# each *props argument gets its own dict so that none of them is shared
ax.boxplot(
    df_uniq[cols].values,
    labels=[cols_dict[name] for name in cols],
    notch=False,
    sym='o',
    boxprops=dict(linewidth=2),
    whiskerprops=dict(linewidth=2),
    flierprops=dict(linewidth=2),
    capprops=dict(linewidth=2),
)
ax.set_ylabel('Sample size')
ax.set_title('{} experiments on MIMIC-II'.format(df_uniq.shape[0]))
plt.show()
In [105]:
# Display the de-duplicated cohort / sample-size table used for the box plot.
df_uniq
Out[105]:
In [9]:
# Scatter of reproduced vs. original outcome rate (Y_Repro vs. Y_Study).
# Studies with a fixed sample size (PhysioNet 2012 ones included) are
# drawn as a separate series.
studies_ignore = ['caballero2015dynamically',
                  'che2016recurrent', 'ding2016mortality',
                  'johnson2012patient', 'johnson2014data']
idxIgnore = np.in1d(df['Cohort'], studies_ignore)

plt.figure(figsize=[12, 8])
plt.plot(df.loc[~idxIgnore, 'Y_Repro'], df.loc[~idxIgnore, 'Y_Study'],
         marker='o', markersize=12, linestyle='none',
         label='All studies')
# overlay the fixed sample size datasets
plt.plot(df.loc[idxIgnore, 'Y_Repro'], df.loc[idxIgnore, 'Y_Study'],
         marker='s', markersize=12, linestyle='none',
         label='Fixed sample size')
# dashed identity line: points on it have equal rates
plt.plot([0, 50], [0, 50], 'k--', linewidth=2)
# The axes show the Y_* columns, which the Bland-Altman cell for the same
# columns labels as mortality rate. The previous "sample size" labels were
# copied over from the N_* scatter plot.
plt.xlabel('Reproduced mortality rate')
plt.ylabel('Original mortality rate')
plt.legend(loc='lower right')
plt.show()
In [16]:
# Bland-Altman style plot for the Y_* columns: mean of reproduced and
# original value on x, reproduction minus original on y.
# Fixed sample size studies (PhysioNet 2012 ones included) form their own series.
studies_ignore = [
    'caballero2015dynamically',
    'che2016recurrent', 'ding2016mortality',
    'johnson2012patient', 'johnson2014data',
]
idxIgnore = np.in1d(df['Cohort'], studies_ignore)
# not read by this plot; kept so the cell leaves the same variables defined
xi = np.arange(0.5, 2.6, 0.1)

free = df.loc[~idxIgnore]
fixed = df.loc[idxIgnore]

fig, ax = plt.subplots(figsize=[12, 8])
series = [(free, 'o', 'All studies'),
          (fixed, 's', 'Fixed sample size')]
for subset, marker, label in series:
    ax.plot((subset['Y_Repro'] + subset['Y_Study']) / 2.0,
            subset['Y_Repro'] - subset['Y_Study'],
            marker=marker, markersize=12, linestyle='none',
            label=label)

# horizontal dashed line at the mean difference of the non-fixed studies
bias = np.mean(free['Y_Repro'] - free['Y_Study'])
ax.plot([0, 50], np.ones(2) * bias, 'k--', linewidth=2)

ax.set_xlabel('Average mortality rate')
ax.set_ylabel('Reproduction - original')
ax.set_title('% Mortality comparison')
ax.legend(loc='upper right')
plt.show()
In [118]:
# Bland-Altman style plot comparing the LR column (reproduction) against
# the Study column (original): mean of the two on x, LR minus Study on y.
# Fixed sample size studies (PhysioNet 2012 ones included) form their own series.
studies_ignore = ['caballero2015dynamically',
                  'che2016recurrent', 'ding2016mortality',
                  'johnson2012patient', 'johnson2014data']
idxIgnore = np.in1d(df['Cohort'], studies_ignore)

lr_mean = (df['LR'] + df['Study']) / 2.0
lr_delta = df['LR'] - df['Study']

plt.figure(figsize=[12, 8])
plt.plot(lr_mean.loc[~idxIgnore], lr_delta.loc[~idxIgnore],
         marker='o', markersize=12, linestyle='none',
         label='All studies')
# overlay the fixed sample size studies
plt.plot(lr_mean.loc[idxIgnore], lr_delta.loc[idxIgnore],
         marker='s', markersize=12, linestyle='none',
         label='Fixed sample size')
# plot average bias (mean difference over the non-fixed studies)
plt.plot([0.7, 1],
         np.ones(2) * np.mean(lr_delta.loc[~idxIgnore]),
         'k--', linewidth=2)
plt.title('Logistic Regression AUROC comparison')
plt.xlabel('Average AUROC')
plt.ylabel('Reproduction - original')
# legend left disabled, as in the original cell
#plt.legend(loc='upper right')
plt.xlim([0.7, 1.0])
plt.show()

# A comparison involving NaN evaluates to False, so rows missing either
# value used to be counted in the denominator as "repro not greater".
# Restrict both numerator and denominator to rows where both values exist.
has_both = (df['LR'].notnull() & df['Study'].notnull()).values
lr_higher = (df['LR'] > df['Study']).values

print('{}/{} ({:2.2f}%) had repro > study.'.format(
    np.sum(lr_higher[has_both]), np.sum(has_both),
    100.0 * np.mean(lr_higher[has_both])
))
print('If isolating to studies with non-fixed sample size...')
keep = has_both & ~idxIgnore
print('{}/{} ({:2.2f}%) had repro > study.'.format(
    np.sum(lr_higher[keep]), np.sum(keep),
    100.0 * np.mean(lr_higher[keep])
))
In [20]:
# Bland-Altman style plot comparing the GB column (reproduction) against
# the Study column (original): mean of the two on x, GB minus Study on y.
# Fixed sample size studies (PhysioNet 2012 ones included) form their own series.
studies_ignore = ['caballero2015dynamically',
                  'che2016recurrent', 'ding2016mortality',
                  'johnson2012patient', 'johnson2014data']
idxIgnore = np.in1d(df['Cohort'], studies_ignore)

gb_mean = (df['GB'] + df['Study']) / 2.0
gb_delta = df['GB'] - df['Study']

plt.figure(figsize=[12, 8])
plt.plot(gb_mean.loc[~idxIgnore], gb_delta.loc[~idxIgnore],
         marker='o', markersize=12, linestyle='none',
         label='All studies')
# overlay the fixed sample size studies
plt.plot(gb_mean.loc[idxIgnore], gb_delta.loc[idxIgnore],
         marker='s', markersize=12, linestyle='none',
         label='Fixed sample size')
# plot average bias (mean difference over the non-fixed studies)
plt.plot([0.75, 1],
         np.ones(2) * np.mean(gb_delta.loc[~idxIgnore]),
         'k--', linewidth=2)
plt.xlabel('Average AUROC')
plt.ylabel('Reproduction - original AUROC')
plt.legend(loc='upper right')
plt.show()

# A comparison involving NaN evaluates to False, so rows missing either
# value used to be counted in the denominator as "repro not greater".
# Restrict both numerator and denominator to rows where both values exist.
has_both = (df['GB'].notnull() & df['Study'].notnull()).values
gb_higher = (df['GB'] > df['Study']).values

print('{}/{} ({:2.2f}%) had repro > study.'.format(
    np.sum(gb_higher[has_both]), np.sum(has_both),
    100.0 * np.mean(gb_higher[has_both])
))
print('If isolating to studies with non-fixed sample size...')
keep = has_both & ~idxIgnore
print('{}/{} ({:2.2f}%) had repro > study.'.format(
    np.sum(gb_higher[keep]), np.sum(keep),
    100.0 * np.mean(gb_higher[keep])
))
In [39]:
from sklearn import linear_model
from sklearn import metrics
Out[39]:
In [116]:
# Relate the change in sample size to the `pages` column: for each
# transform of the sample size, draw a scatter plot against page count and
# overlay an ordinary least squares fit, reporting R^2 in the title.
# Studies with a fixed sample size are ignored.
studies_ignore = ['caballero2015dynamically',
                  'che2016recurrent', 'ding2016mortality',
                  'johnson2012patient', 'johnson2014data']
idxIgnore = np.in1d(df['Cohort'], studies_ignore)

n_repro = df.loc[~idxIgnore, 'N_Repro']
n_study = df.loc[~idxIgnore, 'N_Study']

# repeat for different transforms of sample size
yi = [n_repro / n_study,
      n_repro - n_study,
      np.log(n_repro / n_study)]
# One y-axis label per transform, in the same order as yi. Previously every
# figure was labelled 'Sample size fold increase', which is only correct
# for the first transform.
ylabels = ['Sample size fold increase',
           'Sample size difference (reproduction - original)',
           'Log sample size fold increase']

# sklearn expects a 2-D feature matrix, hence the reshape to one column
X = df.loc[~idxIgnore, 'pages'].values.reshape(-1, 1)
# x positions at which the fitted line is drawn
xi = np.asarray([0, 35]).reshape(-1, 1)

for y, ylabel in zip(yi, ylabels):
    # scatter plot
    plt.figure(figsize=[12, 8])
    plt.plot(X, y,
             marker='o', markersize=12, linestyle='none',
             label='All studies')

    # plot line of best fit
    mdl = linear_model.LinearRegression()
    mdl.fit(X, y.values)
    plt.plot(xi, mdl.predict(xi), 'k--', linewidth=4)

    plt.title('Sample size vs page count - R$^2$={:0.2f}'.format(
        metrics.r2_score(y, mdl.predict(X))))
    plt.xlabel('Page count')
    plt.ylabel(ylabel)
    plt.show()
In [ ]:
In [15]:
# Count the rows with a non-null Cohort value within each `split` group.
df.groupby(u'split')['Cohort'].count()
Out[15]: