In [72]:
%matplotlib inline
%run __init__.py
import pickle
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import utils
from utils import loading, scoring
from gerkin import dream,fit1,fit2,params
In [73]:
descriptors = loading.get_descriptors(format=True)
letters = utils.letters
all_CIDs = sorted(loading.get_CIDs(['training','leaderboard']))
all_CID_dilutions = sorted(loading.get_CID_dilutions(['training','leaderboard']))
norep_CID_dilutions = sorted(loading.get_CID_dilutions(['training-norep','leaderboard']))
rep_CIDs = sorted(loading.get_CIDs(['replicated']))
rep_CID_dilutions = sorted(loading.get_CID_dilutions(['replicated']))
features = loading.get_molecular_data(['dragon','morgan'],all_CIDs)
In [74]:
X,_,_,_,_,_ = dream.make_X(features,all_CID_dilutions)
X_train = X.loc[norep_CID_dilutions]
X_test_other = dream.filter_X_dilutions(X.loc[rep_CID_dilutions],target_dilution='high')
X_test_int = dream.filter_X_dilutions(X.loc[rep_CID_dilutions],target_dilution=-3)
In [75]:
# Same as above, but preparing data for the linear model
X_lin = dream.quad_prep(features,all_CID_dilutions)
X_train_lin = X_lin.loc[norep_CID_dilutions]
X_test_lin_other = dream.quad_prep(features,rep_CID_dilutions,dilution='high')
X_test_lin_int = dream.quad_prep(features,rep_CID_dilutions,dilution=-3)
In [122]:
Y_train = loading.load_perceptual_data(['training-norep','leaderboard'])
Y_train_imp = dream.impute(Y_train['Subject'],'median').unstack('Descriptor')
Y_test = loading.load_perceptual_data(['replicated'])
Y_test_other = dream.filter_Y_dilutions(Y_test,'low')
Y_test_int = dream.filter_Y_dilutions(Y_test,-3)
Y_test_gold = dream.filter_Y_dilutions(Y_test,'gold',keep_replicates=True)
In [77]:
# Load optimal parameters (from cross-validation) for random forest model
trans_params = params.get_trans_params(Y_train, descriptors, plot=False)
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
In [78]:
# Create a mask to indicate which molecules have intensity values at 1/1000 dilution
intensity_mask = [x[0] for x in loading.get_CID_dilutions('replicated') if x[1]==-3]
In [79]:
use_saved_rf_model = True
if use_saved_rf_model:
    with open('../../data/rfcs_norep_1.pickle','rb') as f:
        rfcs_norep_1 = pickle.load(f)
else:
    n_estimators = 25
    rfcs_norep_1,score_1,rs_1 = fit1.rfc_final(X_train,Y_train_imp,
                                               max_features,min_samples_leaf,max_depth,
                                               use_et,regularize,
                                               n_estimators=n_estimators)
    with open('../../data/rfcs_norep_1.pickle','wb') as f:
        pickle.dump(rfcs_norep_1,f)
In [80]:
# Use the model to create the prediction arrays
Y_pred_dec = loading.make_prediction_files(rfcs_norep_1,X_test_int,X_test_other,
                                           'replicated',1,intensity_mask=intensity_mask,
                                           Y_test=None,write=False,regularize=regularize)
In [81]:
# Print a summary of the prediction quality
predicted = Y_pred_dec.to_frame()
observed = dream.filter_Y_dilutions(Y_test,'gold')['Subject']
scoring.score_summary(predicted,observed,mask=True)
Out[81]:
In [82]:
# Load or compute the feature ranks for the training set
use_saved_linear_feature_ranks = True
if use_saved_linear_feature_ranks:
    lin_ranked = np.load('../../data/lin_ranked_test_retest.npy')
else:
    # -1 is to leave out the leak feature
    lin_ranked = fit2.compute_linear_feature_ranks(X_train_lin,Y_train,n_resampling=10)
    np.save('../../data/lin_ranked_test_retest',lin_ranked)
In [83]:
# Run the linear model (fast)
Y_pred_lin = fit1.compute_linear_predictions(X_train_lin,X_test_lin_int,X_test_lin_other,
                                             Y_train_imp,lin_ranked,alpha=10.0,max_features=1000)
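The internals of fit1.compute_linear_predictions are not shown in this notebook; judging from the signature, it fits a regularized linear model per descriptor using only the top-ranked features. A minimal, hedged sketch of that pattern on toy data (Ridge, ridge_topk_predict, and all toy names are illustrative assumptions, not the project's actual implementation):
In [ ]:
import numpy as np
from sklearn.linear_model import Ridge

def ridge_topk_predict(X_tr, y_tr, X_te, ranked, alpha=10.0, max_features=1000):
    # Hypothetical helper: keep only the top-ranked feature columns, then ridge-fit.
    cols = ranked[:max_features]
    model = Ridge(alpha=alpha).fit(X_tr[:, cols], y_tr)
    return model.predict(X_te[:, cols])

# Toy usage: feature 0 carries the signal and is ranked first.
rng = np.random.RandomState(3)
X_tr, X_te = rng.randn(100, 50), rng.randn(10, 50)
y_tr = X_tr[:, 0] + 0.1*rng.randn(100)
ranked = np.arange(50)
print(ridge_topk_predict(X_tr, y_tr, X_te, ranked, max_features=5))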
In [84]:
# Use an average of the two models as the prediction
y_pred = Y_pred_dec.copy()
for subject in range(1,50):
    y_pred[subject] = (Y_pred_dec[subject]+Y_pred_lin[subject])/2
In [170]:
# Compute model/test correlations and test/retest correlations
# Subchallenge 1: model/test and test/retest correlations, averaged across subjects.
sc1_cv = pd.Series(index=descriptors) # Will hold model vs. test correlations
trt1_cv = pd.Series(index=descriptors) # Will hold test vs. retest correlations
for d,descriptor in enumerate(descriptors): # Iterate over each descriptor.
    rs_sc1 = pd.Series(index=range(1,50)) # Model vs. test correlation for each of the 49 subjects.
    rs_trt = pd.Series(index=range(1,50)) # Test vs. retest correlation for each of the 49 subjects.
    for subject in range(1,50): # Iterate over subjects.
        o = Y_test_gold['Subject'][subject].unstack('CID').loc[(descriptor,0)] # Test values.
        r = Y_test_gold['Subject'][subject].unstack('CID').loc[(descriptor,1)] # Retest values.
        p = y_pred[subject][descriptor] # Predicted values for the same molecules.
        # To avoid biasing towards test or retest, concatenate the test and retest values
        # for this subject/descriptor into one length-40 vector, and correlate it with the
        # corresponding length-40 vector that just has the 20 predicted values listed twice.
        rs_sc1[subject] = pd.concat((o,r)).corr(pd.concat((p,p)))
        # Now compute the correlation between test and retest.
        rs_trt[subject] = o.corr(r)
    # Average these within-subject correlations across subjects.
    sc1_cv[descriptor] = rs_sc1.mean()
    trt1_cv[descriptor] = rs_trt.mean()
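The concatenation trick above can be sanity-checked in isolation: correlating the predictions against the stacked (test, retest) vector weights both presentations equally, rather than privileging either one. A minimal toy demonstration (obs_test, obs_retest, and pred are illustrative names, not pipeline variables):
In [ ]:
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
pred = pd.Series(rng.randn(20))          # 20 "predicted" values
obs_test = pred + 0.5*rng.randn(20)      # noisy "test" observations
obs_retest = pred + 0.5*rng.randn(20)    # independent "retest" observations

# Length-40 correlation with the predictions listed twice:
r_both = pd.concat((obs_test,obs_retest)).corr(pd.concat((pred,pred)))
r_test = obs_test.corr(pred)
r_retest = obs_retest.corr(pred)
print(r_both, r_test, r_retest)  # r_both pools both presentations into one estimate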
In [202]:
# Same as above, but computing jackknife (leave-one-molecule-out) samples to get error bars.
sc1_cv_jn = pd.DataFrame(index=rep_CIDs,columns=descriptors) # Will hold model vs. test correlations
trt1_cv_jn = pd.DataFrame(index=rep_CIDs,columns=descriptors) # Will hold test vs. retest correlations
for d,descriptor in enumerate(descriptors): # Iterate over each descriptor.
    CIDs = list(set(Y_test_gold.loc[descriptor].index.get_level_values('CID')))
    for i,CID in enumerate(CIDs): # Which CID to hold out for jackknife estimation.
        rs_sc1 = pd.Series(index=range(1,50)) # Model vs. test correlation for each of the 49 subjects.
        rs_trt = pd.Series(index=range(1,50)) # Test vs. retest correlation for each of the 49 subjects.
        for subject in range(1,50): # Iterate over subjects.
            o = Y_test_gold['Subject'][subject].unstack('CID').loc[(descriptor,0)].drop(CID)
            r = Y_test_gold['Subject'][subject].unstack('CID').loc[(descriptor,1)].drop(CID)
            p = y_pred[subject][descriptor].drop(CID) # Predicted values for the same molecules.
            rs_sc1[subject] = pd.concat((o,r)).corr(pd.concat((p,p)))
            rs_trt[subject] = o.corr(r)
        sc1_cv_jn.loc[CID,descriptor] = rs_sc1.mean()
        trt1_cv_jn.loc[CID,descriptor] = rs_trt.mean()
In [204]:
# Sanity check: means of the jackknife samples should be approximately equal to the directly computed means
assert np.allclose(sc1_cv,sc1_cv_jn.mean(axis=0),atol=0.05)
assert np.allclose(trt1_cv,trt1_cv_jn.mean(axis=0),atol=0.05)
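For reference: the error bars plotted below use the raw standard deviation of the leave-one-out samples (.std(axis=0)); the textbook jackknife standard error instead rescales that spread, since each leave-one-out statistic is computed from nearly the full dataset. A minimal sketch of the relationship on toy data (all names are illustrative):
In [ ]:
import numpy as np

rng = np.random.RandomState(1)
x = rng.randn(20)     # toy sample of n=20 values
n = len(x)
theta = np.array([np.delete(x,i).mean() for i in range(n)])  # leave-one-out means

# Classic jackknife standard error of the mean:
se_jack = np.sqrt((n-1)/n * ((theta-theta.mean())**2).sum())
# Raw spread of the jackknife samples (analogous to .std(axis=0) above):
sd_raw = theta.std(ddof=1)
print(se_jack, sd_raw, x.std(ddof=1)/np.sqrt(n))  # se_jack equals the usual SE of the mean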
In [205]:
from scipy.odr import Model,RealData,ODR
In [231]:
# Code for plotting the test-retest plot
def plot_r(sc_mean,trt_mean,subchallenge,sc_err=None,trt_err=None,scale='auto'):
    plt.figure(figsize=(12,12))
    x = np.linspace(-1,1,100)
    plt.plot(x,x,'--',c='r',linewidth=4) # Identity line: model as predictive as a retest.
    plt.errorbar(trt_mean,sc_mean,xerr=trt_err,yerr=sc_err,c='white',ecolor='black',fmt='o',markersize=24)
    plt.xlabel('Test-Retest Correlation',size=30)
    plt.ylabel('Model-Test Correlation',size=30)
    plt.xlim(0,0.7)
    plt.ylim(0,0.7)
    from scipy.stats import linregress, ttest_rel
    _,p = ttest_rel(sc_mean,trt_mean) # Computed for reference; not used below.
    r = np.corrcoef(sc_mean,trt_mean)[0,1] # Computed for reference; not used below.
    z = np.zeros(100) # Add 0's to anchor the fit near the origin.
    # Orthogonal distance regression (uses the uncertainties on both axes).
    def lin_func(p, x):
        m, b = p
        return m*x + b
    data = RealData(np.concatenate((z,trt_mean)), np.concatenate((z,sc_mean)),
                    sx=np.concatenate((z+0.01,trt_err)), sy=np.concatenate((z+0.01,sc_err)))
    lin_model = Model(lin_func)
    odr = ODR(data, lin_model, beta0=[1., 0.])
    out = odr.run()
    # Plain linear regression (does not use uncertainties), kept for comparison.
    coefs = linregress(np.concatenate((z,trt_mean)),np.concatenate((z,sc_mean))) # Zeros push the intercept toward zero.
    slope = out.beta[0] # coefs.slope
    stderr = out.sd_beta[0] # coefs.stderr
    intercept = out.beta[1] # coefs.intercept
    plt.plot(x,x*slope + intercept,'-',c='k')
    plt.text(0.01,0.45,'slope = %.2f+/-%.2f' % (slope,stderr),size=22)
    for d,descriptor in enumerate(descriptors):
        plt.text(trt_mean[descriptor],sc_mean[descriptor],letters[d],
                 fontdict={'color':'blue','size':21,'weight':'bold'},
                 horizontalalignment='center',verticalalignment='center')
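For readers unfamiliar with scipy.odr: unlike ordinary least squares, orthogonal distance regression minimizes distances to the line in both x and y, weighted by the stated uncertainties, so it does not suffer from the attenuation (slope-toward-zero) bias that noise in x induces in OLS. A minimal, self-contained comparison on synthetic data (all names and noise levels are illustrative):
In [ ]:
import numpy as np
from scipy.odr import Model, RealData, ODR
from scipy.stats import linregress

rng = np.random.RandomState(2)
x_true = np.linspace(0,1,50)
x_obs = x_true + 0.15*rng.randn(50)      # substantial noise in x (OLS ignores this)
y_obs = 0.8*x_true + 0.05*rng.randn(50)  # true slope is 0.8

def lin_func(beta, x):
    return beta[0]*x + beta[1]

data = RealData(x_obs, y_obs, sx=0.15*np.ones(50), sy=0.05*np.ones(50))
out = ODR(data, Model(lin_func), beta0=[1., 0.]).run()
ols = linregress(x_obs, y_obs)
print('ODR slope: %.3f +/- %.3f' % (out.beta[0], out.sd_beta[0]))  # close to 0.8
print('OLS slope: %.3f' % ols.slope)                               # attenuated toward 0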
In [232]:
plot_r(sc1_cv_jn.mean(axis=0),trt1_cv_jn.mean(axis=0),1,sc_err=sc1_cv_jn.std(axis=0),trt_err=trt1_cv_jn.std(axis=0))
plt.tick_params(axis='both', which='major', labelsize=18)
#plt.savefig('../../figures/test-retest_sc1.eps',format='eps')
In [319]:
# p-values for each of the points in the plot,
# compared against the null hypothesis that they lie on the identity line.
from scipy.stats import multivariate_normal,chi2
n_rvs = 1000000
def compute_pvals(sc_cv_jn,trt_cv_jn):
    ps = pd.Series(index=descriptors)
    for descriptor in descriptors:
        # Sample from a bivariate normal with the jackknife means and variances (assumed
        # independent) and estimate the probability that model-test exceeds test-retest.
        rvs = multivariate_normal.rvs([sc_cv_jn.mean(axis=0)[descriptor],trt_cv_jn.mean(axis=0)[descriptor]],
                                      [[sc_cv_jn.std(axis=0)[descriptor]**2,0],
                                       [0,trt_cv_jn.std(axis=0)[descriptor]**2]],
                                      n_rvs)
        ps[descriptor] = ((rvs[:,1] - rvs[:,0]) < 0).sum()/n_rvs
    ps_fdr = pd.Series(index=descriptors)
    for descriptor in descriptors:
        # Benjamini-Hochberg FDR correction: p * m / rank.
        ps_fdr[descriptor] = ps[descriptor] * len(descriptors) / (ps.rank()[descriptor])
        if ps_fdr[descriptor] < 0.001:
            stars = '***'
        elif ps_fdr[descriptor] < 0.01:
            stars = '**'
        elif ps_fdr[descriptor] < 0.05:
            stars = '*'
        else:
            stars = ''
        print("%s: %.4f %s" % (descriptor,ps_fdr[descriptor],stars))
    # Fisher's method: -2*sum(log p) ~ chi-squared with 2k degrees of freedom,
    # where k is the number of p-values pooled.
    fisher = -np.log(ps).sum()*2
    print("Pooled p = %.3g" % (1-chi2.cdf(fisher,2*len(ps))))
    fisher = -np.log(ps.drop('Intensity')).sum()*2
    print("Pooled p = %.3g (ignoring intensity)" % (1-chi2.cdf(fisher,2*(len(ps)-1))))
    fisher = -np.log(ps.drop(['Intensity','Pleasantness'])).sum()*2
    print("Pooled p = %.3g (ignoring intensity and pleasantness)" % (1-chi2.cdf(fisher,2*(len(ps)-2))))
compute_pvals(sc1_cv_jn,trt1_cv_jn)
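Two hedged cross-checks on the statistics above. First, because the two sampled coordinates are independent normals, the Monte Carlo p-value has a closed form: P(sc > trt) = Phi((mu_sc - mu_trt)/sqrt(sd_sc^2 + sd_trt^2)). Second, the manual p*m/rank correction is the Benjamini-Hochberg adjustment without the final monotonicity pass, and can be compared against statsmodels (an extra dependency assumed here; the toy numbers are not pipeline values):
In [ ]:
import numpy as np
from scipy.stats import norm
from statsmodels.stats.multitest import multipletests

# Closed form for the Monte Carlo step (toy means/stds):
mu_sc, sd_sc = 0.30, 0.05    # model-test mean and spread
mu_trt, sd_trt = 0.45, 0.06  # test-retest mean and spread
print('P(model-test > test-retest) = %.4f' %
      norm.cdf((mu_sc-mu_trt)/np.sqrt(sd_sc**2+sd_trt**2)))

# Benjamini-Hochberg comparison on toy p-values:
ps = np.array([0.001, 0.01, 0.03, 0.2, 0.6])
manual = ps * len(ps) / (np.argsort(np.argsort(ps)) + 1)  # p*m/rank, no monotonicity pass
_, bh, _, _ = multipletests(ps, method='fdr_bh')
print(manual)
print(bh)  # identical here because p*m/rank is already monotone for these values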
In [280]:
use_saved_rf_model = False
if use_saved_rf_model:
    with open('../../data/rfcs_norep_2.pickle','rb') as f:
        rfcs_norep_2 = pickle.load(f)
else:
    n_estimators = 3
    rfcs_norep_2,score_2,rs_2 = fit2.rfc_final(X_train,Y_train_imp,Y_train,
                                               max_features,min_samples_leaf,max_depth,
                                               use_et,use_mask,trans_weight,trans_params,
                                               n_estimators=n_estimators)
    with open('../../data/rfcs_norep_2.pickle','wb') as f:
        pickle.dump(rfcs_norep_2,f)
In [282]:
# Use the model to create the prediction arrays
Y_pred_dec = loading.make_prediction_files(rfcs_norep_2,X_test_int,X_test_other,
                                           'replicated',2,intensity_mask=intensity_mask,
                                           Y_test=None,write=False)
In [285]:
# Print a summary of the prediction quality
predicted = Y_pred_dec.to_frame()
observed = dream.filter_Y_dilutions(Y_test,'gold')['Subject']
scoring.score_summary2(predicted,observed,mask=True)
Out[285]:
In [287]:
Y_pred_dec
Out[287]:
In [313]:
# Use an average of the two models as the prediction
y_pred = (Y_pred_dec['mean'] + Y_pred_lin.mean(axis=0))/2
In [314]:
# Compute model/test correlations and test/retest correlations
# Subchallenge 2: model/test and test/retest correlations for the population mean.
sc2_cv = pd.Series(index=descriptors) # Will hold model vs. test correlations
trt2_cv = pd.Series(index=descriptors) # Will hold test vs. retest correlations
for d,descriptor in enumerate(descriptors): # Iterate over each descriptor.
    o = Y_test_gold.mean(axis=1).unstack('CID').loc[(descriptor,0)] # Test values.
    r = Y_test_gold.mean(axis=1).unstack('CID').loc[(descriptor,1)] # Retest values.
    p = y_pred[descriptor] # Predicted values for the same molecules.
    # As above, concatenate the test and retest values into one length-40 vector and
    # correlate it with the 20 predicted values listed twice, to avoid biasing
    # towards test or retest.
    rs_sc2 = pd.concat((o,r)).corr(pd.concat((p,p)))
    # Now compute the correlation between test and retest.
    rs_trt = o.corr(r)
    sc2_cv[descriptor] = rs_sc2
    trt2_cv[descriptor] = rs_trt
In [315]:
# Same as above, but computing jackknife (leave-one-molecule-out) samples to get error bars.
sc2_cv_jn = pd.DataFrame(index=rep_CIDs,columns=descriptors) # Will hold model vs. test correlations
trt2_cv_jn = pd.DataFrame(index=rep_CIDs,columns=descriptors) # Will hold test vs. retest correlations
for d,descriptor in enumerate(descriptors): # Iterate over each descriptor.
    CIDs = list(set(Y_test_gold.loc[descriptor].index.get_level_values('CID')))
    for i,CID in enumerate(CIDs): # Which CID to hold out for jackknife estimation.
        o = Y_test_gold.mean(axis=1).unstack('CID').loc[(descriptor,0)].drop(CID)
        r = Y_test_gold.mean(axis=1).unstack('CID').loc[(descriptor,1)].drop(CID)
        p = y_pred[descriptor].drop(CID) # Predicted values for the same molecules.
        rs_sc2 = pd.concat((o,r)).corr(pd.concat((p,p)))
        rs_trt = o.corr(r)
        sc2_cv_jn.loc[CID,descriptor] = rs_sc2
        trt2_cv_jn.loc[CID,descriptor] = rs_trt
In [316]:
# Sanity check: means of the jackknife samples should be approximately equal to the directly computed means
assert np.allclose(sc2_cv,sc2_cv_jn.mean(axis=0),atol=0.05)
assert np.allclose(trt2_cv,trt2_cv_jn.mean(axis=0),atol=0.05)
In [318]:
plot_r(sc2_cv,trt2_cv,2,sc_err=sc2_cv_jn.std(axis=0),trt_err=trt2_cv_jn.std(axis=0))
plt.xlim(0,1.0)
plt.ylim(0,1.0);
plt.tick_params(axis='both', which='major', labelsize=18);
#plt.savefig('../../figures/test-retest_sc2.eps',format='eps')
In [320]:
compute_pvals(sc2_cv_jn,trt2_cv_jn)
In [325]:
# Cartoon: model-test set equal to test-retest, so every point falls on the identity line.
plot_r(trt2_cv,trt2_cv,2,sc_err=trt2_cv_jn.std(axis=0),trt_err=trt2_cv_jn.std(axis=0))
plt.xlim(0,1.0)
plt.ylim(0,1.0);
plt.tick_params(axis='both', which='major', labelsize=18);
#plt.savefig('../../figures/test-retest_cartoon_2.eps',format='eps')
In [339]:
# Cartoon: model-test correlations replaced by small random noise around zero.
plot_r(pd.Series(index=descriptors,data=np.random.randn(21)/25),trt2_cv,2,
       sc_err=sc2_cv_jn.std(axis=0),trt_err=trt2_cv_jn.std(axis=0))
plt.xlim(0,1.0)
plt.ylim(-1.0,1.0);
plt.tick_params(axis='both', which='major', labelsize=18);
#plt.savefig('../../figures/test-retest_sc2.eps',format='eps')