In [1]:
# Preliminaries to work with the data.
%matplotlib inline
%run __init__.py
import matplotlib as mpl
from scipy.stats import ttest_rel
import utils
from utils import loading, scoring
from gerkin import dream,params#,fit2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# Load the data
descriptors = loading.get_descriptors(format=True)
sets = ['training','leaderboard']#,'testset'])
all_CIDs = loading.get_CIDs(sets)
all_CID_dilutions = loading.get_CID_dilutions(sets)
#features_full = dream.get_molecular_data(['dragon','episuite','morgan','nspdk','gramian'],all_CIDs)
features_drag_morg = loading.get_molecular_data(['dragon','morgan'],all_CIDs)
features_drag = loading.get_molecular_data(['dragon'],all_CIDs)
In [40]:
# Create the feature and descriptor arrays
X_drag,_,_,_,_,_ = dream.make_X(features_drag,all_CID_dilutions)
X_drag_morg,_,_,_,_,_ = dream.make_X(features_drag_morg,all_CID_dilutions)
X_drag_sq = dream.quad_prep(features_drag,all_CID_dilutions)
X_drag_morg_sq = dream.quad_prep(features_drag_morg,all_CID_dilutions)
Y = loading.load_perceptual_data(sets)
In [43]:
X_drag.drop('mean_dilution',1,inplace=True)
X_drag_morg.drop('mean_dilution',1,inplace=True)
X_drag_sq.drop('mean_dilution',1,inplace=True)
X_drag_morg_sq.drop('mean_dilution',1,inplace=True)
In [53]:
Y_mean = Y['Subject'].mean(axis=1).unstack('Descriptor')
In [54]:
n_splits = 100
use_saved_linear_feature_ranks = True
if use_saved_linear_feature_ranks:
lin_ranked_drag_morg = np.load('../../data/lin_ranked_rick.npy')
else:
lin_ranked_drag_morg = fit2.compute_linear_feature_ranks_cv(X_drag_morg_sq,Y_mean,
n_resampling=10,n_splits=n_splits)
np.save('../../data/lin_ranked_rick',lin_ranked_drag_morg)
In [55]:
use_saved_linear_feature_ranks = True
if use_saved_linear_feature_ranks:
lin_ranked_drag = np.load('../../data/lin_ranked_rick_drag.npy')
else:
lin_ranked_drag = fit2.compute_linear_feature_ranks_cv(X_drag_sq,Y__mean,
n_resampling=10,n_splits=n_splits)
np.save('../../data/lin_ranked_rick_drag',lin_ranked_drag)
In [7]:
use_saved_linear_model = True
max_features = 1000
alpha = 10.0
if use_saved_linear_model:
rs_lin_with_morgan = np.load('../../data/rs_lin_with_morgan.npy')
else:
rs_lin_with_morgan = fit2.master_cv(X_drag_morg_sq,Y_mean,n_splits=n_splits,model='ridge',
alpha=alpha,feature_list=lin_ranked_drag_morg[:,:,:max_features])
np.save('../../data/rs_lin_with_morgan',rs_lin_with_morgan)
In [8]:
use_saved_linear_model = True
if use_saved_linear_model:
rs_lin_without_morgan = np.load('../../data/rs_lin_without_morgan.npy')
else:
rs_lin_without_morgan = fit2.master_cv(X_drag_sq,Y_mean,n_splits=n_splits,model='ridge',
alpha=alpha,feature_list=lin_ranked_drag[:,:,:max_features])
np.save('../../data/rs_lin_without_morgan',rs_lin_without_morgan)
In [56]:
use_saved_rf_model = True
if use_saved_rf_model:
rs_dec_with_morgan = np.load('../../data/rs_dec_with_morgan.npy')
else:
rs_dec_with_morgan = fit2.master_cv(X_drag_morg,Y,n_estimators=50,n_splits=n_splits,model='rf')
np.save('../../data/rs_dec_with_morgan',rs_dec_with_morgan)
In [57]:
use_saved_rf_model = True
if use_saved_rf_model:
rs_dec_without_morgan = np.load('../../data/rs_dec_without_morgan.npy')
else:
rs_dec_without_morgan = fit2.master_cv(X_drag,Y,n_estimators=50,n_splits=n_splits,model='rf')
np.save('../../data/rs_dec_without_morgan',rs_dec_without_morgan)
In [58]:
n_splits = 50
# Compute differences between with and without Morgan.
wit = [rs_dec_with_morgan[:,:n_splits],rs_lin_with_morgan[:,:n_splits]]
wout = [rs_dec_without_morgan[:,:n_splits],rs_lin_without_morgan[:,:n_splits]]
diff = [wit[i].mean(axis=1) - wout[i].mean(axis=1) for i in range(2)]
order = [np.argsort(diff[i])[::-1] for i in range(2)]
order[1] = order[0]
ts,ps = [np.zeros(21),np.zeros(21)],[np.zeros(21),np.zeros(21)]
# Compute p-values and sort by effect size
for i in range(2):
for j in range(21):
ts[i][j],ps[i][j] = ttest_rel(wit[i][j,:],wout[i][j,:])
ps[i][j] = ps[i][j]*21/(np.argsort(ps[i][j])+1) # FDR correction
ts[i] = ts[i][order[i]]
ps[i] = ps[i][order[i]]
diff[i] = diff[i][order[i]]
wit[i] = wit[i][order[i],:]
wout[i] = wout[i][order[i],:]
# Compute error bars
yerr = [(wit[i] - wout[i]).std(axis=1)/np.sqrt(n_splits) for i in range(2)]
yerr1 = [[yerr[i][j] if diff[i][j]>=0 else np.nan for j in range(21)] for i in range(2)]
yerr2 = [[yerr[i][j] if diff[i][j]<0 else np.nan for j in range(21)] for i in range(2)]
In [67]:
# Plot the correlations, with and without the Morgan features.
mpl.rcParams.update({'font.size': 12})
fig,ax = plt.subplots(4,1,figsize=(15,16),sharex=False)
for i in range(2):
ax[2*i].bar(np.arange(21),wit[i].mean(axis=1),color='r',yerr=yerr1[i],
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='+morgan')
ax[2*i].bar(np.arange(21),wout[i].mean(axis=1),color='b',yerr=yerr2[i],
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='--morgan')
ax[2*i+1].bar(np.arange(21),
diff[i],
yerr=yerr[i],color='k',#width=0.4,
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),
alpha=0.5)
for j in range(21):
if ps[i][j] < 0.001:
star = '***'
elif ps[i][j] < 0.01:
star = '**'
elif ps[i][j] < 0.05:
star = '*'
else:
star = ''
ax[2*i+1].text(j,diff[i][j]+(-1 if diff[i][j]<0 else 1)*(yerr[i][j]+0.015),star,
size=12,horizontalalignment='center',verticalalignment='center')
ax[2*i].legend(fontsize=12)
ax[2*i].set_ylim(0,1)
ax[2*i+1].set_ylim(diff[i].min()-0.08,diff[i].max()+0.08)
ax[2*i].set_ylabel('Model/Data\nCorrelation')
ax[2*i+1].set_ylabel('Correlation increase\nwith Morgan')
ax[2*i+1].yaxis.labelpad = 25
ax[2*1+1].set_yticks(np.arange(-0.1,0.3,0.1))
for i in range(4):
ax[i].set_xlim(-0.5,20.5)
ax[i].set_xticks(np.arange(21))
ax[i].set_xticklabels([descriptors[j] for j in order[int(i/2)]],rotation=90)
ax[0].set_title('Random Forest Model')
ax[2].set_title('Quadratic Model')
plt.tight_layout()
plt.savefig('../../figures/with_without_morgan.eps',format='eps')