In [132]:
# Preliminaries to work with the data.
%matplotlib inline
%run __init__.py
import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream,params
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [38]:
training_leaderboard_CIDs = sorted(loading.get_CIDs('training')+loading.get_CIDs('leaderboard'))
len(training_leaderboard_CIDs)
Out[38]:
In [2]:
descriptors = loading.get_descriptors(format=True)
all_CIDs = sorted(loading.get_CIDs('training')+loading.get_CIDs('leaderboard')+loading.get_CIDs('testset'))
mdx_full = dream.get_molecular_data(['dragon','episuite','morgan','nspdk','gramian'],all_CIDs)
mdx_drag_morg = dream.get_molecular_data(['dragon','morgan'],all_CIDs)
mdx_drag = dream.get_molecular_data(['dragon'],all_CIDs)
In [3]:
# Create the feature matrices from the feature dicts.
from sklearn.preprocessing import Imputer,MinMaxScaler
X_drag,good1,good2,means,stds,imputer = dream.make_X(mdx_drag,['training','leaderboard'])
X_drag_morg,good1,good2,means,stds,imputer = dream.make_X(mdx_drag_morg,['training','leaderboard'])
def quad_prep(mdx):
X_temp,_,_,_,_,_ = dream.make_X(mdx,['training','leaderboard'],raw=True)
X_temp[np.isnan(X_temp)] = 0
X_scaled = MinMaxScaler().fit_transform(X_temp[:,:-2])
X_scaled_sq = np.hstack((X_scaled,X_scaled**2,X_temp[:,-2:]))
return X_scaled_sq
X_drag_sq = quad_prep(mdx_drag)
X_drag_morg_sq = quad_prep(mdx_drag_morg)
Y_all,imputer = dream.make_Y_obs(['training','leaderboard'],target_dilution=None,imputer='mask')
In [4]:
lin_importances = np.zeros((250,21,14613)) # 10 splits, 21 descriptors, 14613 features without the leak (7606*2 + negLogD)
lin_ranked = np.zeros((250,21,14613)).astype(int) # Matrix to store the score rankings.
lin_features = pd.read_csv('../../data/linear_scores_and_features/features_dragon_morgan.csv',nrows=0)
for split in range(250):
if split % 25 == 0:
print("Finished loading the first %d splits" % split)
for desc in range(21):
scores = pd.read_csv('../../data/linear_scores_and_features/LB_scores_morgan%d/scores_%d.csv' % (split,desc),index_col=0)
if 'Intensity' in scores.index:
scores.drop(['0','Intensity'],inplace=1) # Get rid of that weird "0" feature and the leak feature.
if 'neglog10d' not in scores.index:
scores.loc[14612] = 1 # Add in the dilution if it isn't there. Assume it is as important as possible.
scores['int'] = np.arange(14613).astype(int) # Temp index to use for ranking.
lin_ranked[split,desc,:] = scores.sort_values(by='0',ascending=0)['int'] # Sort and store the ranking.
In [5]:
lin_ranked[0,0,:] # The feature ranking for the first split, for intensity.
# negLogD (14612) should appear first if this is working.
Out[5]:
In [25]:
from sklearn.cross_validation import ShuffleSplit
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE
from sklearn.linear_model import RandomizedLasso
n_features = [1,2,3,4,5,10,33,100,333,1000,3333,10000]
n_splits = 100
DoubleSS = dream.DoubleSS
Y = Y_all['mean_std'] # Use the perceptual means (second 21 columns are std's which are ignored here)
In [26]:
def feature_sweep(X_all,Y,n_estimators,n_splits=n_splits,n_features=n_features,model='rf',wrong_split=False,max_features='auto',
max_depth=None,min_samples_leaf=1,alpha=1.0,rfe=False,use_rick_lin_ranks=False,random_state=0):
rs = np.ma.zeros((21,len(n_features),n_splits)) # Empty matrix to store correlations.
n_obs = int(X_all.shape[0]/2) # Number of molecules.
shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This will produce the splits in
# train/test_big that I put on GitHub
for col in range(0,21): # For each descriptor.
print(col)
X = X_all[:,:-1] # Remove high/low dilution feature, i.e. remove the leak.
observed = Y[:,col] # Perceptual data for this descriptor.
n_features_ = list(np.array(n_features)+(col==0))
n_features
cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.
for j,(train,test) in enumerate(cv):
if model == 'rf': # If the model is random forest regression.
if col==0:
est = ExtraTreesRegressor(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth,
min_samples_leaf=min_samples_leaf,n_jobs=8,random_state=random_state)
else:
est = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth,
min_samples_leaf=min_samples_leaf,oob_score=False,n_jobs=8,random_state=random_state)
elif model == 'ridge': # If the model is ridge regression.
est = Ridge(alpha=alpha,fit_intercept=True, normalize=False,
copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=random_state)
if rfe:
rfe = RFE(estimator=est, step=n_features_, n_features_to_select=1)
rfe.fit(X[train,:],observed[train])
else:
est.fit(X[train,:],observed[train]) # Fit the model on the training data.
if model == 'rf':
importance_ranks = np.argsort(est.feature_importances_)[::-1] # Use feature importances to get ranks.
elif model == 'ridge':
if use_rick_lin_ranks:
importance_ranks = lin_ranked_rick[int(j+wrong_split) % n_splits,col,:] # Use the pre-computed ranks.
else:
importance_ranks = lin_ranked[int(j+wrong_split) % n_splits,col,:] # Use the pre-computed ranks.
for i,n_feat in enumerate(n_features_):
if col==0:
n_feat += 1 # Add one for intensity since negLogD is worthless when all concentrations are 1/1000.
if hasattr(est,'max_features') and est.max_features not in [None,'auto']:
if n_feat < est.max_features:
est.max_features = n_feat
if rfe:
est.fit(X[train,:][:,rfe.ranking_<=(1+i)],observed[train])
predicted = est.predict(X[test,:][:,rfe.ranking_<=(1+i)])
else:
#est.max_features = None
est.fit(X[train,:][:,importance_ranks[:n_feat]],
observed[train]) # Fit the model on the training data with max_features features.
predicted = est.predict(X[test,:][:,importance_ranks[:n_feat]]) # Predict the test data.
rs[col,i,j] = np.corrcoef(predicted,observed[test])[1,0] # Compute the correlation coefficient.
means = rs[col,:,:].mean(axis=1)
sems = rs[col,:,:].std(axis=1)/np.sqrt(n_splits)
return rs
In [8]:
# The right way (with feature scores and data from the same splits)
rs_lin_right = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',alpha=1.0,rfe=False)
In [9]:
# The wrong way (with feature scores and data from different splits)
rs_lin_wrong = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',wrong_split=True,alpha=1.0,rfe=False)
In [21]:
def plot(right,wrong,labels=['right','wrong']):
fig,axes = plt.subplots(3,7,sharex=False,sharey=True,figsize=(20,10))
for col,ax in enumerate(axes.flat):
ax.errorbar(n_features,right[col,:,:].mean(axis=1),right[col,:,:].std(axis=1)/np.sqrt(n_splits),color='red',label=labels[0])
ax.errorbar(n_features,wrong[col,:,:].mean(axis=1),wrong[col,:,:].std(axis=1)/np.sqrt(n_splits),color='blue',label=labels[1])
if col==0:
handles, labels = ax.get_legend_handles_labels()
lg = ax.legend(handles[0:], labels[0:], loc=4, fontsize=16)
lg.draw_frame(False)
ax.set_xlim(0.5,20000)
ax.set_ylim(0,0.8)
ax.set_yticks(np.linspace(0,0.6,4))
ax.set_yticklabels([_ for _ in np.linspace(0,0.6,4)],size=20)
ax.set_xticklabels(n_features,size=20)
ax.set_xscale('log')
ax.set_title(descriptors[col], size=25)
plt.tight_layout()
fig.text(0.5, -0.025, 'Number of features', ha='center', size=25)
fig.text(-0.02, 0.5, 'Correlation', va='center', rotation='vertical', size=25);
In [12]:
plot(rs_lin_right,rs_lin_wrong)
In [13]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
In [27]:
lin_ranked_rick = np.zeros((n_splits,21,14613)).astype(int) # Matrix to store the score rankings.
rl = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=10,random_state=25,n_jobs=1)
n_obs = int(X_drag_morg_sq.shape[0]/2) # Number of molecules.
shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This will produce the splits in
# train/test_big that I put on GitHub
X_all = X_drag_morg_sq
for col in range(21):
cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.
for j,(train,test) in enumerate(cv):
print(descriptors[col],j)
observed = Y[train,col]
X = X_all[train,:-1] # Remove leak feature
rl.fit(X,observed)
lin_ranked_rick[j,col,:] = np.argsort(rl.all_scores_.ravel())[::-1]
In [29]:
#np.save('../../data/lin_ranked_rick',lin_ranked_rick)
In [88]:
rs_lin_rick_right_10 = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',wrong_split=False,alpha=10.0,rfe=False,use_rick_lin_ranks=True)
In [89]:
rs_lin_rick_wrong_10 = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',wrong_split=True,alpha=10.0,rfe=False,use_rick_lin_ranks=True)
In [90]:
plot(rs_lin_rick_right,rs_lin_rick_right_10)
In [91]:
plot(rs_lin_rick_right_10,rs_lin_rick_wrong_10)
In [36]:
#np.save('../../data/rs_lin_rick_right',rs_lin_rick_right.data)
In [128]:
rs0 = np.load('../../data/forest_rs_250_0.npy')
rs0_9 = np.load('../../data/forest_rs_250_0-9.npy')
rs10_20 = np.load('../../data/forest_rs_250_10-20.npy')
rs_forest = rs0_9.copy()
rs_forest[0,:,:] = rs0[0,:,:]
rs_forest[10:,:,:] = rs10_20[10:,:,:]
rs_forest = np.ma.array(rs_forest,mask=np.isnan(rs_forest))
In [129]:
plot(rs_lin_rick_right_10,rs_forest[:,:,:n_splits],labels=['quad','forest'])
plt.savefig('../../figures/quad_forest_vs_features.eps',format='eps')
In [97]:
lin_ranked_rick_drag = np.zeros((n_splits,21,9739)).astype(int) # Matrix to store the score rankings.
rl = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=10,random_state=25,n_jobs=1)
n_obs = int(X_drag_sq.shape[0]/2) # Number of molecules.
shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This will produce the splits in
# train/test_big that I put on GitHub
X_all = X_drag_sq
for col in range(21):
cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.
for j,(train,test) in enumerate(cv):
print(descriptors[col],j)
observed = Y[train,col]
X = X_all[train,:-1] # Remove leak feature
rl.fit(X,observed)
lin_ranked_rick_drag[j,col,:] = np.argsort(rl.all_scores_.ravel())[::-1]
In [95]:
n_splits = 25
In [119]:
def master_cv(X_all,Y,n_estimators,n_splits=25,model='rfc',alpha=10.0,random_state=0,feature_list=slice(None)):
rs = np.zeros((21,n_splits))
n_obs = int(X_all.shape[0]/2)
shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This random state *must* be zero.
for col in range(21):
print(col)
X = X_all[:,:-1] # Remove high/low dilution feature.
observed = Y[:,col]
n_features_ = list(np.array(n_features)+(col==0))
cv = DoubleSS(shuffle_split, col, X_all[:,-2])
for j,(train,test) in enumerate(cv):
#print(col,j)
if model == 'rfc':
if col==0:
est = ExtraTreesRegressor(n_estimators=n_estimators,max_features=max_features[col], max_depth=max_depth[col],
min_samples_leaf=min_samples_leaf[col],
n_jobs=8,random_state=0)
else:
est = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features[col], max_depth=max_depth[col],
min_samples_leaf=min_samples_leaf[col],
oob_score=False,n_jobs=8,random_state=0)
elif model == 'ridge':
est = Ridge(alpha=alpha,random_state=random_state)
est.fit(X[train,:][:,feature_list[j,col,:]],observed[train])
predicted = est.predict(X[test,:][:,feature_list[j,col,:]])
rs[col,j] = np.corrcoef(predicted,observed[test])[1,0]
mean = rs[col,:].mean()
sem = rs[col,:].std()/np.sqrt(n_splits)
print(('Desc. %d: %.3f' % (col,mean)))
return rs
In [125]:
rs_lin_drag = master_cv(X_drag_sq,Y,None,n_splits=25,model='ridge',alpha=10.0,feature_list=lin_ranked_rick_drag[:,:,:1000])
In [126]:
rs_lin_drag_morg = master_cv(X_drag_morg_sq,Y,None,n_splits=25,model='ridge',alpha=10.0,feature_list=lin_ranked_rick[:,:,:1000])
In [123]:
from scipy.stats import ttest_rel
wit = [rs_lin_drag_morg.copy(),rs_lin_drag_morg.copy()]
wout = [rs_lin_drag.copy(),rs_lin_drag.copy()]
diff = [wit[i].mean(axis=1) - wout[i].mean(axis=1) for i in range(2)]
order = [np.argsort(diff[i])[::-1] for i in range(2)]
ts,ps = [np.zeros(21),np.zeros(21)],[np.zeros(21),np.zeros(21)]
# Compute p-values and sort by effect size
for i in range(2):
for j in range(21):
ts[i][j],ps[i][j] = ttest_rel(wit[i][j,:],wout[i][j,:])
ps[i][j] = ps[i][j]*21/(np.argsort(ps[i][j])+1) # FDR correction
ts[i] = ts[i][order[i]]
ps[i] = ps[i][order[i]]
diff[i] = diff[i][order[i]]
wit[i] = wit[i][order[i],:]
wout[i] = wout[i][order[i],:]
fig,ax = plt.subplots(4,1,figsize=(15,16),sharex=True)
yerr = [(wit[i] - wout[i]).std(axis=1)/np.sqrt(n_splits) for i in range(2)]
yerr1 = [[yerr[i][j] if diff[i][j]>=0 else np.nan for j in range(21)] for i in range(2)]
yerr2 = [[yerr[i][j] if diff[i][j]<0 else np.nan for j in range(21)] for i in range(2)]
for i in range(2):
ax[2*i].bar(np.arange(21),wit[i].mean(axis=1),color='r',yerr=yerr1[i],
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='+morgan')
ax[2*i].bar(np.arange(21),wout[i].mean(axis=1),color='b',yerr=yerr2[i],
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='--morgan')
ax[2*i+1].bar(np.arange(21),
diff[i],
yerr=yerr[i],color='k',#width=0.4,
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),
alpha=0.5)
for j in range(21):
if ps[i][j] < 0.001:
star = '***'
elif ps[i][j] < 0.01:
star = '**'
elif ps[i][j] < 0.05:
star = '*'
else:
star = ''
ax[2*i+1].text(j+0.42,diff[i][j]+(-1 if diff[i][j]<0 else 1)*(yerr[i][j]+0.015),star,
size=12,horizontalalignment='center',verticalalignment='center')
ax[2*i].legend(fontsize=12)
ax[2*i].set_ylim(0,1)
ax[2*i+1].set_ylim(diff[i].min()-0.08,diff[i].max()+0.08)
ax[2*i].set_ylabel('Model/Data\nCorrelation')
ax[2*i+1].set_ylabel('Correlation increase\nwith Morgan')
ax[2*i+1].yaxis.labelpad = 25
ax[2*1+1].set_yticks(np.arange(-0.1,0.3,0.1))
for i in range(4):
ax[i].set_xlim(0,21)
ax[i].set_xticks(np.arange(21)+0.4)
ax[i].set_xticklabels([_.split('/')[0].lower() for _ in [descriptors[j] for j in order[int(i/2)]]],rotation=90)
plt.tight_layout()
"""
plt.figure(figsize=(10,10))
plt.errorbar(wout.mean(axis=1),
wit.mean(axis=1),
xerr=wout.std(axis=1)/np.sqrt(n_splits),
yerr=wit.std(axis=1)/np.sqrt(n_splits),
fmt='o',markersize=0)
for j in range(21):
plt.text(wout.mean(axis=1)[j],wit.mean(axis=1)[j],letters[j],
fontdict={'size':14,'weight':'bold'},horizontalalignment='center',verticalalignment='center')
plt.plot([0,1],[0,1],'--')
plt.xlabel('without Morgan')
plt.ylabel('with Morgan');
""";
plt.savefig('/Users/rgerkin/Desktop/with_without_morgan.eps',format='eps')
In [127]:
from scipy.stats import ttest_rel
wit = [rs_lin_drag_morg.copy(),rs_lin_drag_morg.copy()]
wout = [rs_lin_drag.copy(),rs_lin_drag.copy()]
diff = [wit[i].mean(axis=1) - wout[i].mean(axis=1) for i in range(2)]
order = [np.argsort(diff[i])[::-1] for i in range(2)]
ts,ps = [np.zeros(21),np.zeros(21)],[np.zeros(21),np.zeros(21)]
# Compute p-values and sort by effect size
for i in range(2):
for j in range(21):
ts[i][j],ps[i][j] = ttest_rel(wit[i][j,:],wout[i][j,:])
ps[i][j] = ps[i][j]*21/(np.argsort(ps[i][j])+1) # FDR correction
ts[i] = ts[i][order[i]]
ps[i] = ps[i][order[i]]
diff[i] = diff[i][order[i]]
wit[i] = wit[i][order[i],:]
wout[i] = wout[i][order[i],:]
fig,ax = plt.subplots(4,1,figsize=(15,16),sharex=True)
yerr = [(wit[i] - wout[i]).std(axis=1)/np.sqrt(n_splits) for i in range(2)]
yerr1 = [[yerr[i][j] if diff[i][j]>=0 else np.nan for j in range(21)] for i in range(2)]
yerr2 = [[yerr[i][j] if diff[i][j]<0 else np.nan for j in range(21)] for i in range(2)]
for i in range(2):
ax[2*i].bar(np.arange(21),wit[i].mean(axis=1),color='r',yerr=yerr1[i],
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='+morgan')
ax[2*i].bar(np.arange(21),wout[i].mean(axis=1),color='b',yerr=yerr2[i],
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='--morgan')
ax[2*i+1].bar(np.arange(21),
diff[i],
yerr=yerr[i],color='k',#width=0.4,
error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),
alpha=0.5)
for j in range(21):
if ps[i][j] < 0.001:
star = '***'
elif ps[i][j] < 0.01:
star = '**'
elif ps[i][j] < 0.05:
star = '*'
else:
star = ''
ax[2*i+1].text(j+0.42,diff[i][j]+(-1 if diff[i][j]<0 else 1)*(yerr[i][j]+0.015),star,
size=12,horizontalalignment='center',verticalalignment='center')
ax[2*i].legend(fontsize=12)
ax[2*i].set_ylim(0,1)
ax[2*i+1].set_ylim(diff[i].min()-0.08,diff[i].max()+0.08)
ax[2*i].set_ylabel('Model/Data\nCorrelation')
ax[2*i+1].set_ylabel('Correlation increase\nwith Morgan')
ax[2*i+1].yaxis.labelpad = 25
ax[2*1+1].set_yticks(np.arange(-0.1,0.3,0.1))
for i in range(4):
ax[i].set_xlim(0,21)
ax[i].set_xticks(np.arange(21)+0.4)
ax[i].set_xticklabels([_.split('/')[0].lower() for _ in [descriptors[j] for j in order[int(i/2)]]],rotation=90)
plt.tight_layout()
"""
plt.figure(figsize=(10,10))
plt.errorbar(wout.mean(axis=1),
wit.mean(axis=1),
xerr=wout.std(axis=1)/np.sqrt(n_splits),
yerr=wit.std(axis=1)/np.sqrt(n_splits),
fmt='o',markersize=0)
for j in range(21):
plt.text(wout.mean(axis=1)[j],wit.mean(axis=1)[j],letters[j],
fontdict={'size':14,'weight':'bold'},horizontalalignment='center',verticalalignment='center')
plt.plot([0,1],[0,1],'--')
plt.xlabel('without Morgan')
plt.ylabel('with Morgan');
""";
plt.savefig('/Users/rgerkin/Desktop/with_without_morgan.eps',format='eps')
In [ ]: