Figure S7A


In [1]:
# Preliminaries to work with the data.   
%matplotlib inline
# Pull in the package path / environment setup from the project's __init__.py.
%run __init__.py
# Project-local helpers: data loading, scoring metrics, and a progress printer.
from utils import loading, scoring, prog
# DREAM-challenge modeling utilities (feature matrices, hyperparameters, fitting).
from gerkin import dream,params,fit2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import matplotlib as mpl
# Global figure font size for all panels below.
mpl.rcParams.update({'font.size':14})

In [2]:
# Load the data
# NOTE: the original passed format='True' (the *string*), which is truthy but
# would not match an `== True` check inside get_descriptors; the boolean flag
# is presumably what was intended -- TODO confirm against loading.get_descriptors.
descriptors = loading.get_descriptors(format=True)
all_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
testset_CIDs = loading.get_CIDs(['testset'])
all_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])
# Gramian/NSPDK/EpiSuite features were considered but not used for this figure:
#mdx_full = dream.get_molecular_data(['dragon','episuite','morgan','nspdk','gramian'],all_CIDs)
features = loading.get_molecular_data(['dragon','morgan'],all_CIDs)


Dragon has 4869 features for 476 molecules.
Morgan has 2437 features for 476 molecules.
There are now 7306 total features.

In [3]:
# Create the feature and descriptor arrays 
# dream.make_X returns a 6-tuple; only the feature matrix is used here.
# (The discarded values are presumably masks/normalization info -- TODO confirm.)
X,_,_,_,_,_ = dream.make_X(features,all_CID_dilutions)
# Training split: every row whose CID is not in the test set...
X_train = X.drop(testset_CIDs)
# ...and the test split is the complement (rows dropped above).
X_test = X.drop(X_train.index)


The X matrix now has shape (814x5519) molecules by non-NaN good molecular descriptors

In [ ]:
# Load and split perceptual data
# Training targets come from the training + leaderboard phases of the challenge.
Y_train = loading.load_perceptual_data(['training','leaderboard'])
Y_train = Y_train.groupby(level=['Descriptor','CID','Dilution']).mean() # Average over replicates
# Held-out targets: the final test-set ratings, kept per-replicate.
Y_test = loading.load_perceptual_data('testset')

Fit the random forest models and compute the within- and between-subject-group prediction correlations


In [ ]:
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor

# For each descriptor, fit a random forest on the mean rating of a random
# 25-subject subset, then correlate its test-set predictions with (a) the same
# subjects' held-out mean ratings ("in") and (b) the complementary 24 subjects'
# mean ratings ("out").
n_subjects = 49
n_splits = 25
trans_params = params.get_trans_params(Y_train, descriptors, plot=False)
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
# test_size = 24/49: 25 subjects train each split, the other 24 are the
# "different subjects" comparison group.
ss = ShuffleSplit(n_splits=n_splits,test_size=(24./49),random_state=0)
rs_in = pd.DataFrame(index=range(n_splits),columns=descriptors) # Same subject, different molecules correlations.  
rs_out = pd.DataFrame(index=range(n_splits),columns=descriptors) # Different subject, different molecule correlations.
for d,descriptor in enumerate(descriptors):
    print("%d) %s" % (d,descriptor))
    # max_features='auto' was deprecated and removed in sklearn >= 1.3; for a
    # regressor it meant "use all features", which is exactly max_features=1.0.
    rfc = RandomForestRegressor(n_estimators=30, max_features=1.0, random_state=0)
    for i,(train,test) in enumerate(ss.split(range(n_subjects))):
        prog(i,n_splits)
        # Subjects are 1-indexed; build new arrays rather than mutating in place.
        train = train + 1
        test = test + 1
        rfc.fit(X_train,Y_train['Subject'][train].mean(axis=1).loc[descriptor])
        Y_test_in = Y_test['Subject'][train].mean(axis=1).loc[descriptor]
        Y_test_out = Y_test['Subject'][test].mean(axis=1).loc[descriptor]
        Y_predicted = rfc.predict(X_test)
        rs_in.loc[i,descriptor] = np.corrcoef(Y_predicted,Y_test_in)[0,1]
        rs_out.loc[i,descriptor] = np.corrcoef(Y_predicted,Y_test_out)[0,1]


0) Intensity
4.00% [--                                                ]

In [ ]:
# 25 splits x 30 trees: four-panel summary of within- vs between-group correlations.
fig,axes = plt.subplots(2,2,figsize=(10,10))
ax = axes.flat
# Panel 0: descriptors in their original order.
ax[0].errorbar(range(len(descriptors)),rs_in.mean(),yerr=rs_in.sem(),
             color='k',fmt='o-',label='Same %d subjects' % 25)
ax[0].errorbar(range(len(descriptors)),rs_out.mean(),yerr=rs_out.sem(),
             color='r',fmt='o-',label='Different %d subjects' % 24)
# Panel 1: same data, descriptors sorted by the "same subjects" correlation.
order = rs_in.mean().sort_values()[::-1].index
ax[1].errorbar(range(len(descriptors)),rs_in.mean()[order],yerr=rs_in.sem()[order],
             color='k',fmt='o-',label='Same %d subjects' % 25)
ax[1].errorbar(range(len(descriptors)),rs_out.mean()[order],yerr=rs_out.sem()[order],
             color='r',fmt='o-',label='Different %d subjects' % 24)
for i in [0,1]:
    ax[i].set_xlim(-0.5,len(descriptors)-0.5)
    ax[i].set_ylim(0,0.82)
    # Pin tick positions before labeling, and label each panel with the order
    # its data was actually plotted in (panel 0 was NOT sorted, so giving it
    # the sorted labels -- as the original code did -- mislabeled its points).
    ax[i].set_xticks(range(len(descriptors)))
    ax[i].set_xticklabels(descriptors if i==0 else order,rotation=90)
    ax[i].set_ylabel('Correlation')
    ax[i].legend(fontsize=10)
# Panel 2: in- vs out-of-group correlation per descriptor, with unity line.
ax[2].errorbar(rs_in.mean(),rs_out.mean(),
             xerr=rs_in.sem(),yerr=rs_out.sem(),
             color='k',fmt='o')
ax[2].plot([0,1],[0,1],'--')
ax[2].set_xlim(0,0.82)
ax[2].set_xlabel('Correlation\n(Same 25 subjects)')
# The held-out group has 24 subjects (49 - 25), matching the legends above.
ax[2].set_ylabel('Correlation\n(Different 24 subjects)')
# Panel 3: the in-minus-out difference, sorted by its mean.
order = (rs_in-rs_out).mean().sort_values()[::-1].index
ax[3].errorbar(range(len(descriptors)),(rs_in-rs_out).mean()[order],
               yerr=(rs_in-rs_out).sem()[order],
               color='k',fmt='o-')
ax[3].plot([0,len(descriptors)],[0,0],'--')
ax[3].set_xlim(-0.5,len(descriptors)-0.5)
ax[3].set_ylim(-0.05,0.1)
ax[3].set_xticks(range(len(descriptors)))
ax[3].set_xticklabels(order,rotation=90)
ax[3].set_ylabel('Correlation Difference')

plt.tight_layout()
plt.savefig('../../figures/subject-splits.eps',format='eps')

In [ ]:
# Summarize the in-minus-out correlation gap and pool per-descriptor
# significance with Fisher's method (no FDR correction).
diff = rs_in - rs_out
diff_mean = diff.mean()
print('%.3f +/- %.3f, with maximum value %.3f' % \
      (diff_mean.mean(),diff_mean.std(),diff_mean.max()))
from scipy.stats import ttest_rel,chi2
# No FDR correction
chi2_ = 0
for descriptor in descriptors:
    # Paired t-test across splits for this descriptor.
    p = ttest_rel(rs_in[descriptor],rs_out[descriptor])[1]
    # Fisher's method: -2*sum(log p) ~ chi-square with 2k degrees of freedom.
    chi2_ += -2*np.log(p)
    print('%s%.3f' % ((descriptor+':').ljust(15),p))
p_pooled = 1-chi2.cdf(chi2_,2*len(descriptors))
print("Pooled p-value = %.3g" % p_pooled)