In [1]:
# Preliminaries to work with the data.
%matplotlib inline
%run __init__.py
from utils import loading, scoring, prog
from gerkin import dream,params,fit2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
mpl.rcParams.update({'font.size':14})
In [2]:
# Load the data
descriptors = loading.get_descriptors(format='True')
all_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
testset_CIDs = loading.get_CIDs(['testset'])
all_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])
#mdx_full = dream.get_molecular_data(['dragon','episuite','morgan','nspdk','gramian'],all_CIDs)
features = loading.get_molecular_data(['dragon','morgan'],all_CIDs)
In [3]:
# Create the feature and descriptor arrays.
# make_X returns the feature matrix X (one row per CID/dilution) plus five
# other values that are not needed here.
X,_,_,_,_,_ = dream.make_X(features,all_CID_dilutions)
# Train = all rows whose index is not a test-set CID; test = the complement
# (drop the training index from X). Using drop keeps the split exhaustive
# and non-overlapping regardless of row order.
X_train = X.drop(testset_CIDs)
X_test = X.drop(X_train.index)
In [ ]:
# Load and split perceptual data
Y_train = loading.load_perceptual_data(['training','leaderboard'])
Y_train = Y_train.groupby(level=['Descriptor','CID','Dilution']).mean() # Average over replicates
Y_test = loading.load_perceptual_data('testset')
In [ ]:
# For each descriptor, fit a random forest on ratings averaged over a random
# subset of 25 subjects ("in" group), then correlate its test-set predictions
# with (a) the same subjects' averages and (b) the held-out 24 subjects'
# averages. Repeated over n_splits random subject splits.
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
n_subjects = 49  # total number of subjects in the study
n_splits = 25  # number of random subject splits
trans_params = params.get_trans_params(Y_train, descriptors, plot=False)
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
# NOTE(review): the tuned hyperparameters unpacked above (max_features,
# max_depth, min_samples_leaf, ...) are never passed to the regressor below,
# which hard-codes max_features='auto' -- confirm this is intentional.
ss = ShuffleSplit(n_splits=n_splits,test_size=(24./49),random_state=0)  # 25 train / 24 test subjects per split
rs_in = pd.DataFrame(index=range(n_splits),columns=descriptors) # Same subject, different molecules correlations.
rs_out = pd.DataFrame(index=range(n_splits),columns=descriptors) # Different subject, different molecule correlations.
for d,descriptor in enumerate(descriptors):
    print("%d) %s" % (d,descriptor))
    rfc = RandomForestRegressor(n_estimators=30, max_features='auto', random_state=0)
    for i,(train,test) in enumerate(ss.split(range(n_subjects))):
        prog(i,n_splits)  # progress indicator
        train+=1; test+=1; # Subjects are 1-indexed.
        # Fit on the "in"-group subjects' mean rating for this descriptor.
        rfc.fit(X_train,Y_train['Subject'][train].mean(axis=1).loc[descriptor])
        # Targets on the test molecules: same subjects vs. held-out subjects.
        Y_test_in = Y_test['Subject'][train].mean(axis=1).loc[descriptor]
        Y_test_out = Y_test['Subject'][test].mean(axis=1).loc[descriptor]
        Y_predicted = rfc.predict(X_test)
        rs_in.loc[i,descriptor] = np.corrcoef(Y_predicted,Y_test_in)[0,1]
        rs_out.loc[i,descriptor] = np.corrcoef(Y_predicted,Y_test_out)[0,1]
In [ ]:
# 25 x 30
fig,axes = plt.subplots(2,2,figsize=(10,10))
ax = axes.flat
ax[0].errorbar(range(len(descriptors)),rs_in.mean(),yerr=rs_in.sem(),
color='k',fmt='o-',label='Same %d subjects' % 25)
ax[0].errorbar(range(len(descriptors)),rs_out.mean(),yerr=rs_out.sem(),
color='r',fmt='o-',label='Different %d subjects' % 24)
order = rs_in.mean().sort_values()[::-1].index
ax[1].errorbar(range(len(descriptors)),rs_in.mean()[order],yerr=rs_in.sem()[order],
color='k',fmt='o-',label='Same %d subjects' % 25)
ax[1].errorbar(range(len(descriptors)),rs_out.mean()[order],yerr=rs_out.sem()[order],
color='r',fmt='o-',label='Different %d subjects' % 24)
for i in [0,1]:
ax[i].set_xlim(-0.5,len(descriptors)-0.5)
ax[i].set_ylim(0,0.82)
ax[i].set_xticklabels(order,rotation=90);
ax[i].set_ylabel('Correlation')
ax[i].legend(fontsize=10)
ax[2].errorbar(rs_in.mean(),rs_out.mean(),
xerr=rs_in.sem(),yerr=rs_out.sem(),
color='k',fmt='o')
ax[2].plot([0,1],[0,1],'--')
ax[2].set_xlim(0,0.82)
ax[2].set_xlabel('Correlation\n(Same 25 subjects)')
ax[2].set_ylabel('Correlation\n(Different 25 subjects)')
order = (rs_in-rs_out).mean().sort_values()[::-1].index
ax[3].errorbar(range(len(descriptors)),(rs_in-rs_out).mean()[order],
yerr=(rs_in-rs_out).sem()[order],
color='k',fmt='o-')
ax[3].plot([0,len(descriptors)],[0,0],'--')
ax[3].set_xlim(-0.5,len(descriptors)-0.5)
ax[3].set_ylim(-0.05,0.1)
ax[3].set_xticklabels(order,rotation=90);
ax[3].set_ylabel('Correlation Difference')
plt.tight_layout()
plt.savefig('../../figures/subject-splits.eps',format='eps')
In [ ]:
print('%.3f +/- %.3f, with maximum value %.3f' % \
((rs_in-rs_out).mean().mean(),(rs_in-rs_out).mean().std(),(rs_in-rs_out).mean().max()))
from scipy.stats import ttest_rel,chi2
# No FDR correction
chi2_ = 0
for d,descriptor in enumerate(descriptors):
p = ttest_rel(rs_in[descriptor],rs_out[descriptor])[1]
chi2_ += -2*np.log(p)
print('%s%.3f' % ((descriptor+':').ljust(15),p))
p_pooled = 1-chi2.cdf(chi2_,2*len(descriptors))
print("Pooled p-value = %.3g" % p_pooled)