In [1]:
# Preliminaries to work with the data.
%matplotlib inline
%run __init__.py
from utils import loading, scoring, prog
from gerkin import dream,params,fit2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# Load CIDs for all relevant molecules.
# dream: from the DREAM challenge
# new: from new unpublished data collected by A. Keller.
# shared: the overlap
dream_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
df_new_CIDs = pd.read_csv('../../data/newdata_cids.txt',delimiter='\t',index_col=0,header=None,names=['CID'])['CID']
new_CIDs = list(df_new_CIDs)
shared_CIDs = list(set(dream_CIDs).intersection(new_CIDs))
dream_only_CIDs = list(set(dream_CIDs).difference(new_CIDs))
new_only_CIDs = list(set(new_CIDs).difference(dream_CIDs))
# Delete this line when we get these features.
In [3]:
# At time of publication, the features for one molecule, Bourgeonal,
# had not been computed and included in the analysis.
# We have since added the features for this molecule and updated the analysis,
# with no qualitative change in the results.
#new_CIDs.remove(64832) # Remove Bourgeonal since we have no Dragon features for it.
# Delete this line when we get these features.
#new_only_CIDs.remove(64832) # Remove Bourgeonal since we have no Dragon features for it.
# Delete this line when we get these features.
In [4]:
# Load all the perceptual data from the DREAM challenge.
df_dream = loading.load_perceptual_data(['training','leaderboard','testset'])
df_dream.head()
Out[4]:
In [5]:
# Load and format the new unpublished perceptual data from A. Keller.
df = pd.read_excel('../../data/0869 final data Pablo.xlsx',index_col=0,header=None).transpose().drop(1)
df = df.drop(df.loc[df['Subject Identifier']=='citronella'].index) # This is a mixture and so not used in this project
df = df.drop(df.loc[df['Subject Identifier']=='bourgeonal'].index) # I don't have Dragon features for this.
# Delete line when those features are available.
mux = pd.MultiIndex.from_arrays(df.values[:,[0,3,1,2]].transpose(), names=['Descriptor','CID','Dilution','Solvent'])
df = pd.DataFrame(data=df.values[:,4:], index=mux, dtype=float)
# Compute log diliutions
dilutions = set(df.index.levels[2])
dilutions.remove('pure')
log_dilutions = {dilution:-np.round(np.log10(float(dilution[2:].replace(',',''))),2) for dilution in dilutions}
log_dilutions['pure'] = 0
df = df.rename(index=log_dilutions)
# Fix odorant names
odorants = {x:x.replace(' (1M solution)','').replace(' (w/v)','') for x in set(df.index.levels[1])}
df = df.rename(index=odorants)
df = df.rename(index={x:x.title() for x in df.index.levels[0]})
# Sort by descriptor, then odor, then solvent, then dilution
df = df.sortlevel()
# Drop rows where the odorant is NaN.
df = df.loc[~(df.index.labels[1] == -1)]
df.insert(0,'Name',[ix[1] for ix in df.index])
df.insert(1,'Solvent',[ix[3] for ix in df.index])
df = df.rename(index={name:int(df_new_CIDs.loc[name]) for name in df_new_CIDs.index})
#df = df.rename(columns={x:'S%d'%x for x in df.columns if type(x) is int})
df.index = df.index.droplevel('Solvent')
df.insert(1,'Replicate',[i % 2 for i in range(len(df.index))])
df = df.set_index('Replicate',append=True)
df = df.sortlevel()
df.columns = [['Metadata']*2+['Subject']*403,df.columns]
df_new = df.copy()
df_new.head()
Out[5]:
In [6]:
# Make lits of all CIDs and dilutions for both datasets.
dream_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])
new_CID_dilutions = [tuple(x[1:3]) for x in df_new.index.values]
all_CID_dilutions = set(dream_CID_dilutions + new_CID_dilutions)
dream_only_CID_dilutions = [x for x in dream_CID_dilutions if x[0] not in new_CIDs]
shared_CID_dilutions_old = [x for x in dream_CID_dilutions if x[0] in new_CIDs]
shared_CID_dilutions_new = [x for x in new_CID_dilutions if x[0] in dream_CIDs]
new_only_CID_dilutions = [tuple(x[1:3]) for x in df_new.index.values if x[1] not in dream_CIDs]
In [7]:
# Load the Dragon and Morgan features for all of the DREAM molecules.
dream_features = loading.get_molecular_data(['dragon','morgan'],dream_CIDs)
dream_features.head()
Out[7]:
In [8]:
# Load Dragon features for all of the "new" molecules.
new_features = pd.read_excel('../../data/dragon descriptors for 0869 odors.xlsx',index_col=0)
# Bourgeonal features were not computed until later.
bourgeonal_features= pd.read_csv('../../data/CID64832_dragon_descriptors.txt',delimiter='\t',index_col=1).drop('No.',1)
bourgeonal_features.index.name = 'CID'
new_features = pd.concat((new_features,bourgeonal_features))
new_features.head()
Out[8]:
In [9]:
# Make a single data frame will all of these features, but only use Dragon ones because we didn't
# compute the Morgan fingerprints for the new molecules.
all_features = pd.concat([dream_features['dragon'].drop('complexity from pubmed',1),new_features])
all_features.head()
Out[9]:
In [10]:
# Create the feature and descriptor arrays
X,good1,good2,means,stds,imputer = dream.make_X(all_features,all_CID_dilutions)
In [11]:
# Functions to select only the appropriate dilutions (to match the challenge rules).
def intensity_dilutions(CID_dilutions):
int_CID_dilutions = []
CIDs = list(set([x[0] for x in CID_dilutions]))
for CID in CIDs:
dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
rel_dilutions = np.abs(dilutions + 3)
int_CID_dilutions += [(CID,dilutions[rel_dilutions.argmin()])]
return int_CID_dilutions
def max_dilutions(CID_dilutions):
max_CID_dilutions = []
CIDs = list(set([x[0] for x in CID_dilutions]))
for CID in CIDs:
dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
max_CID_dilutions += [(CID,dilutions.max())]
return max_CID_dilutions
In [12]:
# Fit the models for each scenario.
# In all cases, the model is trained with the molecules used in the DREAM challenge.
# There are 4 test cases, corresponding to:
# old_old: The molecules common to DREAM and the new data, using the same subjects (i.e. in sample)
# old_new: Same as above, but using data from the new subjects.
# new_new: Only new molecules not used in the DREAM challenge, using new subjects.
# new: Combination of the last two, using new subjects but a combination of old and new molecules.
from sklearn.ensemble import RandomForestRegressor
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
kinds = {'old_old':shared_CID_dilutions_old,
'old_new':shared_CID_dilutions_new,
'new_new':new_only_CID_dilutions,
'new':new_CID_dilutions}
rs = {kind:np.zeros(2) for kind in kinds}
for col,desc in enumerate(['Intensity','Pleasantness']):
print("Training model for %s..." % desc)
Y_dream = df_dream['Subject'].loc[desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
Y_new = df_new['Subject'].loc[desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
rfc = RandomForestRegressor(n_estimators=100, max_features=max_features[col], max_depth=max_depth[col],
min_samples_leaf=min_samples_leaf[col], random_state=0)
X_train = X.loc[dream_CID_dilutions]
Y_train = Y_dream.loc[dream_CID_dilutions]
rfc.fit(X_train,Y_train)
for kind,CID_dilutions in kinds.items():
if desc == 'Intensity':
CID_dilutions = intensity_dilutions(CID_dilutions)
else:
CID_dilutions = max_dilutions(CID_dilutions)
X_test = X.loc[CID_dilutions]
Y_ = Y_dream if kind=='old_old' else Y_new
Y_test = Y_.loc[CID_dilutions]
Y_predicted = rfc.predict(X_test)
rs[kind][col] = np.corrcoef(Y_predicted,Y_test)[0,1]
In [13]:
# Plot the results.
import matplotlib as mpl
mpl.rcParams.update({'font.size':14})
plt.figure(figsize=(4,6))
plt.errorbar(np.arange(2)-0.15,rs['old_new'],yerr=np.tanh(1/np.sqrt(32-3)),color='r',marker='o',linestyle='',markersize=10,label='New 403 subjects,\n32 old molecules')
plt.errorbar(np.arange(2),rs['new_new'],yerr=np.tanh(1/np.sqrt(15-3)),color='g',marker='o',linestyle='',markersize=10,label='New 403 subjects,\n15 new molecules')
plt.errorbar(np.arange(2)+0.15,rs['new'],yerr=np.tanh(1/np.sqrt(47-3)),color='b',marker='o',linestyle='',markersize=10,label='New 403 subjects,\nall 47 molecules')
plt.xlim(-0.5,1.5)
plt.ylim(0,1)
plt.xticks(range(2),['Intensity','Pleasantness'],rotation=90);
plt.ylabel('Correlation')
plt.legend(fontsize=10,loc=3,frameon=0)
plt.tight_layout()
plt.savefig('../../figures/new_predictions.eps',format='eps')