Figure S7B



In [1]:

    
# Preliminaries to work with the data.   
%matplotlib inline
%run __init__.py
from utils import loading, scoring, prog
from gerkin import dream,params,fit2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd



In [2]:

    
# Load CIDs for all relevant molecules.  
# dream: from the DREAM challenge
# new: from new unpublished data collected by A. Keller.
# shared: the overlap
dream_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
df_new_CIDs = pd.read_csv('../../data/newdata_cids.txt',delimiter='\t',index_col=0,header=None,names=['CID'])['CID']
new_CIDs = list(df_new_CIDs)
shared_CIDs = list(set(dream_CIDs).intersection(new_CIDs))
dream_only_CIDs = list(set(dream_CIDs).difference(new_CIDs))
new_only_CIDs = list(set(new_CIDs).difference(dream_CIDs))
                        # Delete this line when we get these features.



In [3]:

    
# At time of publication, the features for one molecule, Bourgeonal, 
# had not been computed and included in the analysis.  
# We have since added the features for this molecule and updated the analysis, 
# with no qualitative change in the results.  

#new_CIDs.remove(64832) # Remove Bourgeonal since we have no Dragon features for it.
                        # Delete this line when we get these features.  
#new_only_CIDs.remove(64832) # Remove Bourgeonal since we have no Dragon features for it.
                        # Delete this line when we get these features.



In [4]:

    
# Load all the perceptual data from the DREAM challenge.  
df_dream = loading.load_perceptual_data(['training','leaderboard','testset'])
df_dream.head()









    Out[4]:






  
    
      
      
      
      
      Metadata
      Subject
    
    
      
      
      
      
      Name
      Solvent
      1
      2
      3
      4
      5
      6
      7
      8
      ...
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
    
    
      Descriptor
      CID
      Dilution
      Replicate
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Intensity
      126
      -3.0
      0
      4-Hydroxybenzaldehyde
      None
      7.0
      55.0
      89.0
      29.0
      0.0
      0.0
      6.0
      46.0
      ...
      0.0
      0.0
      2.0
      28.0
      33.0
      64.0
      0.0
      0.0
      0.0
      18.0
    
    
      -1.0
      0
      4-Hydroxybenzaldehyde
      None
      37.0
      64.0
      0.0
      100.0
      58.0
      0.0
      18.0
      51.0
      ...
      76.0
      49.0
      0.0
      78.0
      21.0
      50.0
      50.0
      85.0
      9.0
      7.0
    
    
      176
      -7.0
      0
      acetic acid
      None
      0.0
      0.0
      0.0
      20.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      1.0
      0.0
      24.0
      52.0
      0.0
      0.0
      0.0
      0.0
    
    
      -5.0
      0
      acetic acid
      None
      0.0
      83.0
      69.0
      0.0
      67.0
      0.0
      0.0
      57.0
      ...
      0.0
      0.0
      0.0
      0.0
      3.0
      0.0
      0.0
      0.0
      14.0
      0.0
    
    
      177
      -5.0
      0
      acetaldehyde
      None
      3.0
      60.0
      0.0
      10.0
      0.0
      0.0
      83.0
      0.0
      ...
      28.0
      0.0
      0.0
      27.0
      29.0
      37.0
      55.0
      6.0
      0.0
      0.0
    
  

5 rows × 51 columns



In [5]:

    
# Load and format the new unpublished perceptual data from A. Keller.  
df = pd.read_excel('../../data/0869 final data Pablo.xlsx',index_col=0,header=None).transpose().drop(1)
df = df.drop(df.loc[df['Subject Identifier']=='citronella'].index) # This is a mixture and so not used in this project
df = df.drop(df.loc[df['Subject Identifier']=='bourgeonal'].index) # I don't have Dragon features for this.  
                                                                  # Delete line when those features are available.  
mux = pd.MultiIndex.from_arrays(df.values[:,[0,3,1,2]].transpose(), names=['Descriptor','CID','Dilution','Solvent'])
df = pd.DataFrame(data=df.values[:,4:], index=mux, dtype=float)

# Compute log diliutions
dilutions = set(df.index.levels[2])
dilutions.remove('pure')
log_dilutions = {dilution:-np.round(np.log10(float(dilution[2:].replace(',',''))),2) for dilution in dilutions}
log_dilutions['pure'] = 0
df = df.rename(index=log_dilutions)

# Fix odorant names
odorants = {x:x.replace(' (1M solution)','').replace(' (w/v)','') for x in set(df.index.levels[1])}
df = df.rename(index=odorants)
df = df.rename(index={x:x.title() for x in df.index.levels[0]})

# Sort by descriptor, then odor, then solvent, then dilution
df = df.sortlevel() 

# Drop rows where the odorant is NaN.  
df = df.loc[~(df.index.labels[1] == -1)]

df.insert(0,'Name',[ix[1] for ix in df.index])
df.insert(1,'Solvent',[ix[3] for ix in df.index])
df = df.rename(index={name:int(df_new_CIDs.loc[name]) for name in df_new_CIDs.index})
#df = df.rename(columns={x:'S%d'%x for x in df.columns if type(x) is int})
df.index = df.index.droplevel('Solvent')
df.insert(1,'Replicate',[i % 2 for i in range(len(df.index))])
df = df.set_index('Replicate',append=True)
df = df.sortlevel()
df.columns = [['Metadata']*2+['Subject']*403,df.columns]
df_new = df.copy()
df_new.head()









    Out[5]:






  
    
      
      
      
      
      Metadata
      Subject
    
    
      
      
      
      
      Name
      Solvent
      0
      1
      2
      3
      4
      5
      6
      7
      ...
      393
      394
      395
      396
      397
      398
      399
      400
      401
      402
    
    
      Descriptor
      CID
      Dilution
      Replicate
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Intensity
      460
      -6.0
      0
      guaiacol
      paraffin oil
      32.0
      9.0
      54.0
      32.0
      38.0
      0.0
      73.0
      61.0
      ...
      4.0
      49.0
      39.0
      0.0
      12.0
      25.0
      19.0
      0.0
      6.0
      31.0
    
    
      1
      guaiacol
      paraffin oil
      9.0
      25.0
      83.0
      40.0
      56.0
      0.0
      74.0
      15.0
      ...
      0.0
      68.0
      46.0
      13.0
      0.0
      3.0
      13.0
      27.0
      58.0
      12.0
    
    
      -5.0
      0
      guaiacol
      paraffin oil
      0.0
      0.0
      0.0
      0.0
      11.0
      0.0
      3.0
      0.0
      ...
      0.0
      12.0
      49.0
      0.0
      53.0
      15.0
      12.0
      0.0
      12.0
      23.0
    
    
      1
      guaiacol
      paraffin oil
      6.0
      0.0
      8.0
      17.0
      63.0
      0.0
      11.0
      6.0
      ...
      0.0
      58.0
      46.0
      0.0
      0.0
      18.0
      9.0
      47.0
      53.0
      33.0
    
    
      -4.0
      0
      guaiacol
      paraffin oil
      60.0
      79.0
      90.0
      82.0
      71.0
      18.0
      80.0
      66.0
      ...
      46.0
      79.0
      56.0
      88.0
      67.0
      35.0
      20.0
      78.0
      72.0
      62.0
    
  

5 rows × 405 columns



In [6]:

    
# Make lits of all CIDs and dilutions for both datasets.  
dream_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])
new_CID_dilutions = [tuple(x[1:3]) for x in df_new.index.values]
all_CID_dilutions = set(dream_CID_dilutions + new_CID_dilutions)
dream_only_CID_dilutions = [x for x in dream_CID_dilutions if x[0] not in new_CIDs]
shared_CID_dilutions_old = [x for x in dream_CID_dilutions if x[0] in new_CIDs]
shared_CID_dilutions_new = [x for x in new_CID_dilutions if x[0] in dream_CIDs]
new_only_CID_dilutions = [tuple(x[1:3]) for x in df_new.index.values if x[1] not in dream_CIDs]



In [7]:

    
# Load the Dragon and Morgan features for all of the DREAM molecules.
dream_features = loading.get_molecular_data(['dragon','morgan'],dream_CIDs)
dream_features.head()









    



Dragon has 4869 features for 476 molecules.
Morgan has 2437 features for 476 molecules.
There are now 7306 total features.






    Out[7]:






  
    
      
      dragon
      ...
      morgan
    
    
      
      complexity from pubmed
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      Mp
      ...
      91305518
      91411526
      91541756
      91552833
      91563027
      91595028
      91614181
      91617014
      91617930
      91618238
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      93.1
      122.13
      8.142
      10.010
      15.305
      10.193
      16.664
      0.667
      1.020
      0.680
      ...
      0.003578
      0.018182
      0.118421
      0.017192
      0.145251
      0.013652
      0.056206
      0.047945
      0.011730
      0.105263
    
    
      176
      31.0
      60.06
      7.508
      4.483
      8.422
      4.432
      9.249
      0.560
      1.053
      0.554
      ...
      0.011152
      0.014320
      0.091603
      0.030488
      0.037975
      0.009695
      0.024631
      0.036900
      0.015129
      0.064516
    
    
      177
      10.3
      44.06
      6.294
      3.768
      7.095
      3.977
      8.040
      0.538
      1.014
      0.568
      ...
      0.003731
      0.009592
      0.031008
      0.018405
      0.025641
      0.002774
      0.009901
      0.014870
      0.006070
      0.043956
    
    
      180
      26.3
      58.09
      5.809
      5.295
      9.978
      5.739
      11.455
      0.530
      0.998
      0.574
      ...
      0.011152
      0.014320
      0.061069
      0.030488
      0.025316
      0.009695
      0.024631
      0.044280
      0.015129
      0.043011
    
    
      196
      114.0
      146.16
      7.308
      11.493
      20.727
      11.625
      22.914
      0.575
      1.036
      0.581
      ...
      0.032086
      0.027149
      0.116883
      0.096866
      0.044199
      0.028630
      0.055944
      0.074830
      0.046784
      0.103448
    
  

5 rows × 7306 columns



In [8]:

    
# Load Dragon features for all of the "new" molecules.  
new_features = pd.read_excel('../../data/dragon descriptors for 0869 odors.xlsx',index_col=0)
# Bourgeonal features were not computed until later.  
bourgeonal_features= pd.read_csv('../../data/CID64832_dragon_descriptors.txt',delimiter='\t',index_col=1).drop('No.',1)
bourgeonal_features.index.name = 'CID'
new_features = pd.concat((new_features,bourgeonal_features))
new_features.head()









    Out[8]:






  
    
      
      AAC
      AECC
      ALOGP
      ALOGP2
      AMR
      AMW
      AROM
      ARR
      ASP
      ATS1e
      ...
      piPC01
      piPC02
      piPC03
      piPC04
      piPC05
      piPC06
      piPC07
      piPC08
      piPC09
      piPC10
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      7794
      1.124
      6.364
      3.019
      9.114
      49.297
      5.320
      0.0
      0.0
      0.709
      3.342
      ...
      2.565
      2.773
      2.639
      2.639
      2.485
      2.565
      2.197
      2.197
      0.000
      0.000
    
    
      8164
      1.198
      7.917
      3.175
      10.078
      49.690
      5.384
      0.0
      0.0
      0.906
      3.460
      ...
      2.565
      2.639
      2.398
      2.303
      2.197
      2.079
      1.946
      1.792
      1.609
      1.386
    
    
      8842
      1.094
      6.364
      3.049
      9.297
      50.486
      5.042
      0.0
      0.0
      0.448
      3.417
      ...
      2.485
      2.708
      2.565
      2.485
      2.398
      2.485
      1.946
      1.609
      0.000
      0.000
    
    
      10409
      1.068
      7.188
      5.280
      27.881
      69.656
      5.101
      0.0
      0.0
      0.175
      3.777
      ...
      2.890
      2.996
      2.996
      2.996
      2.996
      2.996
      2.996
      2.996
      2.996
      2.996
    
    
      10947
      1.061
      7.353
      5.532
      30.603
      74.205
      5.074
      0.0
      0.0
      0.178
      3.841
      ...
      2.944
      3.091
      3.091
      3.178
      3.091
      3.091
      3.091
      3.091
      3.091
      3.091
    
  

5 rows × 4871 columns



In [9]:

    
# Make a single data frame will all of these features, but only use Dragon ones because we didn't
# compute the Morgan fingerprints for the new molecules.
all_features = pd.concat([dream_features['dragon'].drop('complexity from pubmed',1),new_features])
all_features.head()









    Out[9]:






  
    
      
      AAC
      AECC
      ALOGP
      ALOGP2
      AMR
      AMW
      AROM
      ARR
      ASP
      ATS1e
      ...
      piPC01
      piPC02
      piPC03
      piPC04
      piPC05
      piPC06
      piPC07
      piPC08
      piPC09
      piPC10
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      1.429
      4.444
      1.322
      1.748
      34.343
      8.142
      1.0
      0.667
      0.580
      2.810
      ...
      2.639
      3.114
      3.590
      3.987
      4.467
      4.176
      3.446
      0.0
      0.0
      0.0
    
    
      176
      1.500
      1.750
      -0.230
      0.053
      12.644
      7.508
      0.0
      0.000
      0.256
      2.167
      ...
      1.609
      1.792
      0.000
      0.000
      0.000
      0.000
      0.000
      0.0
      0.0
      0.0
    
    
      177
      1.379
      1.667
      -0.182
      0.033
      11.718
      6.294
      0.0
      0.000
      0.672
      1.959
      ...
      1.386
      1.099
      0.000
      0.000
      0.000
      0.000
      0.000
      0.0
      0.0
      0.0
    
    
      180
      1.295
      1.750
      -0.244
      0.060
      16.195
      5.809
      0.0
      0.000
      0.215
      2.300
      ...
      1.609
      1.792
      0.000
      0.000
      0.000
      0.000
      0.000
      0.0
      0.0
      0.0
    
    
      196
      1.485
      5.800
      0.502
      0.252
      32.739
      7.308
      0.0
      0.000
      0.798
      3.061
      ...
      2.485
      2.708
      2.303
      2.197
      2.079
      1.946
      2.303
      0.0
      0.0
      0.0
    
  

5 rows × 4871 columns



In [10]:

    
# Create the feature and descriptor arrays 
X,good1,good2,means,stds,imputer = dream.make_X(all_features,all_CID_dilutions)









    



The X matrix now has shape (886x3083) molecules by non-NaN good molecular descriptors



In [11]:

    
# Functions to select only the appropriate dilutions (to match the challenge rules).
def intensity_dilutions(CID_dilutions):
    int_CID_dilutions = []
    CIDs = list(set([x[0] for x in CID_dilutions]))
    for CID in CIDs:
        dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
        rel_dilutions = np.abs(dilutions + 3)
        int_CID_dilutions += [(CID,dilutions[rel_dilutions.argmin()])]
    return int_CID_dilutions

def max_dilutions(CID_dilutions):
    max_CID_dilutions = []
    CIDs = list(set([x[0] for x in CID_dilutions]))
    for CID in CIDs:
        dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
        max_CID_dilutions += [(CID,dilutions.max())]
    return max_CID_dilutions

Compute the random forest model



In [12]:

    
# Fit the models for each scenario.  
# In all cases, the model is trained with the molecules used in the DREAM challenge.  
# There are 4 test cases, corresponding to: 
# old_old: The molecules common to DREAM and the new data, using the same subjects (i.e. in sample)
# old_new: Same as above, but using data from the new subjects.  
# new_new: Only new molecules not used in the DREAM challenge, using new subjects.  
# new: Combination of the last two, using new subjects but a combination of old and new molecules.  
from sklearn.ensemble import RandomForestRegressor
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
kinds = {'old_old':shared_CID_dilutions_old,
         'old_new':shared_CID_dilutions_new,
         'new_new':new_only_CID_dilutions,
         'new':new_CID_dilutions}
rs = {kind:np.zeros(2) for kind in kinds}
for col,desc in enumerate(['Intensity','Pleasantness']):
    print("Training model for %s..." % desc)
    Y_dream = df_dream['Subject'].loc[desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
    Y_new = df_new['Subject'].loc[desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
    rfc = RandomForestRegressor(n_estimators=100, max_features=max_features[col], max_depth=max_depth[col], 
                                    min_samples_leaf=min_samples_leaf[col], random_state=0)
    X_train = X.loc[dream_CID_dilutions]
    Y_train = Y_dream.loc[dream_CID_dilutions]
    rfc.fit(X_train,Y_train)
    for kind,CID_dilutions in kinds.items():
        if desc == 'Intensity':
            CID_dilutions = intensity_dilutions(CID_dilutions)
        else:
            CID_dilutions = max_dilutions(CID_dilutions)
        X_test = X.loc[CID_dilutions]
        Y_ = Y_dream if kind=='old_old' else Y_new
        Y_test = Y_.loc[CID_dilutions]
        Y_predicted = rfc.predict(X_test)
        rs[kind][col] = np.corrcoef(Y_predicted,Y_test)[0,1]









    



Training model for Intensity...
Training model for Pleasantness...



In [13]:

    
# Plot the results.  
import matplotlib as mpl
mpl.rcParams.update({'font.size':14})
plt.figure(figsize=(4,6))
plt.errorbar(np.arange(2)-0.15,rs['old_new'],yerr=np.tanh(1/np.sqrt(32-3)),color='r',marker='o',linestyle='',markersize=10,label='New 403 subjects,\n32 old molecules')
plt.errorbar(np.arange(2),rs['new_new'],yerr=np.tanh(1/np.sqrt(15-3)),color='g',marker='o',linestyle='',markersize=10,label='New 403 subjects,\n15 new molecules')
plt.errorbar(np.arange(2)+0.15,rs['new'],yerr=np.tanh(1/np.sqrt(47-3)),color='b',marker='o',linestyle='',markersize=10,label='New 403 subjects,\nall 47 molecules')
plt.xlim(-0.5,1.5)
plt.ylim(0,1)
plt.xticks(range(2),['Intensity','Pleasantness'],rotation=90);
plt.ylabel('Correlation')
plt.legend(fontsize=10,loc=3,frameon=0)
plt.tight_layout()
plt.savefig('../../figures/new_predictions.eps',format='eps')

				Metadata		Subject
				Name	Solvent	1	2	3	4	5	6	7	8	...	40	41	42	43	44	45	46	47	48	49
Descriptor	CID	Dilution	Replicate
Intensity	126	-3.0	0	4-Hydroxybenzaldehyde	None	7.0	55.0	89.0	29.0	0.0	0.0	6.0	46.0	...	0.0	0.0	2.0	28.0	33.0	64.0	0.0	0.0	0.0	18.0
	126	-1.0	0	4-Hydroxybenzaldehyde	None	37.0	64.0	0.0	100.0	58.0	0.0	18.0	51.0	...	76.0	49.0	0.0	78.0	21.0	50.0	50.0	85.0	9.0	7.0
	176	-7.0	0	acetic acid	None	0.0	0.0	0.0	20.0	0.0	0.0	0.0	0.0	...	0.0	0.0	1.0	0.0	24.0	52.0	0.0	0.0	0.0	0.0
	176	-5.0	0	acetic acid	None	0.0	83.0	69.0	0.0	67.0	0.0	0.0	57.0	...	0.0	0.0	0.0	0.0	3.0	0.0	0.0	0.0	14.0	0.0
	177	-5.0	0	acetaldehyde	None	3.0	60.0	0.0	10.0	0.0	0.0	83.0	0.0	...	28.0	0.0	0.0	27.0	29.0	37.0	55.0	6.0	0.0	0.0

	dragon										...	morgan
	complexity from pubmed	MW	AMW	Sv	Se	Sp	Si	Mv	Me	Mp	...	91305518	91411526	91541756	91552833	91563027	91595028	91614181	91617014	91617930	91618238
CID
126	93.1	122.13	8.142	10.010	15.305	10.193	16.664	0.667	1.020	0.680	...	0.003578	0.018182	0.118421	0.017192	0.145251	0.013652	0.056206	0.047945	0.011730	0.105263
176	31.0	60.06	7.508	4.483	8.422	4.432	9.249	0.560	1.053	0.554	...	0.011152	0.014320	0.091603	0.030488	0.037975	0.009695	0.024631	0.036900	0.015129	0.064516
177	10.3	44.06	6.294	3.768	7.095	3.977	8.040	0.538	1.014	0.568	...	0.003731	0.009592	0.031008	0.018405	0.025641	0.002774	0.009901	0.014870	0.006070	0.043956
180	26.3	58.09	5.809	5.295	9.978	5.739	11.455	0.530	0.998	0.574	...	0.011152	0.014320	0.061069	0.030488	0.025316	0.009695	0.024631	0.044280	0.015129	0.043011
196	114.0	146.16	7.308	11.493	20.727	11.625	22.914	0.575	1.036	0.581	...	0.032086	0.027149	0.116883	0.096866	0.044199	0.028630	0.055944	0.074830	0.046784	0.103448

	AAC	AECC	ALOGP	ALOGP2	AMR	AMW	AROM	ARR	ASP	ATS1e	...	piPC01	piPC02	piPC03	piPC04	piPC05	piPC06	piPC07	piPC08	piPC09	piPC10
CID
7794	1.124	6.364	3.019	9.114	49.297	5.320	0.0	0.0	0.709	3.342	...	2.565	2.773	2.639	2.639	2.485	2.565	2.197	2.197	0.000	0.000
8164	1.198	7.917	3.175	10.078	49.690	5.384	0.0	0.0	0.906	3.460	...	2.565	2.639	2.398	2.303	2.197	2.079	1.946	1.792	1.609	1.386
8842	1.094	6.364	3.049	9.297	50.486	5.042	0.0	0.0	0.448	3.417	...	2.485	2.708	2.565	2.485	2.398	2.485	1.946	1.609	0.000	0.000
10409	1.068	7.188	5.280	27.881	69.656	5.101	0.0	0.0	0.175	3.777	...	2.890	2.996	2.996	2.996	2.996	2.996	2.996	2.996	2.996	2.996
10947	1.061	7.353	5.532	30.603	74.205	5.074	0.0	0.0	0.178	3.841	...	2.944	3.091	3.091	3.178	3.091	3.091	3.091	3.091	3.091	3.091

	AAC	AECC	ALOGP	ALOGP2	AMR	AMW	AROM	ARR	ASP	ATS1e	...	piPC01	piPC02	piPC03	piPC04	piPC05	piPC06	piPC07	piPC08	piPC09	piPC10
CID
126	1.429	4.444	1.322	1.748	34.343	8.142	1.0	0.667	0.580	2.810	...	2.639	3.114	3.590	3.987	4.467	4.176	3.446	0.0	0.0	0.0
176	1.500	1.750	-0.230	0.053	12.644	7.508	0.0	0.000	0.256	2.167	...	1.609	1.792	0.000	0.000	0.000	0.000	0.000	0.0	0.0	0.0
177	1.379	1.667	-0.182	0.033	11.718	6.294	0.0	0.000	0.672	1.959	...	1.386	1.099	0.000	0.000	0.000	0.000	0.000	0.0	0.0	0.0
180	1.295	1.750	-0.244	0.060	16.195	5.809	0.0	0.000	0.215	2.300	...	1.609	1.792	0.000	0.000	0.000	0.000	0.000	0.0	0.0	0.0
196	1.485	5.800	0.502	0.252	32.739	7.308	0.0	0.000	0.798	3.061	...	2.485	2.708	2.303	2.197	2.079	1.946	2.303	0.0	0.0	0.0