In [1]:
# Preliminaries to work with the data.   
%matplotlib inline
%run __init__.py
from utils import loading, scoring
from gerkin import dream,params,fit2,dravnieks
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr,spearmanr
import scipy.io as sio
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit


/Users/rgerkin/Dropbox/miniconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
dream_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
dream_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])

In [3]:
df_dream = loading.load_perceptual_data(['training','leaderboard','testset'])
# Average over replicates, then pick lowest dilution
low_dilutions = df_dream.sort_index(level=['Descriptor','CID','Dilution']).groupby(level=['Descriptor','CID','Dilution'])\
                        .mean().fillna(999).groupby(level=['Descriptor','CID']).last().replace(999,float('NaN'))
descriptors = loading.get_descriptors(format=True)

In [4]:
dream_features = loading.get_molecular_data(['dragon','morgan'],dream_CIDs)


Dragon has 4869 features for 476 molecules.
Morgan has 2437 features for 476 molecules.
There are now 7306 total features.

In [5]:
drav_cas = pd.read_table(os.path.join('../../data','dravnieks_cas.txt'),delimiter=' ',index_col=0)
drav_descriptors = pd.read_table(os.path.join('../../data','dravnieks_descriptors.txt'),index_col=None,header=0,names=['Descriptor'])
drav_descriptors['Descriptor'] = [' '.join(x.split(' ')[1:]) for x in drav_descriptors['Descriptor']]
drav_data_raw = sio.loadmat(os.path.join('../../data','dravnieks.mat'))
drav_data = pd.DataFrame(drav_data_raw['pu'].T,index=drav_cas['C.A.S.'],columns=drav_descriptors['Descriptor'])
drav_cas_cid = pd.read_table(os.path.join('../../data','dravnieks_cas_cid.txt'),header=None,index_col=0,names=['CID'])
drav_cas_cid[:] = drav_cas_cid.values.astype(int)
drav_cas_cid.index.name = 'C.A.S.'
drav_data = drav_data.join(drav_cas_cid,how='left').set_index('CID') # Join with CIDs
drav_data.drop_duplicates(inplace=True) # Remove duplicate data
drav_data = drav_data[drav_data.index>0]#.sort_index() # Remove negative indices
drav_data = drav_data.groupby(level='CID').first() # Remove duplicate indices
drav_data.head()


Out[5]:
FRUITY, CITRUS LEMON GRAPEFRUIT ORANGE FRUITY, OTHER THAN CITRUS PINEAPPLE GRAPE JUICE STRAWBERRY APPLE (FRUIT) PEAR ... PUTRID, FOUL, DECAYED FECAL (LIKE MANURE) CADAVEROUS (DEAD ANIMAL) SICKENING DRY, POWDERY CHALKY LIGHT HEAVY COOL, COOLING WARM
CID
240 5.04 3.60 0.72 1.44 22.30 2.16 0.72 5.76 0.00 1.44 ... 0.72 0.00 0.00 5.04 5.04 0.72 20.86 27.34 13.67 21.58
264 0.00 0.71 0.00 0.00 0.71 0.00 0.00 0.00 0.00 0.00 ... 47.86 35.00 13.57 65.71 1.43 0.00 2.86 38.57 0.00 11.43
323 2.86 3.57 0.00 0.00 14.29 3.57 0.71 2.86 1.43 0.71 ... 3.57 0.00 0.00 7.14 7.86 1.43 24.29 13.57 12.14 20.71
326 10.71 16.43 2.14 5.00 7.14 1.43 0.00 2.14 0.00 2.14 ... 6.43 0.00 0.71 20.00 12.14 2.14 16.43 24.29 14.29 17.86
342 0.00 0.00 0.00 0.00 2.16 0.72 0.00 0.00 0.00 0.72 ... 2.88 4.32 2.16 14.39 12.95 0.00 12.23 24.46 7.19 14.39

5 rows × 146 columns


In [36]:
drav_data.to_csv('/Users/rgerkin/Desktop/dravnieks.csv')

In [6]:
drav_dragon = pd.read_table('../../data/dragondravnieks.txt',index_col=1).drop('No.',axis=1)
drav_dragon.index.name = 'CID'
drav_dragon.head()


Out[6]:
MW AMW Sv Se Sp Si Mv Me Mp Mi ... Psychotic-80 Psychotic-50 Hypertens-80 Hypertens-50 Hypnotic-80 Hypnotic-50 Neoplastic-80 Neoplastic-50 Infective-80 Infective-50
CID
342 108.15 6.759 9.822 15.862 10.500 17.870 0.614 0.991 0.656 1.117 ... 0 0 0 0 0 0 0 0 0 0
798 117.16 7.323 10.602 15.753 11.290 17.744 0.663 0.985 0.706 1.109 ... 0 0 0 0 0 0 0 0 0 0
999 136.16 7.564 11.537 18.189 11.955 20.080 0.641 1.011 0.664 1.116 ... 0 0 0 0 0 0 0 0 0 0
1127 88.19 6.784 7.294 12.611 8.693 14.581 0.561 0.970 0.669 1.122 ... 0 0 0 0 0 0 0 0 0 0
1140 92.15 6.143 9.107 14.534 10.046 16.661 0.607 0.969 0.670 1.111 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 4870 columns


In [7]:
shared_features = list(set(drav_dragon).intersection(dream_features['dragon']))
dragon_all = pd.concat((drav_dragon,dream_features['dragon']))[shared_features].sort_index()
dragon_all.head()


Out[7]:
XMOD GATS4i Chi1_EA(ri) R2e SpMax4_Bh(p) SsssPbH CATS2D_08_DL B09[C-P] Mor05v P2v ... CSI F04[I-B] WiA_H2 TDB09u F07[P-Si] F09[O-Cl] F10[C-F] HATS2e Mor30u SM3_Dz(e)
CID
126 27.239 1.015 4.402 0.887 2.260 0.0 0 0 -1.534 0.255 ... 77 0 0.364 0.000 0 0 0 0.204 0.043 7.885
176 11.547 0.000 1.506 1.529 0.380 0.0 0 0 -0.398 0.297 ... 9 0 0.625 0.000 0 0 0 1.196 0.065 3.596
177 9.192 0.000 0.913 1.672 0.380 0.0 0 0 -0.326 0.206 ... 6 0 0.750 0.000 0 0 0 1.505 0.047 1.729
180 10.970 0.000 1.408 1.761 0.380 0.0 0 0 -0.567 0.306 ... 9 0 0.625 0.000 0 0 0 0.991 0.077 3.804
196 30.065 0.880 4.461 1.590 2.185 0.0 0 0 -1.387 0.074 ... 100 0 0.290 0.054 0 0 0 0.440 0.103 9.676

5 rows × 4867 columns


In [8]:
dream_data = low_dilutions.mean(skipna=True,axis=1).unstack('Descriptor')[descriptors]
dream_data.head()


Out[8]:
Descriptor Intensity Pleasantness Bakery Sweet Fruit Fish Garlic Spices Cold Sour ... Acid Warm Musky Sweaty Ammonia Decayed Wood Grass Flower Chemical
CID
126 49.551020 48.956522 0.630435 24.347826 6.239130 0.282609 1.608696 3.652174 4.000000 7.500000 ... 2.847826 0.913043 8.304348 1.500000 1.804348 4.978261 1.260870 1.347826 9.260870 14.239130
176 11.551020 48.461538 2.538462 6.692308 1.230769 0.000000 7.192308 8.230769 5.769231 15.307692 ... 3.500000 4.230769 6.769231 7.153846 3.615385 3.038462 1.807692 1.961538 4.115385 3.384615
177 33.265306 45.315789 8.421053 20.078947 1.763158 0.000000 0.631579 2.500000 7.000000 10.447368 ... 4.447368 4.526316 3.421053 1.947368 3.789474 3.263158 1.026316 4.000000 2.184211 18.789474
180 16.714286 55.437500 3.375000 15.531250 6.468750 2.281250 2.593750 4.375000 7.625000 9.031250 ... 3.218750 10.031250 7.531250 0.031250 5.281250 6.218750 8.625000 6.593750 12.625000 10.500000
196 22.183673 53.218750 1.750000 18.312500 1.312500 0.000000 1.906250 6.156250 5.562500 6.218750 ... 3.218750 9.437500 6.781250 6.093750 2.781250 6.906250 1.187500 2.218750 9.625000 7.750000

5 rows × 21 columns


In [9]:
shared_CIDs = list(set(dream_data.index).intersection(drav_data.index))
print("There are %d molecules in common between DREAM and Dravnieks" % len(shared_CIDs))
drav_only_CIDs = list(set(drav_data.index).difference(dream_data.index))
drav_only_CIDs.remove(122510) # Couldn't compute Dragon features for this one.  
print("There are %d molecules in Dravnieks that are not in DREAM" % len(drav_only_CIDs))
dream_only_CIDs = list(set(dream_CIDs).difference(shared_CIDs))
print("There are %d molecules in DREAM that are not in Dravnieks" % len(dream_only_CIDs))


There are 58 molecules in common between DREAM and Dravnieks
There are 70 molecules in Dravnieks that are not in DREAM
There are 418 molecules in DREAM that are not in Dravnieks

In [27]:
shared_CID_dilutions = [x for x in dream_CID_dilutions if x[0] in shared_CIDs]
drav_only_CID_dilutions = [(x,-3) for x in drav_only_CIDs]
dream_only_CID_dilutions = [x for x in dream_CID_dilutions if x[0] in dream_only_CIDs]
all_CID_dilutions = dream_only_CID_dilutions + drav_only_CID_dilutions

In [11]:
X,good1,good2,means,stds,imputer = dream.make_X(dragon_all,None,CID_dilutions=all_CID_dilutions)


The X matrix now has shape (884x3115) molecules by non-NaN good molecular descriptors

In [12]:
for dream_desc,drav_desc in [('Sweet','SWEET'),('Garlic','GARLIC, ONION'),('Bakery','VANILLA')]:
    d1 = dream_data[dream_desc].loc[shared_CIDs]
    d2 = drav_data[drav_desc].loc[shared_CIDs]
    pr = pearsonr(d1,d2)[0]
    sr = spearmanr(d1,d2)[0]
    print('%.2f (%.2f): %s vs %s' % (pr,sr,dream_desc,drav_desc))


0.82 (0.86): Sweet vs SWEET
0.80 (0.54): Garlic vs GARLIC, ONION
0.82 (0.51): Bakery vs VANILLA

In [13]:
def intensity_dilutions(CID_dilutions):
    int_CID_dilutions = []
    CIDs = list(set([x[0] for x in CID_dilutions]))
    for CID in CIDs:
        dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
        rel_dilutions = np.abs(dilutions + 3)
        int_CID_dilutions += [(CID,dilutions[rel_dilutions.argmin()])]
    return int_CID_dilutions

def max_dilutions(CID_dilutions):
    max_CID_dilutions = []
    CIDs = list(set([x[0] for x in CID_dilutions]))
    for CID in CIDs:
        dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
        max_CID_dilutions += [(CID,dilutions.max())]
    return max_CID_dilutions

In [33]:
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
kinds = {'shared1':shared_CID_dilutions,
         'shared2':shared_CID_dilutions,
         'drav_only':drav_only_CID_dilutions,
        }
dream_drav = {'Sweet':'SWEET','Garlic':'GARLIC, ONION','Flower':'FLORAL','Bakery':'VANILLA'}
rs = pd.DataFrame(columns=kinds,index=dream_drav.items())
for col,(dream_desc,drav_desc) in enumerate(dream_drav.items()):#descriptors):
    print(dream_desc,drav_desc)
    Y_dream = df_dream['Subject'].loc[dream_desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
    Y_drav = drav_data[drav_desc]
    rfc = RandomForestRegressor(n_estimators=100, max_features=max_features[col], max_depth=max_depth[col], 
                                min_samples_leaf=min_samples_leaf[col], random_state=0)
    X_train = X.loc[dream_only_CID_dilutions]
    Y_train = Y_dream.loc[dream_only_CID_dilutions]
    rfc.fit(X_train,Y_train)
    for kind,CID_dilutions in kinds.items():
        if dream_desc == 'Intensity':
            CID_dilutions = intensity_dilutions(CID_dilutions)
        else:
            CID_dilutions = max_dilutions(CID_dilutions)
        X_test = X.loc[CID_dilutions]
        Y_ = Y_dream if kind=='shared1' else Y_drav
        CIDs = [x[0] for x in CID_dilutions]
        Y_test = Y_.loc[CID_dilutions if kind=='shared1' else CIDs]
        Y_predicted = rfc.predict(X_test)
        rs[kind][(dream_desc,drav_desc)] = np.corrcoef(Y_predicted,Y_test)[0,1]


Garlic GARLIC, ONION
Bakery VANILLA
Sweet SWEET
Flower FLORAL

In [32]:
rs # 30 trees


Out[32]:
shared1 drav_only shared2
(Garlic, GARLIC, ONION) 0.652968 0.769221 0.59541
(Bakery, VANILLA) 0.644034 0.338058 0.775768
(Sweet, SWEET) 0.618017 0.606856 0.59415
(Flower, FLORAL) 0.486432 0.475694 0.385654

In [34]:
rs # 100 trees


Out[34]:
shared1 drav_only shared2
(Garlic, GARLIC, ONION) 0.682178 0.81974 0.641363
(Bakery, VANILLA) 0.704299 0.381339 0.822294
(Sweet, SWEET) 0.669137 0.623467 0.626174
(Flower, FLORAL) 0.505666 0.513412 0.472563