notebook.community

Edit and run



In [1]:

    
# Preliminaries to work with the data.   
%matplotlib inline
%run __init__.py
from utils import loading, scoring
from gerkin import dream,params,fit2,dravnieks
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr,spearmanr
import scipy.io as sio
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit









    



/Users/rgerkin/Dropbox/miniconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [2]:

    
dream_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
dream_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])



In [3]:

    
df_dream = loading.load_perceptual_data(['training','leaderboard','testset'])
# Average over replicates, then pick lowest dilution
low_dilutions = df_dream.sort_index(level=['Descriptor','CID','Dilution']).groupby(level=['Descriptor','CID','Dilution'])\
                        .mean().fillna(999).groupby(level=['Descriptor','CID']).last().replace(999,float('NaN'))
descriptors = loading.get_descriptors(format=True)



In [4]:

    
dream_features = loading.get_molecular_data(['dragon','morgan'],dream_CIDs)









    



Dragon has 4869 features for 476 molecules.
Morgan has 2437 features for 476 molecules.
There are now 7306 total features.



In [5]:

    
drav_cas = pd.read_table(os.path.join('../../data','dravnieks_cas.txt'),delimiter=' ',index_col=0)
drav_descriptors = pd.read_table(os.path.join('../../data','dravnieks_descriptors.txt'),index_col=None,header=0,names=['Descriptor'])
drav_descriptors['Descriptor'] = [' '.join(x.split(' ')[1:]) for x in drav_descriptors['Descriptor']]
drav_data_raw = sio.loadmat(os.path.join('../../data','dravnieks.mat'))
drav_data = pd.DataFrame(drav_data_raw['pu'].T,index=drav_cas['C.A.S.'],columns=drav_descriptors['Descriptor'])
drav_cas_cid = pd.read_table(os.path.join('../../data','dravnieks_cas_cid.txt'),header=None,index_col=0,names=['CID'])
drav_cas_cid[:] = drav_cas_cid.values.astype(int)
drav_cas_cid.index.name = 'C.A.S.'
drav_data = drav_data.join(drav_cas_cid,how='left').set_index('CID') # Join with CIDs
drav_data.drop_duplicates(inplace=True) # Remove duplicate data
drav_data = drav_data[drav_data.index>0]#.sort_index() # Remove negative indices
drav_data = drav_data.groupby(level='CID').first() # Remove duplicate indices
drav_data.head()









    Out[5]:






  
    
      
      FRUITY, CITRUS
      LEMON
      GRAPEFRUIT
      ORANGE
      FRUITY, OTHER THAN CITRUS
      PINEAPPLE
      GRAPE JUICE
      STRAWBERRY
      APPLE (FRUIT)
      PEAR
      ...
      PUTRID, FOUL, DECAYED
      FECAL (LIKE MANURE)
      CADAVEROUS (DEAD ANIMAL)
      SICKENING
      DRY, POWDERY
      CHALKY
      LIGHT
      HEAVY
      COOL, COOLING
      WARM
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      240
      5.04
      3.60
      0.72
      1.44
      22.30
      2.16
      0.72
      5.76
      0.00
      1.44
      ...
      0.72
      0.00
      0.00
      5.04
      5.04
      0.72
      20.86
      27.34
      13.67
      21.58
    
    
      264
      0.00
      0.71
      0.00
      0.00
      0.71
      0.00
      0.00
      0.00
      0.00
      0.00
      ...
      47.86
      35.00
      13.57
      65.71
      1.43
      0.00
      2.86
      38.57
      0.00
      11.43
    
    
      323
      2.86
      3.57
      0.00
      0.00
      14.29
      3.57
      0.71
      2.86
      1.43
      0.71
      ...
      3.57
      0.00
      0.00
      7.14
      7.86
      1.43
      24.29
      13.57
      12.14
      20.71
    
    
      326
      10.71
      16.43
      2.14
      5.00
      7.14
      1.43
      0.00
      2.14
      0.00
      2.14
      ...
      6.43
      0.00
      0.71
      20.00
      12.14
      2.14
      16.43
      24.29
      14.29
      17.86
    
    
      342
      0.00
      0.00
      0.00
      0.00
      2.16
      0.72
      0.00
      0.00
      0.00
      0.72
      ...
      2.88
      4.32
      2.16
      14.39
      12.95
      0.00
      12.23
      24.46
      7.19
      14.39
    
  

5 rows × 146 columns



In [36]:

    
drav_data.to_csv('/Users/rgerkin/Desktop/dravnieks.csv')



In [6]:

    
drav_dragon = pd.read_table('../../data/dragondravnieks.txt',index_col=1).drop('No.',axis=1)
drav_dragon.index.name = 'CID'
drav_dragon.head()









    Out[6]:






  
    
      
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      Mp
      Mi
      ...
      Psychotic-80
      Psychotic-50
      Hypertens-80
      Hypertens-50
      Hypnotic-80
      Hypnotic-50
      Neoplastic-80
      Neoplastic-50
      Infective-80
      Infective-50
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      342
      108.15
      6.759
      9.822
      15.862
      10.500
      17.870
      0.614
      0.991
      0.656
      1.117
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      798
      117.16
      7.323
      10.602
      15.753
      11.290
      17.744
      0.663
      0.985
      0.706
      1.109
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999
      136.16
      7.564
      11.537
      18.189
      11.955
      20.080
      0.641
      1.011
      0.664
      1.116
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1127
      88.19
      6.784
      7.294
      12.611
      8.693
      14.581
      0.561
      0.970
      0.669
      1.122
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1140
      92.15
      6.143
      9.107
      14.534
      10.046
      16.661
      0.607
      0.969
      0.670
      1.111
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 4870 columns



In [7]:

    
shared_features = list(set(drav_dragon).intersection(dream_features['dragon']))
dragon_all = pd.concat((drav_dragon,dream_features['dragon']))[shared_features].sort_index()
dragon_all.head()









    Out[7]:






  
    
      
      XMOD
      GATS4i
      Chi1_EA(ri)
      R2e
      SpMax4_Bh(p)
      SsssPbH
      CATS2D_08_DL
      B09[C-P]
      Mor05v
      P2v
      ...
      CSI
      F04[I-B]
      WiA_H2
      TDB09u
      F07[P-Si]
      F09[O-Cl]
      F10[C-F]
      HATS2e
      Mor30u
      SM3_Dz(e)
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      27.239
      1.015
      4.402
      0.887
      2.260
      0.0
      0
      0
      -1.534
      0.255
      ...
      77
      0
      0.364
      0.000
      0
      0
      0
      0.204
      0.043
      7.885
    
    
      176
      11.547
      0.000
      1.506
      1.529
      0.380
      0.0
      0
      0
      -0.398
      0.297
      ...
      9
      0
      0.625
      0.000
      0
      0
      0
      1.196
      0.065
      3.596
    
    
      177
      9.192
      0.000
      0.913
      1.672
      0.380
      0.0
      0
      0
      -0.326
      0.206
      ...
      6
      0
      0.750
      0.000
      0
      0
      0
      1.505
      0.047
      1.729
    
    
      180
      10.970
      0.000
      1.408
      1.761
      0.380
      0.0
      0
      0
      -0.567
      0.306
      ...
      9
      0
      0.625
      0.000
      0
      0
      0
      0.991
      0.077
      3.804
    
    
      196
      30.065
      0.880
      4.461
      1.590
      2.185
      0.0
      0
      0
      -1.387
      0.074
      ...
      100
      0
      0.290
      0.054
      0
      0
      0
      0.440
      0.103
      9.676
    
  

5 rows × 4867 columns



In [8]:

    
dream_data = low_dilutions.mean(skipna=True,axis=1).unstack('Descriptor')[descriptors]
dream_data.head()









    Out[8]:






  
    
      Descriptor
      Intensity
      Pleasantness
      Bakery
      Sweet
      Fruit
      Fish
      Garlic
      Spices
      Cold
      Sour
      ...
      Acid
      Warm
      Musky
      Sweaty
      Ammonia
      Decayed
      Wood
      Grass
      Flower
      Chemical
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      49.551020
      48.956522
      0.630435
      24.347826
      6.239130
      0.282609
      1.608696
      3.652174
      4.000000
      7.500000
      ...
      2.847826
      0.913043
      8.304348
      1.500000
      1.804348
      4.978261
      1.260870
      1.347826
      9.260870
      14.239130
    
    
      176
      11.551020
      48.461538
      2.538462
      6.692308
      1.230769
      0.000000
      7.192308
      8.230769
      5.769231
      15.307692
      ...
      3.500000
      4.230769
      6.769231
      7.153846
      3.615385
      3.038462
      1.807692
      1.961538
      4.115385
      3.384615
    
    
      177
      33.265306
      45.315789
      8.421053
      20.078947
      1.763158
      0.000000
      0.631579
      2.500000
      7.000000
      10.447368
      ...
      4.447368
      4.526316
      3.421053
      1.947368
      3.789474
      3.263158
      1.026316
      4.000000
      2.184211
      18.789474
    
    
      180
      16.714286
      55.437500
      3.375000
      15.531250
      6.468750
      2.281250
      2.593750
      4.375000
      7.625000
      9.031250
      ...
      3.218750
      10.031250
      7.531250
      0.031250
      5.281250
      6.218750
      8.625000
      6.593750
      12.625000
      10.500000
    
    
      196
      22.183673
      53.218750
      1.750000
      18.312500
      1.312500
      0.000000
      1.906250
      6.156250
      5.562500
      6.218750
      ...
      3.218750
      9.437500
      6.781250
      6.093750
      2.781250
      6.906250
      1.187500
      2.218750
      9.625000
      7.750000
    
  

5 rows × 21 columns



In [9]:

    
shared_CIDs = list(set(dream_data.index).intersection(drav_data.index))
print("There are %d molecules in common between DREAM and Dravnieks" % len(shared_CIDs))
drav_only_CIDs = list(set(drav_data.index).difference(dream_data.index))
drav_only_CIDs.remove(122510) # Couldn't compute Dragon features for this one.  
print("There are %d molecules in Dravnieks that are not in DREAM" % len(drav_only_CIDs))
dream_only_CIDs = list(set(dream_CIDs).difference(shared_CIDs))
print("There are %d molecules in DREAM that are not in Dravnieks" % len(dream_only_CIDs))









    



There are 58 molecules in common between DREAM and Dravnieks
There are 70 molecules in Dravnieks that are not in DREAM
There are 418 molecules in DREAM that are not in Dravnieks



In [27]:

    
shared_CID_dilutions = [x for x in dream_CID_dilutions if x[0] in shared_CIDs]
drav_only_CID_dilutions = [(x,-3) for x in drav_only_CIDs]
dream_only_CID_dilutions = [x for x in dream_CID_dilutions if x[0] in dream_only_CIDs]
all_CID_dilutions = dream_only_CID_dilutions + drav_only_CID_dilutions



In [11]:

    
X,good1,good2,means,stds,imputer = dream.make_X(dragon_all,None,CID_dilutions=all_CID_dilutions)









    



The X matrix now has shape (884x3115) molecules by non-NaN good molecular descriptors



In [12]:

    
for dream_desc,drav_desc in [('Sweet','SWEET'),('Garlic','GARLIC, ONION'),('Bakery','VANILLA')]:
    d1 = dream_data[dream_desc].loc[shared_CIDs]
    d2 = drav_data[drav_desc].loc[shared_CIDs]
    pr = pearsonr(d1,d2)[0]
    sr = spearmanr(d1,d2)[0]
    print('%.2f (%.2f): %s vs %s' % (pr,sr,dream_desc,drav_desc))









    



0.82 (0.86): Sweet vs SWEET
0.80 (0.54): Garlic vs GARLIC, ONION
0.82 (0.51): Bakery vs VANILLA



In [13]:

    
def intensity_dilutions(CID_dilutions):
    int_CID_dilutions = []
    CIDs = list(set([x[0] for x in CID_dilutions]))
    for CID in CIDs:
        dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
        rel_dilutions = np.abs(dilutions + 3)
        int_CID_dilutions += [(CID,dilutions[rel_dilutions.argmin()])]
    return int_CID_dilutions

def max_dilutions(CID_dilutions):
    max_CID_dilutions = []
    CIDs = list(set([x[0] for x in CID_dilutions]))
    for CID in CIDs:
        dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
        max_CID_dilutions += [(CID,dilutions.max())]
    return max_CID_dilutions



In [33]:

    
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
kinds = {'shared1':shared_CID_dilutions,
         'shared2':shared_CID_dilutions,
         'drav_only':drav_only_CID_dilutions,
        }
dream_drav = {'Sweet':'SWEET','Garlic':'GARLIC, ONION','Flower':'FLORAL','Bakery':'VANILLA'}
rs = pd.DataFrame(columns=kinds,index=dream_drav.items())
for col,(dream_desc,drav_desc) in enumerate(dream_drav.items()):#descriptors):
    print(dream_desc,drav_desc)
    Y_dream = df_dream['Subject'].loc[dream_desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
    Y_drav = drav_data[drav_desc]
    rfc = RandomForestRegressor(n_estimators=100, max_features=max_features[col], max_depth=max_depth[col], 
                                min_samples_leaf=min_samples_leaf[col], random_state=0)
    X_train = X.loc[dream_only_CID_dilutions]
    Y_train = Y_dream.loc[dream_only_CID_dilutions]
    rfc.fit(X_train,Y_train)
    for kind,CID_dilutions in kinds.items():
        if dream_desc == 'Intensity':
            CID_dilutions = intensity_dilutions(CID_dilutions)
        else:
            CID_dilutions = max_dilutions(CID_dilutions)
        X_test = X.loc[CID_dilutions]
        Y_ = Y_dream if kind=='shared1' else Y_drav
        CIDs = [x[0] for x in CID_dilutions]
        Y_test = Y_.loc[CID_dilutions if kind=='shared1' else CIDs]
        Y_predicted = rfc.predict(X_test)
        rs[kind][(dream_desc,drav_desc)] = np.corrcoef(Y_predicted,Y_test)[0,1]









    



Garlic GARLIC, ONION
Bakery VANILLA
Sweet SWEET
Flower FLORAL



In [32]:

    
rs # 30 trees









    Out[32]:






  
    
      
      shared1
      drav_only
      shared2
    
  
  
    
      (Garlic, GARLIC, ONION)
      0.652968
      0.769221
      0.59541
    
    
      (Bakery, VANILLA)
      0.644034
      0.338058
      0.775768
    
    
      (Sweet, SWEET)
      0.618017
      0.606856
      0.59415
    
    
      (Flower, FLORAL)
      0.486432
      0.475694
      0.385654



In [34]:

    
rs # 100 trees









    Out[34]:






  
    
      
      shared1
      drav_only
      shared2
    
  
  
    
      (Garlic, GARLIC, ONION)
      0.682178
      0.81974
      0.641363
    
    
      (Bakery, VANILLA)
      0.704299
      0.381339
      0.822294
    
    
      (Sweet, SWEET)
      0.669137
      0.623467
      0.626174
    
    
      (Flower, FLORAL)
      0.505666
      0.513412
      0.472563

	FRUITY, CITRUS	LEMON	GRAPEFRUIT	ORANGE	FRUITY, OTHER THAN CITRUS	PINEAPPLE	GRAPE JUICE	STRAWBERRY	APPLE (FRUIT)	PEAR	...	PUTRID, FOUL, DECAYED	FECAL (LIKE MANURE)	CADAVEROUS (DEAD ANIMAL)	SICKENING	DRY, POWDERY	CHALKY	LIGHT	HEAVY	COOL, COOLING	WARM
CID
240	5.04	3.60	0.72	1.44	22.30	2.16	0.72	5.76	0.00	1.44	...	0.72	0.00	0.00	5.04	5.04	0.72	20.86	27.34	13.67	21.58
264	0.00	0.71	0.00	0.00	0.71	0.00	0.00	0.00	0.00	0.00	...	47.86	35.00	13.57	65.71	1.43	0.00	2.86	38.57	0.00	11.43
323	2.86	3.57	0.00	0.00	14.29	3.57	0.71	2.86	1.43	0.71	...	3.57	0.00	0.00	7.14	7.86	1.43	24.29	13.57	12.14	20.71
326	10.71	16.43	2.14	5.00	7.14	1.43	0.00	2.14	0.00	2.14	...	6.43	0.00	0.71	20.00	12.14	2.14	16.43	24.29	14.29	17.86
342	0.00	0.00	0.00	0.00	2.16	0.72	0.00	0.00	0.00	0.72	...	2.88	4.32	2.16	14.39	12.95	0.00	12.23	24.46	7.19	14.39

	MW	AMW	Sv	Se	Sp	Si	Mv	Me	Mp	Mi	...	Psychotic-80	Psychotic-50	Hypertens-80	Hypertens-50	Hypnotic-80	Hypnotic-50	Neoplastic-80	Neoplastic-50	Infective-80	Infective-50
CID
342	108.15	6.759	9.822	15.862	10.500	17.870	0.614	0.991	0.656	1.117	...	0	0	0	0	0	0	0	0	0	0
798	117.16	7.323	10.602	15.753	11.290	17.744	0.663	0.985	0.706	1.109	...	0	0	0	0	0	0	0	0	0	0
999	136.16	7.564	11.537	18.189	11.955	20.080	0.641	1.011	0.664	1.116	...	0	0	0	0	0	0	0	0	0	0
1127	88.19	6.784	7.294	12.611	8.693	14.581	0.561	0.970	0.669	1.122	...	0	0	0	0	0	0	0	0	0	0
1140	92.15	6.143	9.107	14.534	10.046	16.661	0.607	0.969	0.670	1.111	...	0	0	0	0	0	0	0	0	0	0

	XMOD	GATS4i	Chi1_EA(ri)	R2e	SpMax4_Bh(p)	SsssPbH	CATS2D_08_DL	B09[C-P]	Mor05v	P2v	...	CSI	F04[I-B]	WiA_H2	TDB09u	F07[P-Si]	F09[O-Cl]	F10[C-F]	HATS2e	Mor30u	SM3_Dz(e)
CID
126	27.239	1.015	4.402	0.887	2.260	0.0	0	0	-1.534	0.255	...	77	0	0.364	0.000	0	0	0	0.204	0.043	7.885
176	11.547	0.000	1.506	1.529	0.380	0.0	0	0	-0.398	0.297	...	9	0	0.625	0.000	0	0	0	1.196	0.065	3.596
177	9.192	0.000	0.913	1.672	0.380	0.0	0	0	-0.326	0.206	...	6	0	0.750	0.000	0	0	0	1.505	0.047	1.729
180	10.970	0.000	1.408	1.761	0.380	0.0	0	0	-0.567	0.306	...	9	0	0.625	0.000	0	0	0	0.991	0.077	3.804
196	30.065	0.880	4.461	1.590	2.185	0.0	0	0	-1.387	0.074	...	100	0	0.290	0.054	0	0	0	0.440	0.103	9.676

Descriptor	Intensity	Pleasantness	Bakery	Sweet	Fruit	Fish	Garlic	Spices	Cold	Sour	...	Acid	Warm	Musky	Sweaty	Ammonia	Decayed	Wood	Grass	Flower	Chemical
CID
126	49.551020	48.956522	0.630435	24.347826	6.239130	0.282609	1.608696	3.652174	4.000000	7.500000	...	2.847826	0.913043	8.304348	1.500000	1.804348	4.978261	1.260870	1.347826	9.260870	14.239130
176	11.551020	48.461538	2.538462	6.692308	1.230769	0.000000	7.192308	8.230769	5.769231	15.307692	...	3.500000	4.230769	6.769231	7.153846	3.615385	3.038462	1.807692	1.961538	4.115385	3.384615
177	33.265306	45.315789	8.421053	20.078947	1.763158	0.000000	0.631579	2.500000	7.000000	10.447368	...	4.447368	4.526316	3.421053	1.947368	3.789474	3.263158	1.026316	4.000000	2.184211	18.789474
180	16.714286	55.437500	3.375000	15.531250	6.468750	2.281250	2.593750	4.375000	7.625000	9.031250	...	3.218750	10.031250	7.531250	0.031250	5.281250	6.218750	8.625000	6.593750	12.625000	10.500000
196	22.183673	53.218750	1.750000	18.312500	1.312500	0.000000	1.906250	6.156250	5.562500	6.218750	...	3.218750	9.437500	6.781250	6.093750	2.781250	6.906250	1.187500	2.218750	9.625000	7.750000

	shared1	drav_only	shared2
(Garlic, GARLIC, ONION)	0.652968	0.769221	0.59541
(Bakery, VANILLA)	0.644034	0.338058	0.775768
(Sweet, SWEET)	0.618017	0.606856	0.59415
(Flower, FLORAL)	0.486432	0.475694	0.385654

	shared1	drav_only	shared2
(Garlic, GARLIC, ONION)	0.682178	0.81974	0.641363
(Bakery, VANILLA)	0.704299	0.381339	0.822294
(Sweet, SWEET)	0.669137	0.623467	0.626174
(Flower, FLORAL)	0.505666	0.513412	0.472563