In [1]:
# Preliminaries to work with the data.
%matplotlib inline
%run __init__.py
from utils import loading, scoring
from gerkin import dream,params,fit2,dravnieks
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr,spearmanr
import scipy.io as sio
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit
In [2]:
dream_CIDs = loading.get_CIDs(['training','leaderboard','testset'])
dream_CID_dilutions = loading.get_CID_dilutions(['training','leaderboard','testset'])
In [3]:
df_dream = loading.load_perceptual_data(['training','leaderboard','testset'])
# Average over replicates, then pick lowest dilution
low_dilutions = df_dream.sort_index(level=['Descriptor','CID','Dilution']).groupby(level=['Descriptor','CID','Dilution'])\
.mean().fillna(999).groupby(level=['Descriptor','CID']).last().replace(999,float('NaN'))
descriptors = loading.get_descriptors(format=True)
In [4]:
dream_features = loading.get_molecular_data(['dragon','morgan'],dream_CIDs)
In [5]:
drav_cas = pd.read_table(os.path.join('../../data','dravnieks_cas.txt'),delimiter=' ',index_col=0)
drav_descriptors = pd.read_table(os.path.join('../../data','dravnieks_descriptors.txt'),index_col=None,header=0,names=['Descriptor'])
drav_descriptors['Descriptor'] = [' '.join(x.split(' ')[1:]) for x in drav_descriptors['Descriptor']]
drav_data_raw = sio.loadmat(os.path.join('../../data','dravnieks.mat'))
drav_data = pd.DataFrame(drav_data_raw['pu'].T,index=drav_cas['C.A.S.'],columns=drav_descriptors['Descriptor'])
drav_cas_cid = pd.read_table(os.path.join('../../data','dravnieks_cas_cid.txt'),header=None,index_col=0,names=['CID'])
drav_cas_cid[:] = drav_cas_cid.values.astype(int)
drav_cas_cid.index.name = 'C.A.S.'
drav_data = drav_data.join(drav_cas_cid,how='left').set_index('CID') # Join with CIDs
drav_data.drop_duplicates(inplace=True) # Remove duplicate data
drav_data = drav_data[drav_data.index>0]#.sort_index() # Remove negative indices
drav_data = drav_data.groupby(level='CID').first() # Remove duplicate indices
drav_data.head()
Out[5]:
In [36]:
drav_data.to_csv('/Users/rgerkin/Desktop/dravnieks.csv')
In [6]:
drav_dragon = pd.read_table('../../data/dragondravnieks.txt',index_col=1).drop('No.',axis=1)
drav_dragon.index.name = 'CID'
drav_dragon.head()
Out[6]:
In [7]:
shared_features = list(set(drav_dragon).intersection(dream_features['dragon']))
dragon_all = pd.concat((drav_dragon,dream_features['dragon']))[shared_features].sort_index()
dragon_all.head()
Out[7]:
In [8]:
dream_data = low_dilutions.mean(skipna=True,axis=1).unstack('Descriptor')[descriptors]
dream_data.head()
Out[8]:
In [9]:
shared_CIDs = list(set(dream_data.index).intersection(drav_data.index))
print("There are %d molecules in common between DREAM and Dravnieks" % len(shared_CIDs))
drav_only_CIDs = list(set(drav_data.index).difference(dream_data.index))
drav_only_CIDs.remove(122510) # Couldn't compute Dragon features for this one.
print("There are %d molecules in Dravnieks that are not in DREAM" % len(drav_only_CIDs))
dream_only_CIDs = list(set(dream_CIDs).difference(shared_CIDs))
print("There are %d molecules in DREAM that are not in Dravnieks" % len(dream_only_CIDs))
In [27]:
shared_CID_dilutions = [x for x in dream_CID_dilutions if x[0] in shared_CIDs]
drav_only_CID_dilutions = [(x,-3) for x in drav_only_CIDs]
dream_only_CID_dilutions = [x for x in dream_CID_dilutions if x[0] in dream_only_CIDs]
all_CID_dilutions = dream_only_CID_dilutions + drav_only_CID_dilutions
In [11]:
X,good1,good2,means,stds,imputer = dream.make_X(dragon_all,None,CID_dilutions=all_CID_dilutions)
In [12]:
for dream_desc,drav_desc in [('Sweet','SWEET'),('Garlic','GARLIC, ONION'),('Bakery','VANILLA')]:
d1 = dream_data[dream_desc].loc[shared_CIDs]
d2 = drav_data[drav_desc].loc[shared_CIDs]
pr = pearsonr(d1,d2)[0]
sr = spearmanr(d1,d2)[0]
print('%.2f (%.2f): %s vs %s' % (pr,sr,dream_desc,drav_desc))
In [13]:
def intensity_dilutions(CID_dilutions):
int_CID_dilutions = []
CIDs = list(set([x[0] for x in CID_dilutions]))
for CID in CIDs:
dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
rel_dilutions = np.abs(dilutions + 3)
int_CID_dilutions += [(CID,dilutions[rel_dilutions.argmin()])]
return int_CID_dilutions
def max_dilutions(CID_dilutions):
max_CID_dilutions = []
CIDs = list(set([x[0] for x in CID_dilutions]))
for CID in CIDs:
dilutions = np.array([dilution for CID_,dilution in CID_dilutions if CID_==CID])
max_CID_dilutions += [(CID,dilutions.max())]
return max_CID_dilutions
In [33]:
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
kinds = {'shared1':shared_CID_dilutions,
'shared2':shared_CID_dilutions,
'drav_only':drav_only_CID_dilutions,
}
dream_drav = {'Sweet':'SWEET','Garlic':'GARLIC, ONION','Flower':'FLORAL','Bakery':'VANILLA'}
rs = pd.DataFrame(columns=kinds,index=dream_drav.items())
for col,(dream_desc,drav_desc) in enumerate(dream_drav.items()):#descriptors):
print(dream_desc,drav_desc)
Y_dream = df_dream['Subject'].loc[dream_desc,:].mean(level=['CID','Dilution']).mean(skipna=True,axis=1)
Y_drav = drav_data[drav_desc]
rfc = RandomForestRegressor(n_estimators=100, max_features=max_features[col], max_depth=max_depth[col],
min_samples_leaf=min_samples_leaf[col], random_state=0)
X_train = X.loc[dream_only_CID_dilutions]
Y_train = Y_dream.loc[dream_only_CID_dilutions]
rfc.fit(X_train,Y_train)
for kind,CID_dilutions in kinds.items():
if dream_desc == 'Intensity':
CID_dilutions = intensity_dilutions(CID_dilutions)
else:
CID_dilutions = max_dilutions(CID_dilutions)
X_test = X.loc[CID_dilutions]
Y_ = Y_dream if kind=='shared1' else Y_drav
CIDs = [x[0] for x in CID_dilutions]
Y_test = Y_.loc[CID_dilutions if kind=='shared1' else CIDs]
Y_predicted = rfc.predict(X_test)
rs[kind][(dream_desc,drav_desc)] = np.corrcoef(Y_predicted,Y_test)[0,1]
In [32]:
rs # 30 trees
Out[32]:
In [34]:
rs # 100 trees
Out[34]: