In [2]:
# Preliminaries to work with the data.
%matplotlib inline
%run __init__.py
from utils import loading, scoring
from gerkin import dream,params
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
DATA = '../../data/'
In [3]:
# Load the data
descriptors = loading.get_descriptors(format='True')
sets = ['training','leaderboard']
all_CIDs = []
for set_ in sets:
all_CIDs += loading.get_CIDs(set_)
all_CIDs = sorted(all_CIDs)
mdx = dream.get_molecular_data(['dragon','episuite','morgan','nspdk','gramian',],all_CIDs)
mdx_onlydragon = dream.get_molecular_data(['dragon',],all_CIDs)
In [17]:
# Create the feature and descriptor arrays
X_forest,good1,good2,means,stds,imputer = dream.make_X(mdx,sets)
X_forest_onlydragon,good1_onlydragon,good2_onlydragon,means_onlydragon,stds_onlydragon,imputer_onlydragon = dream.make_X(mdx_onlydragon,sets)
# -1 removes the CID; -6 removes six NaN-heavy episuite features; +2 adds the dilution information
assert len(good1) == len(mdx[0]) -1 +2 -6
Y_all,imputer = dream.make_Y_obs(sets,target_dilution=None,imputer='mask')
In [18]:
# Load or compute the random forest model correlations (obtained from cross-validation)
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
#trans_params = params.get_trans_params(Y_all, descriptors, plot=False)
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()
def compute_importance_ranks(X,Y,n_estimators=50,
max_features='auto',
max_depth=None,min_samples_leaf=1,
random_state=0):
importances = np.zeros((21,X.shape[1])) # Empty matrix to store feature importances.
importance_ranks = np.zeros((21,X.shape[1])) # Empty matrix to store feature importance ranks.
for col in range(0,21): # For each descriptor.
print(col)
observed = Y[:,col] # Perceptual data for this descriptor.
if col==0:
est = ExtraTreesRegressor(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth,
min_samples_leaf=min_samples_leaf,n_jobs=8,random_state=random_state)
else:
est = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth,
min_samples_leaf=min_samples_leaf,oob_score=False,n_jobs=8,random_state=random_state)
est.fit(X,observed) # Fit the model on the training data.
importances[col,:] = est.feature_importances_
importance_ranks[col,:] = np.argsort(est.feature_importances_)[::-1] # Use feature importances to get ranks.
return importances,importance_ranks
if False:
importances,importance_ranks = compute_importance_ranks(X_forest[:,:-1],Y_all['mean_std'],n_estimators=50)
np.save('../../data/importances_forest',importances)
np.save('../../data/importance_ranks_forest',importance_ranks)
else:
importances = np.load('../../data/importances_forest.npy')
importance_ranks = np.load('../../data/importance_ranks_forest.npy')
if True:
importances_onlydragon,importance_ranks_onlydragon = compute_importance_ranks(X_forest_onlydragon[:,:-1],Y_all['mean_std'],n_estimators=50)
np.save('../../data/importances_forest_onlydragon',importances_onlydragon)
np.save('../../data/importance_ranks_forest_onlydragon',importance_ranks_onlydragon)
else:
importances_onlydragon = np.load('../../data/importances_forest_onlydragon.npy')
importance_ranks_onlydragon = np.load('../../data/importance_ranks_forest_onlydragon.npy')
In [5]:
nspdk_CIDs = pd.read_csv('%s/derived/nspdk_cid.csv' % DATA, header=None, dtype='int').as_matrix().squeeze()
nspdk_dict = dream.make_nspdk_dict(all_CIDs)
nspdk_feature_numbers = list(nspdk_dict.keys())
x = pd.read_table('%s/DREAM_episuite_descriptors.txt' % DATA,index_col=0).drop('SMILES',1)
x = x.loc[all_CIDs]
x.iloc[:,47] = 1*(x.iloc[:,47]=='YES ')
episuite_names = list(x)
episuite = x.as_matrix()
_,good = dream.purge1_X(episuite)
episuite_names = [e for i,e in enumerate(episuite_names) if i in good]
with open('%s/morgan_sim.csv' % DATA) as f:
x = f.readline()
morgan_template_CIDs = [int(xi) for xi in x.split(',')[1:]]
assert len(morgan_template_CIDs) == len(nspdk_CIDs) == 2437
In [6]:
from utils import loading
headers,_ = loading.load_molecular_data()
dragon_feature_names = headers[1:]
# Replace with nspdkgramian range with nspdk_CIDs
all_feature_names = ['dragon_%s' % s for s in dragon_feature_names] + \
['episuite_%s' % x for x in episuite_names] + \
['morgan_%d' % x for x in morgan_template_CIDs] + \
['nspdk_%s' % s for s in nspdk_feature_numbers] + \
['nspdkgramian_%d' % i for i in range(2437)] + ['conc_absolute','conc_relative']
good_feature_names = [all_feature_names[i] for i in good2]
good_feature_names = good_feature_names[:-1] # Remove relative dilution since we didn't use this for the fit
assert(len(good_feature_names) == importance_ranks.shape[1])
In [7]:
new_ranks = pd.DataFrame(importances.T,index=good_feature_names,columns=descriptors)
old_ranks = pd.read_csv('/Users/rgerkin/Desktop/feature_importances.csv',index_col=0)
old_ranks.columns = descriptors
old_ranks.drop('conc_relative',inplace=True)
#old_ranks.drop([s for s in list(old_ranks.index) if 'episuite_' in s],inplace=True)
new_ranks.drop('conc_absolute',inplace=True)
old_ranks.drop('conc_absolute',inplace=True)
In [8]:
desc = 'Pleasantness'
from IPython.display import display
#pd.set_option('display.precision',2)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
display(new_ranks.sort_values(desc,ascending=False)[[desc]].head(10))
display(old_ranks.sort_values(desc,ascending=False)[[desc]].head(10))
In [9]:
nspdk_CIDs[1249],nspdk_CIDs[764],nspdk_CIDs[390]
Out[9]:
In [10]:
diff = list(set(list(old_ranks.index)).difference(new_ranks.index))
old_ranks_2 = old_ranks.drop(diff,0).loc[new_ranks.index]
fig,axes = plt.subplots(7,3,figsize=(7,16))
for i,desc in enumerate(descriptors):
old_sorted = list(old_ranks_2.sort_values(desc,ascending=False).index)
new_sorted = list(new_ranks.sort_values(desc,ascending=False).index)
print(desc)
for j in range(10):
old_name = old_sorted[j]
print("\t%d -> %d (%s)" % (j,new_sorted.index(old_name),old_name))
ax = axes.flat[i]
#print(desc)
old = old_ranks_2[desc]
new = new_ranks[desc]
ax.scatter(old,new)
ax.set_xlim(0.0001,1)
ax.set_ylim(0.0001,1)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title(desc)
plt.tight_layout()
In [11]:
for i,desc in enumerate(descriptors):
new_sorted = list(new_ranks.sort_values(desc,ascending=False).index)
print(desc)
for j in range(5):
new_name = new_sorted[j]
if 'nspdkgramian_' in new_name:
_,index = new_name.split('_')
cid = nspdk_CIDs[int(index)]
new_name = 'nspdkgramian_%d' % cid
print("\t%d) %s" % (j+1,new_name))
In [15]:
for i,desc in enumerate(descriptors):
old_sorted = list(old_ranks.sort_values(desc,ascending=False).index)
print(desc)
for j in range(20):
old_name = old_sorted[j]
if 'nspdkgramian_' in old_name:
_,index = old_name.split('_')
cid = nspdk_CIDs[int(index)]
new_name = 'nspdkgramian_%d' % cid
else:
new_name = old_name
print("\t%d) %s" % (j+1,new_name))
In [22]:
from utils import loading
headers,_ = loading.load_molecular_data()
dragon_feature_names = headers[1:]
all_feature_names_onlydragon = ['dragon_%s' % s for s in dragon_feature_names] + ['conc_absolute','conc_relative']
good_feature_names_onlydragon = [all_feature_names_onlydragon[i] for i in good2_onlydragon]
good_feature_names_onlydragon = good_feature_names_onlydragon[:-1] # Remove relative dilution since we didn't use this for the fit
assert(len(good_feature_names_onlydragon) == importance_ranks_onlydragon.shape[1])
new_ranks_onlydragon = pd.DataFrame(importances_onlydragon.T,index=good_feature_names_onlydragon,columns=descriptors)
new_ranks_onlydragon.drop('conc_absolute',inplace=True)
for i,desc in enumerate(descriptors):
new_sorted_onlydragon = list(new_ranks_onlydragon.sort_values(desc,ascending=False).index)
print(desc)
for j in range(20):
name = new_sorted_onlydragon[j]
print("\t%d) %s" % (j+1,name))
In [4]:
nspdk_dict = dream.make_nspdk_dict(all_CIDs)
In [9]:
x = 0
for key,value in list(nspdk_dict.items()):
x += len(value)
x /= len(nspdk_dict)
x
Out[9]:
In [8]:
len(nspdk_dict)
Out[8]:
In [ ]: