In [1]:
%matplotlib inline
import os
import sys
curr_path = os.getcwd()
gerkin_path = os.path.split(curr_path)[0]
olfaction_prediction_path = os.path.split(gerkin_path)[0]
sys.path.append(olfaction_prediction_path)
import opc_python
import numpy as np
import matplotlib.pyplot as plt
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream,fit1,fit2,params
In [2]:
# Get all Chemical IDs and load the Dragon molecular descriptor data.
all_CIDs = sorted(loading.get_CIDs('training')+loading.get_CIDs('leaderboard')+loading.get_CIDs('testset'))
mdx = dream.get_molecular_data(['dragon'],all_CIDs)
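A quick sanity check (a sketch; it assumes mdx is a sized container such as a list of per-molecule records) is to compare the number of CIDs against the number of molecular records:
In [ ]:
# Sketch (assumption): mdx supports len(); compare it against the CID count.
print('CIDs: %d' % len(all_CIDs))
print('Molecular records: %d' % len(mdx))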
In [18]:
# Create the feature matrices from the feature dicts.
X_training,good1,good2,means,stds,imputer = dream.make_X(mdx,"training")
X_all,good1,good2,means,stds,imputer = dream.make_X(mdx,['training','leaderboard'],good1=good1,good2=good2,means=means,stds=stds)
X_testset_int,good1,good2,means,stds,imputer = dream.make_X(mdx,['testset'],target_dilution=-3,good1=good1,good2=good2,means=means,stds=stds)
X_testset_other,good1,good2,means,stds,imputer = dream.make_X(mdx,['testset'],target_dilution='high',good1=good1,good2=good2,means=means,stds=stds)
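Assuming make_X returns NumPy arrays (they are used only as feature matrices below), a minimal sketch to confirm that all splits share the same feature columns:
In [ ]:
# Sketch (assumption): each X_* is a NumPy array with a .shape attribute.
for name, X in [('training', X_training), ('training+leaderboard', X_all),
                ('testset (intensity)', X_testset_int), ('testset (other)', X_testset_other)]:
    print('%s: %s' % (name, X.shape))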
In [4]:
# Create descriptor matrices for the combined training and leaderboard sets.
# One is done with median imputation, and the other by masking missing values.
Y_all_imp,imputer = dream.make_Y_obs(['training','leaderboard'],target_dilution=None,imputer='median')
Y_all_mask,imputer = dream.make_Y_obs(['training','leaderboard'],target_dilution=None,imputer='mask')
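The observed perceptual data come back as dict-like objects; the 'mean_std' entry used for fitting below can be inspected (a sketch, assuming the entries are NumPy arrays):
In [ ]:
# Sketch: list the available keys and the shape of the 'mean_std' targets.
print(sorted(Y_all_imp.keys()))
print(Y_all_imp['mean_std'].shape, Y_all_mask['mean_std'].shape)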
In [5]:
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data('training')
# Extract the descriptor names from the perceptual headers; the second descriptor keeps the text after its '/'.
descriptors = [x.split('/')[1 if i==1 else 0] for i,x in enumerate(perceptual_headers[6:])]
trans_params = params.get_trans_params(Y_all_mask,descriptors,plot=True)
In [6]:
# Load optimal parameters (obtained from extensive cross-validation).
def get_params(i):
    return {col: params.best[col][i] for col in range(42)}
use_et,max_features,max_depth,min_samples_leaf,regularize,use_mask = [get_params(i) for i in range(6)]
trans_weight = regularize.copy()
for i in range(21):
    trans_weight[i] = regularize[i+21]
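Each of the six objects built above is a dict mapping one of the 42 target columns (presumably 21 descriptor means followed by 21 standard deviations, given the trans_weight bookkeeping) to its cross-validated value; a quick peek (sketch):
In [ ]:
# Sketch: show the tuned hyperparameters for the first few target columns.
for col in range(3):
    print(col, use_et[col], max_features[col], max_depth[col],
          min_samples_leaf[col], trans_weight[col])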
In [7]:
import pandas
# Load the pre-computed results from the "PredInsights_Fdiffodor.txt" file for comparison.
df_static = pandas.read_csv('../../data/PredInsights_Fdiffodor.txt',
                            delimiter='\t')
# Take the features listed in the first 10 rows as the 'bad' features to exclude.
bad_features = list(df_static[:10]['Feature'])
molecular_headers, molecular_data = loading.load_molecular_data()
all_features = molecular_headers[1:]
bad_feature_indices = [all_features.index(bf) for bf in bad_features]
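A minimal check (sketch) that every flagged feature name was found in the Dragon headers and mapped to a column index:
In [ ]:
# Sketch: list the flagged ('bad') features and their column indices.
for bf, idx in zip(bad_features, bad_feature_indices):
    print('%s -> column %d' % (bf, idx))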
In [19]:
# Create the feature matrices again, this time excluding the 'bad' features.
X_training_nobad,good1,good2,means,stds,imputer = dream.make_X(mdx,"training",bad=bad_feature_indices)
X_all_nobad,good1,good2,means,stds,imputer = dream.make_X(mdx,['training','leaderboard'],bad=bad_feature_indices,good1=good1,good2=good2,means=means,stds=stds)
X_testset_int_nobad,good1,good2,means,stds,imputer = dream.make_X(mdx,['testset'],target_dilution=-3,bad=bad_feature_indices,good1=good1,good2=good2,means=means,stds=stds)
X_testset_other_nobad,good1,good2,means,stds,imputer = dream.make_X(mdx,['testset'],target_dilution='high',bad=bad_feature_indices,good1=good1,good2=good2,means=means,stds=stds)
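If make_X actually drops the flagged columns (rather than, say, zeroing them), the column counts of the two versions should differ; a sketch assuming NumPy arrays:
In [ ]:
# Sketch (assumption): the bad= argument removes columns from the feature matrix.
print('All features: %d columns' % X_all.shape[1])
print('Without flagged features: %d columns' % X_all_nobad.shape[1])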
In [23]:
# Fit models on all available data.
# Ignore the warning that arises if too few trees are used.
# Ignore the intensity score, which is based on within-sample validation
# due to the use of ExtraTreesClassifier.
n_estimators = 500
rfcs_original,score,rs = fit2.rfc_final(X_all,Y_all_imp['mean_std'],Y_all_mask['mean_std'],
max_features,min_samples_leaf,max_depth,use_et,use_mask,
trans_weight,trans_params,n_estimators=n_estimators)
In [24]:
# Fit the same models with the flagged 'bad' features excluded.
rfcs_nobad,score,rs = fit2.rfc_final(X_all_nobad,Y_all_imp['mean_std'],Y_all_mask['mean_std'],
max_features,min_samples_leaf,max_depth,use_et,use_mask,
trans_weight,trans_params,n_estimators=n_estimators)
In [25]:
# Make challenge 2 testset prediction files from the original models.
loading.make_prediction_files(rfcs_original,X_testset_int,X_testset_other,'testset',2,name='testset_original',write=True,
trans_weight=trans_weight,trans_params=trans_params)
# Make challenge 2 testset prediction files from the models fit without the 'bad' features.
loading.make_prediction_files(rfcs_nobad,X_testset_int_nobad,X_testset_other_nobad,'testset',2,name='testset_nobad',write=True,
trans_weight=trans_weight,trans_params=trans_params)
Out[25]:
In [31]:
testset_CIDs = loading.get_CIDs('testset')
testset_CIDs.index(5862) # L-cysteine
Out[31]:
In [43]:
# Predict intensity and the other descriptors for L-cysteine (row 11, per the cell above).
print('INTENSITY: %.1f' % rfcs_original[0].predict(X_testset_int[11:12, :])[0])
for i in range(1, 21):
    print('%s: %.1f' % (descriptors[i],
                        rfcs_original[i].predict(X_testset_other[11:12, :])[0]))
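For comparison, the same L-cysteine predictions can be generated from the models fit without the flagged features (a sketch mirroring the cell above; row 11 refers to the same molecule because the no-bad matrices contain the same rows):
In [ ]:
# Sketch: repeat the L-cysteine predictions with the no-bad-features models.
print('INTENSITY: %.1f' % rfcs_nobad[0].predict(X_testset_int_nobad[11:12, :])[0])
for i in range(1, 21):
    print('%s: %.1f' % (descriptors[i],
                        rfcs_nobad[i].predict(X_testset_other_nobad[11:12, :])[0]))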
In [42]:
descriptors
Out[42]:
In [ ]: