Han, Kehang (hkh12@mit.edu)
This notebook is designed to use a big set of tricyclics for testing the performance of new polycyclics thermo estimator. Currently the dataset contains 2903 tricyclics that passed isomorphic check.
In [ ]:
from rmgpy.data.rmg import RMGDatabase
from rmgpy import settings
from rmgpy.species import Species
from rmgpy.molecule import Molecule
from rmgpy.molecule import Group
from rmgpy.rmg.main import RMG
from rmgpy.cnn_framework.predictor import Predictor
from IPython.display import display
import numpy as np
import os
import pandas as pd
from pymongo import MongoClient
import logging
logging.disable(logging.CRITICAL)
In [ ]:
from bokeh.charts import Histogram
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()
In [ ]:
host = 'mongodb://user:user@rmg.mit.edu/admin'
port = 27018
client = MongoClient(host, port)
In [ ]:
db = getattr(client, 'sdata134k')
db.collection_names()
In [ ]:
def get_data(db, collection_name):
collection = getattr(db, collection_name)
db_cursor = collection.find()
# collect data
print('reading data...')
db_mols = []
for db_mol in db_cursor:
db_mols.append(db_mol)
print('done')
return db_mols
In [ ]:
model = '/home/mjliu/Code/RMG-Py/examples/cnn/evaluate/test_model'
h298_predictor = Predictor()
predictor_input = os.path.join(model,
'predictor_input.py')
h298_predictor.load_input(predictor_input)
param_path = os.path.join(model,
'saved_model',
'full_train.h5')
h298_predictor.load_parameters(param_path)
In [ ]:
# fetch testing dataset
collection_name = 'small_cyclic_table'
db_mols = get_data(db, collection_name)
print len(db_mols)
In [ ]:
filterList = [
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,[S,D,T]} {9,[S,D,T]}
2 R u0 p0 c0 {1,[S,D,T]} {3,[S,D,T]}
3 R u0 p0 c0 {2,[S,D,T]} {4,[S,D,T]}
4 R u0 p0 c0 {3,[S,D,T]} {5,[S,D,T]}
5 R u0 p0 c0 {4,[S,D,T]} {6,[S,D,T]}
6 R u0 p0 c0 {5,[S,D,T]} {7,[S,D,T]}
7 R u0 p0 c0 {6,[S,D,T]} {8,[S,D,T]}
8 R u0 p0 c0 {7,[S,D,T]} {9,[S,D,T]}
9 R u0 p0 c0 {1,[S,D,T]} {8,[S,D,T]}
"""),
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {5,S}
2 R u0 p0 c0 {1,S} {3,D}
3 R u0 p0 c0 {2,D} {4,S}
4 R u0 p0 c0 {3,S} {5,S}
5 R u0 p0 c0 {1,S} {4,S} {6,S} {9,S}
6 R u0 p0 c0 {5,S} {7,S}
7 R u0 p0 c0 {6,S} {8,D}
8 R u0 p0 c0 {7,D} {9,S}
9 R u0 p0 c0 {5,S} {8,S}
"""),
]
In [ ]:
test_size = 0
R = 1.987 # unit: cal/mol/K
validation_test_dict = {} # key: spec.label, value: (thermo_heuristic, thermo_qm)
spec_labels = []
spec_dict = {}
H298s_qm = []
Cp298s_qm = []
H298s_cnn = []
Cp298s_cnn = []
for db_mol in db_mols:
smiles_in = str(db_mol["SMILES_input"])
spec_in = Species().fromSMILES(smiles_in)
for grp in filterList:
if spec_in.molecule[0].isSubgraphIsomorphic(grp):
break
else:
spec_labels.append(smiles_in)
# qm: just free energy but not free energy of formation
G298_qm = float(db_mol["G298"])*627.51 # unit: kcal/mol
H298_qm = float(db_mol["Hf298(kcal/mol)"]) # unit: kcal/mol
Cv298_qm = float(db_mol["Cv298"]) # unit: cal/mol/K
Cp298_qm = Cv298_qm + R # unit: cal/mol/K
H298s_qm.append(H298_qm)
# cnn
H298_cnn = h298_predictor.predict(spec_in.molecule[0]) # unit: kcal/mol
H298s_cnn.append(H298_cnn)
spec_dict[smiles_in] = spec_in
In [ ]:
# create pandas dataframe
validation_test_df = pd.DataFrame(index=spec_labels)
validation_test_df['H298_cnn(kcal/mol)'] = pd.Series(H298s_cnn, index=validation_test_df.index)
validation_test_df['H298_qm(kcal/mol)'] = pd.Series(H298s_qm, index=validation_test_df.index)
heuristic_qm_diff = abs(validation_test_df['H298_cnn(kcal/mol)']-validation_test_df['H298_qm(kcal/mol)'])
validation_test_df['H298_cnn_qm_diff(kcal/mol)'] = pd.Series(heuristic_qm_diff, index=validation_test_df.index)
display(validation_test_df.head())
print "Validation test dataframe has {0} tricyclics.".format(len(spec_labels))
validation_test_df['H298_cnn_qm_diff(kcal/mol)'].describe()
In [ ]:
diff20_df = validation_test_df[(validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] > 15)
& (validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] <= 500)]
len(diff20_df)
In [ ]:
print len(diff20_df)
for smiles in diff20_df.index:
print "***********cnn = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_cnn(kcal/mol)'])
print "***********qm = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_qm(kcal/mol)'])
spe = spec_dict[smiles]
display(spe)
In [ ]:
p = figure(plot_width=500, plot_height=400)
# plot_df = validation_test_df[validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] < 10]
plot_df = validation_test_df
# add a square renderer with a size, color, and alpha
p.circle(plot_df['H298_cnn(kcal/mol)'], plot_df['H298_qm(kcal/mol)'],
size=5, color="green", alpha=0.5)
x = np.array([-50, 200])
y = x
p.line(x=x, y=y, line_width=2, color='#636363')
p.line(x=x, y=y+10, line_width=2,line_dash="dashed", color='#bdbdbd')
p.line(x=x, y=y-10, line_width=2, line_dash="dashed", color='#bdbdbd')
p.xaxis.axis_label = "H298 CNN (kcal/mol)"
p.yaxis.axis_label = "H298 Quantum (kcal/mol)"
p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"
p.xaxis.axis_label_text_font_size = "16pt"
p.yaxis.axis_label_text_font_size = "16pt"
p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"
show(p)
In [ ]:
len(plot_df.index)
In [ ]:
from bokeh.models import Range1d
hist = Histogram(validation_test_df,
values='Cp298_heuristic_qm_diff(cal/mol/K)', xlabel='Cp Prediction Error (cal/mol/K)',
ylabel='Number of Testing Molecules',
bins=50,\
plot_width=500, plot_height=300)
# hist.y_range = Range1d(0, 1640)
hist.x_range = Range1d(0, 20)
show(hist)
In [ ]:
with open('validation_test_sdata134k_2903_pyPoly_dbPoly.csv', 'w') as fout:
validation_test_df.to_csv(fout)
In [ ]: