Thermochemistry Validation Test

Han, Kehang (hkh12@mit.edu)

This notebook is designed to use a big set of tricyclics for testing the performance of new polycyclics thermo estimator. Currently the dataset contains 2903 tricyclics that passed isomorphic check.

Set up


In [ ]:
from rmgpy.data.rmg import RMGDatabase
from rmgpy import settings
from rmgpy.species import Species
from rmgpy.molecule import Molecule
from rmgpy.molecule import Group
from rmgpy.rmg.main import RMG
from rmgpy.cnn_framework.predictor import Predictor

from IPython.display import display
import numpy as np
import os
import pandas as pd
from pymongo import MongoClient
import logging
logging.disable(logging.CRITICAL)

In [ ]:
from bokeh.charts import Histogram
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

In [ ]:
host = 'mongodb://user:user@rmg.mit.edu/admin'
port = 27018
client = MongoClient(host, port)

In [ ]:
db = getattr(client, 'sdata134k')
db.collection_names()

In [ ]:
def get_data(db, collection_name):
    collection = getattr(db, collection_name)
    db_cursor = collection.find()

    # collect data
    print('reading data...')
    db_mols = []
    for db_mol in db_cursor:
        db_mols.append(db_mol)
    print('done')

    return db_mols

In [ ]:
model = '/home/mjliu/Code/RMG-Py/examples/cnn/evaluate/test_model'

h298_predictor = Predictor()

predictor_input = os.path.join(model,
                              'predictor_input.py')

h298_predictor.load_input(predictor_input)

param_path = os.path.join(model,
                         'saved_model',
                         'full_train.h5')
h298_predictor.load_parameters(param_path)

In [ ]:
# fetch testing dataset
collection_name = 'large_linear_polycyclic_table'
db_mols = get_data(db, collection_name)
db_mols.extend(get_data(db, 'large_fused_polycyclic_table'))
print len(db_mols)

Validation Test

Collect data from heuristic algorithm and qm library


In [ ]:
filterList = [
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,[S,D,T]} {9,[S,D,T]}
    2  R u0 p0 c0 {1,[S,D,T]} {3,[S,D,T]}
    3  R u0 p0 c0 {2,[S,D,T]} {4,[S,D,T]}
    4  R u0 p0 c0 {3,[S,D,T]} {5,[S,D,T]}
    5  R u0 p0 c0 {4,[S,D,T]} {6,[S,D,T]}
    6  R u0 p0 c0 {5,[S,D,T]} {7,[S,D,T]}
    7  R u0 p0 c0 {6,[S,D,T]} {8,[S,D,T]}
    8  R u0 p0 c0 {7,[S,D,T]} {9,[S,D,T]}
    9  R u0 p0 c0 {1,[S,D,T]} {8,[S,D,T]}
    """),
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {5,S}
    2  R u0 p0 c0 {1,S} {3,D}
    3  R u0 p0 c0 {2,D} {4,S}
    4  R u0 p0 c0 {3,S} {5,S}
    5  R u0 p0 c0 {1,S} {4,S} {6,S} {9,S}
    6  R u0 p0 c0 {5,S} {7,S}
    7  R u0 p0 c0 {6,S} {8,D}
    8  R u0 p0 c0 {7,D} {9,S}
    9  R u0 p0 c0 {5,S} {8,S}
    """),
]

In [ ]:
test_size = 0
R = 1.987 # unit: cal/mol/K
validation_test_dict = {} # key: spec.label, value: (thermo_heuristic, thermo_qm)

spec_labels = []
spec_dict = {}
H298s_qm = []
Cp298s_qm = []
H298s_cnn = []
Cp298s_cnn = []
for db_mol in db_mols:
    smiles_in = str(db_mol["SMILES_input"])
    spec_in = Species().fromSMILES(smiles_in)
    
    for grp in filterList:
        if spec_in.molecule[0].isSubgraphIsomorphic(grp):
            break
    else:
        spec_labels.append(smiles_in)

        # qm: just free energy but not free energy of formation
        G298_qm = float(db_mol["G298"])*627.51 # unit: kcal/mol
        H298_qm = float(db_mol["Hf298(kcal/mol)"]) # unit: kcal/mol
        Cv298_qm = float(db_mol["Cv298"]) # unit: cal/mol/K
        Cp298_qm = Cv298_qm + R # unit: cal/mol/K

        H298s_qm.append(H298_qm)

        # cnn
        H298_cnn = h298_predictor.predict(spec_in.molecule[0]) # unit: kcal/mol

        H298s_cnn.append(H298_cnn)

        spec_dict[smiles_in] = spec_in

Create pandas dataframe for easy data validation


In [ ]:
# create pandas dataframe
validation_test_df = pd.DataFrame(index=spec_labels)

validation_test_df['H298_cnn(kcal/mol)'] = pd.Series(H298s_cnn, index=validation_test_df.index)
validation_test_df['H298_qm(kcal/mol)'] = pd.Series(H298s_qm, index=validation_test_df.index)

heuristic_qm_diff = abs(validation_test_df['H298_cnn(kcal/mol)']-validation_test_df['H298_qm(kcal/mol)'])
validation_test_df['H298_cnn_qm_diff(kcal/mol)'] = pd.Series(heuristic_qm_diff, index=validation_test_df.index)
display(validation_test_df.head())
print "Validation test dataframe has {0} tricyclics.".format(len(spec_labels))
validation_test_df['H298_cnn_qm_diff(kcal/mol)'].describe()

categorize error sources


In [ ]:
diff20_df = validation_test_df[(validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] > 15) 
                   & (validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] <= 500)]
len(diff20_df)

In [ ]:
print len(diff20_df)
for smiles in diff20_df.index:
    print "***********cnn = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_cnn(kcal/mol)'])
    
    print "***********qm = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_qm(kcal/mol)'])
    
    spe = spec_dict[smiles]
    display(spe)

Parity Plot: heuristic vs. qm


In [ ]:
p = figure(plot_width=500, plot_height=400)

# plot_df = validation_test_df[validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] < 10]
plot_df = validation_test_df

# add a square renderer with a size, color, and alpha
p.circle(plot_df['H298_cnn(kcal/mol)'], plot_df['H298_qm(kcal/mol)'], 
         size=5, color="green", alpha=0.5)

x = np.array([-50, 200])
y = x
p.line(x=x, y=y, line_width=2, color='#636363')
p.line(x=x, y=y+10, line_width=2,line_dash="dashed", color='#bdbdbd')
p.line(x=x, y=y-10, line_width=2, line_dash="dashed", color='#bdbdbd')

p.xaxis.axis_label = "H298 CNN (kcal/mol)"
p.yaxis.axis_label = "H298 Quantum (kcal/mol)"
p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"
p.xaxis.axis_label_text_font_size = "16pt"
p.yaxis.axis_label_text_font_size = "16pt"
p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"

show(p)

In [ ]:
len(plot_df.index)

Histogram of abs(heuristic-qm)


In [ ]:
from bokeh.models import Range1d

hist = Histogram(validation_test_df,
                 values='Cp298_heuristic_qm_diff(cal/mol/K)', xlabel='Cp Prediction Error (cal/mol/K)',
                 ylabel='Number of Testing Molecules',
                bins=50,\
                plot_width=500, plot_height=300)
# hist.y_range = Range1d(0, 1640)
hist.x_range = Range1d(0, 20)
show(hist)

In [ ]:
with open('validation_test_sdata134k_2903_pyPoly_dbPoly.csv', 'w') as fout:
    validation_test_df.to_csv(fout)

In [ ]: