Thermochemistry Validation Test

Han, Kehang (hkh12@mit.edu)

This notebook is designed to use a big set of tricyclics for testing the performance of new polycyclics thermo estimator. Currently the dataset contains 2903 tricyclics that passed isomorphic check.

Set up


In [ ]:
from rmgpy.data.rmg import RMGDatabase
from rmgpy import settings
from rmgpy.species import Species
from rmgpy.molecule import Group
from rmgpy.rmg.main import RMG
from IPython.display import display
import numpy as np
import os
import pandas as pd
from pymongo import MongoClient
import logging
logging.disable(logging.CRITICAL)

In [ ]:
from bokeh.charts import Histogram
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

In [ ]:
host = 'mongodb://user:user@rmg.mit.edu/admin'
port = 27018
client = MongoClient(host, port)

In [ ]:
db = getattr(client, 'sdata134k')
db.collection_names()

In [ ]:
def get_data(db, collection_name):
    collection = getattr(db, collection_name)
    db_cursor = collection.find()

    # collect data
    print('reading data...')
    db_mols = []
    for db_mol in db_cursor:
        db_mols.append(db_mol)
    print('done')

    return db_mols

In [ ]:
database = RMGDatabase()

In [ ]:
database.load(
    settings['database.directory'],
    thermoLibraries=[],
    kineticsFamilies='none',
    kineticsDepositories='none',
    reactionLibraries = []
)

thermoDatabase = database.thermo

In [ ]:
# fetch testing dataset
collection_name = 'large_linear_polycyclic_table'
db_mols = get_data(db, collection_name)
db_mols.extend(get_data(db, 'large_fused_polycyclic_table'))
print len(db_mols)

Validation Test

Collect data from heuristic algorithm and qm library


In [ ]:
filterList = [
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {3,S} {5,S}
    2  R u0 p0 c0 {1,S} {3,S} {6,S}
    3  R u0 p0 c0 {1,S} {2,S} {4,S}
    4  R u0 p0 c0 {3,S} {5,S} {6,S}
    5  R u0 p0 c0 {1,S} {4,S} {6,S}
    6  R u0 p0 c0 {2,S} {4,S} {5,S}
    """),
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {6,S} {7,S}
    2  R u0 p0 c0 {1,S} {3,S} {4,S}
    3  R u0 p0 c0 {2,S} {4,S} {8,S}
    4  R u0 p0 c0 {2,S} {3,S} {5,S}
    5  R u0 p0 c0 {4,S} {6,S} {8,S}
    6  R u0 p0 c0 {1,S} {5,S} {7,S}
    7  R u0 p0 c0 {1,S} {6,S} {8,S}
    8  R u0 p0 c0 {3,S} {5,S} {7,S}
    """),
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {4,S} 
    2  R u0 p0 c0 {1,S} {3,S} {9,S} 
    3  R u0 p0 c0 {2,S} {4,S}
    4  R u0 p0 c0 {1,S} {3,S} {5,S}
    5  R u0 p0 c0 {4,S} {6,S} {8,S}
    6  R u0 p0 c0 {5,S} {7,S}
    7  R u0 p0 c0 {6,S} {8,S} {9,S} 
    8  R u0 p0 c0 {5,S} {7,S}
    9  R u0 p0 c0 {2,S} {7,S}
    """),
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {9,S}
    2  R u0 p0 c0 {1,S} {3,S} {9,S}
    3  R u0 p0 c0 {2,S} {4,[S,D]}
    4  R u0 p0 c0 {3,[S,D]} {5,S}
    5  R u0 p0 c0 {4,S} {6,S} {8,S}
    6  R u0 p0 c0 {5,S} {7,S}
    7  R u0 p0 c0 {6,S} {8,S} {9,S}
    8  R u0 p0 c0 {5,S} {7,S}
    9  R u0 p0 c0 {1,S} {2,S} {7,S}
    """),
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {9,S} 
    2  R u0 p0 c0 {1,S} {3,S} {9,S} 
    3  R u0 p0 c0 {2,S} {4,S}
    4  R u0 p0 c0 {3,S} {5,S} {7,S}
    5  R u0 p0 c0 {4,S} {6,S} 
    6  R u0 p0 c0 {5,S} {7,S} {8,S}
    7  R u0 p0 c0 {4,S} {6,S}
    8  R u0 p0 c0 {6,S} {9,S}
    9  R u0 p0 c0 {1,S} {2,S} {8,S} 
    """),
    Group().fromAdjacencyList("""1  R u0 p0 c0 {2,S} {5,S}
    2  R u0 p0 c0 {1,S} {3,D}
    3  R u0 p0 c0 {2,D} {4,S}
    4  R u0 p0 c0 {3,S} {5,S}
    5  R u0 p0 c0 {1,S} {4,S} {6,S} {9,S}
    6  R u0 p0 c0 {5,S} {7,S}
    7  R u0 p0 c0 {6,S} {8,D}
    8  R u0 p0 c0 {7,D} {9,S}
    9  R u0 p0 c0 {5,S} {8,S}
    """),
]

In [ ]:
test_size = 0
R = 1.987 # unit: cal/mol/K
validation_test_dict = {} # key: spec.label, value: (thermo_heuristic, thermo_qm)

spec_labels = []
spec_dict = {}
H298s_qm = []
Cp298s_qm = []
H298s_gav = []
Cp298s_gav = []
for db_mol in db_mols:
    smiles_in = str(db_mol["SMILES_input"])
    spec_in = Species().fromSMILES(smiles_in)
    
    # remove unwanted species
    for grp in filterList:
        if spec_in.molecule[0].isSubgraphIsomorphic(grp):
            break
    else:
        spec_in.generateResonanceIsomers()
        spec_labels.append(smiles_in)

        # qm: just free energy but not free energy of formation
        G298_qm = float(db_mol["G298"])*627.51 # unit: kcal/mol
        H298_qm = float(db_mol["Hf298(kcal/mol)"]) # unit: kcal/mol
        Cv298_qm = float(db_mol["Cv298"]) # unit: cal/mol/K
        Cp298_qm = Cv298_qm + R # unit: cal/mol/K

        H298s_qm.append(H298_qm)
        Cp298s_qm.append(Cp298_qm)

        # gav
        thermo_gav = thermoDatabase.getThermoDataFromGroups(spec_in)
        H298_gav = thermo_gav.H298.value_si/4184.0 # unit: kcal/mol
        Cp298_gav = thermo_gav.getHeatCapacity(298)/4.184 # unit: cal/mol

        H298s_gav.append(H298_gav)
        Cp298s_gav.append(Cp298_gav)

        spec_dict[smiles_in] = spec_in

Create pandas dataframe for easy data validation


In [ ]:
# create pandas dataframe
validation_test_df = pd.DataFrame(index=spec_labels)

validation_test_df['H298_heuristic(kcal/mol/K)'] = pd.Series(H298s_gav, index=validation_test_df.index)
validation_test_df['H298_qm(kcal/mol/K)'] = pd.Series(H298s_qm, index=validation_test_df.index)

heuristic_qm_diff = abs(validation_test_df['H298_heuristic(kcal/mol/K)']-validation_test_df['H298_qm(kcal/mol/K)'])
validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'] = pd.Series(heuristic_qm_diff, index=validation_test_df.index)
display(validation_test_df.head())
print "Validation test dataframe has {0} tricyclics.".format(len(spec_labels))
validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'].describe()

categorize error sources


In [ ]:
diff20_df = validation_test_df[(validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'] > 20) 
                   & (validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'] <= 80)]
len(diff20_df)

In [ ]:
print len(diff20_df)
for smiles in diff20_df.index:
    print "***********heur = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_heuristic(kcal/mol/K)'])
    
    print "***********qm = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_qm(kcal/mol/K)'])
    
    spe = spec_dict[smiles]
    display(spe)

Parity Plot: heuristic vs. qm


In [ ]:
p = figure(plot_width=500, plot_height=400)

# plot_df = validation_test_df[validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] < 10]
plot_df = validation_test_df

# add a square renderer with a size, color, and alpha
p.circle(plot_df['H298_heuristic(kcal/mol/K)'], plot_df['H298_qm(kcal/mol/K)'], 
         size=5, color="green", alpha=0.5)

x = np.array([-50, 200])
y = x
p.line(x=x, y=y, line_width=2, color='#636363')
p.line(x=x, y=y+10, line_width=2,line_dash="dashed", color='#bdbdbd')
p.line(x=x, y=y-10, line_width=2, line_dash="dashed", color='#bdbdbd')

p.xaxis.axis_label = "H298 GAV (kcal/mol)"
p.yaxis.axis_label = "H298 Quantum (kcal/mol)"
p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"
p.xaxis.axis_label_text_font_size = "16pt"
p.yaxis.axis_label_text_font_size = "16pt"
p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"

show(p)

In [ ]:
len(plot_df.index)

Histogram of abs(heuristic-qm)


In [ ]:
from bokeh.models import Range1d

hist = Histogram(validation_test_df,
                 values='Cp298_heuristic_qm_diff(cal/mol/K)', xlabel='Cp Prediction Error (cal/mol/K)',
                 ylabel='Number of Testing Molecules',
                bins=50,\
                plot_width=500, plot_height=300)
# hist.y_range = Range1d(0, 1640)
hist.x_range = Range1d(0, 20)
show(hist)

In [ ]:
with open('validation_test_sdata134k_2903_pyPoly_dbPoly.csv', 'w') as fout:
    validation_test_df.to_csv(fout)

In [ ]: