Han, Kehang (hkh12@mit.edu)
This notebook is designed to use a big set of tricyclics for testing the performance of new polycyclics thermo estimator. Currently the dataset contains 2903 tricyclics that passed isomorphic check.
In [ ]:
from rmgpy.data.rmg import RMGDatabase
from rmgpy import settings
from rmgpy.species import Species
from rmgpy.molecule import Group
from rmgpy.rmg.main import RMG
from IPython.display import display
import numpy as np
import os
import pandas as pd
from pymongo import MongoClient
import logging
logging.disable(logging.CRITICAL)
In [ ]:
from bokeh.charts import Histogram
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()
In [ ]:
host = 'mongodb://user:user@rmg.mit.edu/admin'
port = 27018
client = MongoClient(host, port)
In [ ]:
db = getattr(client, 'sdata134k')
db.collection_names()
In [ ]:
def get_data(db, collection_name):
collection = getattr(db, collection_name)
db_cursor = collection.find()
# collect data
print('reading data...')
db_mols = []
for db_mol in db_cursor:
db_mols.append(db_mol)
print('done')
return db_mols
In [ ]:
database = RMGDatabase()
In [ ]:
database.load(
settings['database.directory'],
thermoLibraries=[],
kineticsFamilies='none',
kineticsDepositories='none',
reactionLibraries = []
)
thermoDatabase = database.thermo
In [ ]:
# fetch testing dataset
collection_name = 'large_linear_polycyclic_table'
db_mols = get_data(db, collection_name)
db_mols.extend(get_data(db, 'large_fused_polycyclic_table'))
print len(db_mols)
In [ ]:
filterList = [
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {3,S} {5,S}
2 R u0 p0 c0 {1,S} {3,S} {6,S}
3 R u0 p0 c0 {1,S} {2,S} {4,S}
4 R u0 p0 c0 {3,S} {5,S} {6,S}
5 R u0 p0 c0 {1,S} {4,S} {6,S}
6 R u0 p0 c0 {2,S} {4,S} {5,S}
"""),
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {6,S} {7,S}
2 R u0 p0 c0 {1,S} {3,S} {4,S}
3 R u0 p0 c0 {2,S} {4,S} {8,S}
4 R u0 p0 c0 {2,S} {3,S} {5,S}
5 R u0 p0 c0 {4,S} {6,S} {8,S}
6 R u0 p0 c0 {1,S} {5,S} {7,S}
7 R u0 p0 c0 {1,S} {6,S} {8,S}
8 R u0 p0 c0 {3,S} {5,S} {7,S}
"""),
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {4,S}
2 R u0 p0 c0 {1,S} {3,S} {9,S}
3 R u0 p0 c0 {2,S} {4,S}
4 R u0 p0 c0 {1,S} {3,S} {5,S}
5 R u0 p0 c0 {4,S} {6,S} {8,S}
6 R u0 p0 c0 {5,S} {7,S}
7 R u0 p0 c0 {6,S} {8,S} {9,S}
8 R u0 p0 c0 {5,S} {7,S}
9 R u0 p0 c0 {2,S} {7,S}
"""),
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {9,S}
2 R u0 p0 c0 {1,S} {3,S} {9,S}
3 R u0 p0 c0 {2,S} {4,[S,D]}
4 R u0 p0 c0 {3,[S,D]} {5,S}
5 R u0 p0 c0 {4,S} {6,S} {8,S}
6 R u0 p0 c0 {5,S} {7,S}
7 R u0 p0 c0 {6,S} {8,S} {9,S}
8 R u0 p0 c0 {5,S} {7,S}
9 R u0 p0 c0 {1,S} {2,S} {7,S}
"""),
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {9,S}
2 R u0 p0 c0 {1,S} {3,S} {9,S}
3 R u0 p0 c0 {2,S} {4,S}
4 R u0 p0 c0 {3,S} {5,S} {7,S}
5 R u0 p0 c0 {4,S} {6,S}
6 R u0 p0 c0 {5,S} {7,S} {8,S}
7 R u0 p0 c0 {4,S} {6,S}
8 R u0 p0 c0 {6,S} {9,S}
9 R u0 p0 c0 {1,S} {2,S} {8,S}
"""),
Group().fromAdjacencyList("""1 R u0 p0 c0 {2,S} {5,S}
2 R u0 p0 c0 {1,S} {3,D}
3 R u0 p0 c0 {2,D} {4,S}
4 R u0 p0 c0 {3,S} {5,S}
5 R u0 p0 c0 {1,S} {4,S} {6,S} {9,S}
6 R u0 p0 c0 {5,S} {7,S}
7 R u0 p0 c0 {6,S} {8,D}
8 R u0 p0 c0 {7,D} {9,S}
9 R u0 p0 c0 {5,S} {8,S}
"""),
]
In [ ]:
test_size = 0
R = 1.987 # unit: cal/mol/K
validation_test_dict = {} # key: spec.label, value: (thermo_heuristic, thermo_qm)
spec_labels = []
spec_dict = {}
H298s_qm = []
Cp298s_qm = []
H298s_gav = []
Cp298s_gav = []
for db_mol in db_mols:
smiles_in = str(db_mol["SMILES_input"])
spec_in = Species().fromSMILES(smiles_in)
# remove unwanted species
for grp in filterList:
if spec_in.molecule[0].isSubgraphIsomorphic(grp):
break
else:
spec_in.generateResonanceIsomers()
spec_labels.append(smiles_in)
# qm: just free energy but not free energy of formation
G298_qm = float(db_mol["G298"])*627.51 # unit: kcal/mol
H298_qm = float(db_mol["Hf298(kcal/mol)"]) # unit: kcal/mol
Cv298_qm = float(db_mol["Cv298"]) # unit: cal/mol/K
Cp298_qm = Cv298_qm + R # unit: cal/mol/K
H298s_qm.append(H298_qm)
Cp298s_qm.append(Cp298_qm)
# gav
thermo_gav = thermoDatabase.getThermoDataFromGroups(spec_in)
H298_gav = thermo_gav.H298.value_si/4184.0 # unit: kcal/mol
Cp298_gav = thermo_gav.getHeatCapacity(298)/4.184 # unit: cal/mol
H298s_gav.append(H298_gav)
Cp298s_gav.append(Cp298_gav)
spec_dict[smiles_in] = spec_in
In [ ]:
# create pandas dataframe
validation_test_df = pd.DataFrame(index=spec_labels)
validation_test_df['H298_heuristic(kcal/mol/K)'] = pd.Series(H298s_gav, index=validation_test_df.index)
validation_test_df['H298_qm(kcal/mol/K)'] = pd.Series(H298s_qm, index=validation_test_df.index)
heuristic_qm_diff = abs(validation_test_df['H298_heuristic(kcal/mol/K)']-validation_test_df['H298_qm(kcal/mol/K)'])
validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'] = pd.Series(heuristic_qm_diff, index=validation_test_df.index)
display(validation_test_df.head())
print "Validation test dataframe has {0} tricyclics.".format(len(spec_labels))
validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'].describe()
In [ ]:
diff20_df = validation_test_df[(validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'] > 20)
& (validation_test_df['H298_heuristic_qm_diff(kcal/mol/K)'] <= 80)]
len(diff20_df)
In [ ]:
print len(diff20_df)
for smiles in diff20_df.index:
print "***********heur = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_heuristic(kcal/mol/K)'])
print "***********qm = {0}************".format(diff20_df[diff20_df.index==smiles]['H298_qm(kcal/mol/K)'])
spe = spec_dict[smiles]
display(spe)
In [ ]:
p = figure(plot_width=500, plot_height=400)
# plot_df = validation_test_df[validation_test_df['H298_heuristic_qm_diff(kcal/mol)'] < 10]
plot_df = validation_test_df
# add a square renderer with a size, color, and alpha
p.circle(plot_df['H298_heuristic(kcal/mol/K)'], plot_df['H298_qm(kcal/mol/K)'],
size=5, color="green", alpha=0.5)
x = np.array([-50, 200])
y = x
p.line(x=x, y=y, line_width=2, color='#636363')
p.line(x=x, y=y+10, line_width=2,line_dash="dashed", color='#bdbdbd')
p.line(x=x, y=y-10, line_width=2, line_dash="dashed", color='#bdbdbd')
p.xaxis.axis_label = "H298 GAV (kcal/mol)"
p.yaxis.axis_label = "H298 Quantum (kcal/mol)"
p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"
p.xaxis.axis_label_text_font_size = "16pt"
p.yaxis.axis_label_text_font_size = "16pt"
p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"
show(p)
In [ ]:
len(plot_df.index)
In [ ]:
from bokeh.models import Range1d
hist = Histogram(validation_test_df,
values='Cp298_heuristic_qm_diff(cal/mol/K)', xlabel='Cp Prediction Error (cal/mol/K)',
ylabel='Number of Testing Molecules',
bins=50,\
plot_width=500, plot_height=300)
# hist.y_range = Range1d(0, 1640)
hist.x_range = Range1d(0, 20)
show(hist)
In [ ]:
with open('validation_test_sdata134k_2903_pyPoly_dbPoly.csv', 'w') as fout:
validation_test_df.to_csv(fout)
In [ ]: