This notebook runs a quick initial analysis of whether a molecule set can be simulated by a given SMIRNOFF-format force field. While some attempt has been made to improve the speed of this code, it is not particuarly high-performance in its current state, and may crash the notebook you try to analyze more than 10,000 molecules.
First, we define global variables, and create a helper function to check for parameterization failures.
It is not important to understand or modify the following two notebook cells.
In [1]:
from openforcefield.topology import Molecule, Topology
from openforcefield.typing.engines.smirnoff import (ForceField,
UnassignedValenceParameterException, BondHandler, AngleHandler,
ProperTorsionHandler, ImproperTorsionHandler,
vdWHandler)
from simtk import unit
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import display
from copy import deepcopy
import time
import os
# Define "super generics", which are parameters that will match
# each instance of a valence type. By adding these to a ForceField
# object, we ensure that a given ParameterHandler will not encounter
# a parameterization failure
super_generics = {'Bonds':
BondHandler.BondType(smirks='[*:1]~[*:2]',
k=0*unit.kilocalorie/unit.mole/unit.angstrom**2,
length=0*unit.angstrom
),
'Angles':
AngleHandler.AngleType(smirks='[*:1]~[*:2]~[*:3]',
angle=0*unit.degree,
k=0*unit.kilocalorie/unit.mole/unit.degree**2
),
'ProperTorsions':
ProperTorsionHandler.ProperTorsionType(smirks='[*:1]~[*:2]~[*:3]~[*:4]',
phase1=0*unit.degree,
periodicity1=0,
k1=0*unit.kilocalorie/unit.mole,
idivf1=1
),
'ImproperTorsions':
ImproperTorsionHandler.ImproperTorsionType(smirks='[*:1]~[*:2](~[*:3])~[*:4]',
phase1=0*unit.degree,
periodicity1=0,
k1=0*unit.kilocalorie/unit.mole,
idivf1=1
),
'vdW':
vdWHandler.vdWType(smirks='[*:1]',
rmin_half=0*unit.angstrom,
epsilon = 0*unit.kilocalorie/unit.mole
),
}
In [2]:
def report_missing_parameters(molecule, forcefield):
"""
Analyze a molecule using a provided ForceField, generating a report of any
chemical groups in the molecule that are lacking parameters.
Parameters
----------
molecule : an openforcefield.topology.FrozenMolecule
The molecule to analyze
forcefield : an openforcefield.typing.engine.smirnoff.ForceField
The ForceField object to use
Returns
-------
missing_parameters : dict[tagname: list[dict[tagged_smiles:string, image:PIL.Image, atom indices:list[int]]]]
A hierarchical dictionary, with first level keys indicating ForceField tag
names (eg. "Bonds"), and first-level values which are lists of dictionaries.
Each dictionary in this list reflects one missing parameter, and contains the
following key:value pairs :
* "image": PIL.Image
* shows a 2D drawing, highlighting the feature that could not be parametrized
* "tagged_smiles": string
* SMILES of the whole molecule, tagging the atom indices which could not be
parametrized
* "atom_indices": tuple(int)
* The indices of atoms which could not be parametrized
"""
highlight_color = (0.75, 0.75, 0.75)
# Make deepcopies of both inputs, since we may modify them in this function
forcefield = deepcopy(forcefield)
molecule = deepcopy(molecule)
# Set partial charges to placeholder values so that we can skip AM1-BCC
# during parameterization
molecule.partial_charges = (np.zeros(molecule.n_atoms) + 0.1) * unit.elementary_charge
# Prepare dictionary to catch parameterization failure info
success = False
missing_params = {}
while not success:
# Try to parameterize the system, catching the exception if there is one.
try:
forcefield.create_openmm_system(molecule.to_topology(),
charge_from_molecules=[molecule],
allow_nonintegral_charges=True)
success = True
except UnassignedValenceParameterException as e:
success = False
# Ensure that there is a list initialized for missing parameters
# under this tagname
handler_tagname = e.handler_class._TAGNAME
if handler_tagname not in missing_params:
missing_params[handler_tagname] = []
# Create a shortcut to the topology atom tuples attached to
# the parametrization error
top_atom_tuples = e.unassigned_topology_atom_tuples
# Make a summary of the missing parameters from this attempt and add it to
# the missing_params dict
rdmol = molecule.to_rdkit()
for top_atom_tuple in top_atom_tuples:
orig_atom_indices = [i.topology_atom_index for i in top_atom_tuple]
# Make a copy of the input RDMol so that we don't modify the original
this_rdmol = deepcopy(rdmol)
# Attach tags to relevant atoms so that a tagged SMILES can be written
orig_rdatoms = []
for tag_idx, atom_idx in enumerate(orig_atom_indices):
rdatom = this_rdmol.GetAtomWithIdx(atom_idx)
rdatom.SetAtomMapNum(tag_idx + 1)
orig_rdatoms.append(rdatom)
tagged_smiles = Chem.MolToSmiles(this_rdmol)
# Make tagged hydrogens into deuteriums so that RemoveHs doesn't get rid of them
for rdatom in orig_rdatoms:
if rdatom.GetAtomicNum() == 1:
rdatom.SetIsotope(2)
# Remove hydrogens, since they clutter up the 2D drawing
# (tagged Hs are not removed, since they were converted to deuterium)
h_less_rdmol = Chem.RemoveHs(this_rdmol)
# Generate 2D coords, since drawing from 3D can look really weird
Draw.rdDepictor.Compute2DCoords(h_less_rdmol)
# Search over the molecule to find the indices of the tagged atoms
# after hydrogen removal
h_less_atom_indices = [None for i in orig_atom_indices]
for rdatom in h_less_rdmol.GetAtoms():
# Convert deuteriums back into hydrogens
if rdatom.GetAtomicNum() == 1:
rdatom.SetIsotope(1)
atom_map_num = rdatom.GetAtomMapNum()
if atom_map_num == 0:
continue
h_less_atom_indices[atom_map_num-1] = rdatom.GetIdx()
# Once the new atom indices are found, use them to find the H-less
# bond indices
h_less_rdbonds = []
for i in range(len(h_less_atom_indices)-1):
rdbond = h_less_rdmol.GetBondBetweenAtoms(
h_less_atom_indices[i],
h_less_atom_indices[i+1])
h_less_rdbonds.append(rdbond)
h_less_bond_indices = [bd.GetIdx() for bd in h_less_rdbonds]
# Create a 2D drawing of the molecule, highlighting the
# parameterization failure
highlight_atom_colors = {idx:highlight_color for idx in h_less_atom_indices}
highlight_bond_colors = {idx:highlight_color for idx in h_less_bond_indices}
image = Draw.MolsToGridImage([h_less_rdmol],
highlightAtomLists=[h_less_atom_indices],
highlightBondLists=[h_less_bond_indices],
molsPerRow=1,
highlightAtomColors=[highlight_atom_colors],
highlightBondColors=[highlight_bond_colors],
subImgSize=(600,600)
)
# Structure and append the relevant info to the missing_params dictionary
param_description = {'atom_indices': orig_atom_indices,
'image': image,
'tagged_smiles': tagged_smiles
}
missing_params[handler_tagname].append(param_description)
# Add a "super generic" parameter to the top of this handler's ParameterList,
# which will make it always find parameters for each term. This will prevent the same
# parameterization exception from being raised in the next attempt.
param_list = forcefield.get_parameter_handler(handler_tagname).parameters
param_list.insert(0, super_generics[handler_tagname])
return missing_params
There are several ways to load molecules into the Open Force Field Toolkit. Below, we show how to load molecule databases from .smi
, .sdf
, and .mol2
format files. It's important that these databases contain a complete representation of each molecule, including formal charge, stereochemistry, and protonation state.
Note that loading .mol2
files is currently only supported using the OpenEye toolkit.
In [3]:
molecules = Molecule.from_file('example_molecules.smi', allow_undefined_stereo=True)
# We also provide a SMILES dataset of ~1000 problematic molecules
#molecules = Molecule.from_file('problem_smiles.smi', allow_undefined_stereo=True)
print(f'Loaded {len(molecules)} molecules')
In [ ]:
molecules = Molecule.from_file('example_molecules.sdf', allow_undefined_stereo=True)
print(f'Loaded {len(molecules)} molecules')
In [ ]:
try:
molecules = Molecule.from_file('example_molecules.mol2', allow_undefined_stereo=True)
print(f'Loaded {len(molecules)} molecules')
except NotImplementedError as e:
print(e)
print("Loading mol2 files requires the OpenEye Toolkits")
Here, we run the above-defined function on all molecules in the dataset. The parameterization failures will be shown in the notebook as the data set is processed.
Note: If the dataset is large this will take a very long time, and displaying all parameterization failures may run into memory/output limits in the notebook. If you're analyzing more than ~1,000 molecules, use Option 2
below.
In [4]:
start_time = time.time()
forcefield = ForceField('openff-1.0.0.offxml')
results = {}
for mol_idx, molecule in enumerate(molecules):
# Prepare a title for this molecule
if molecule.name == '':
mol_name = f'molecule_{mol_idx+1}'
else:
mol_name = molecule.name
print('\n'*3)
print('=' * 60)
print('=' * 60)
print(f'Processing "{mol_name}" with smiles {molecule.to_smiles()}')
print('=' * 60)
print('=' * 60)
# Analyze missing parameters
time_i = time.time()
missing_params = report_missing_parameters(molecule, forcefield)
print(f'Molecule analysis took {time.time()-time_i} seconds')
results[mol_name] = missing_params
for tagname, missing_tag_params in missing_params.items():
print('~'*60)
print(tagname)
print('~'*60)
for missing_param in missing_tag_params:
print(missing_param['tagged_smiles'])
display(missing_param['image'])
print(f'Processing {len(molecules)} molecules took {time.time()-start_time} seconds')
This method is faster than Option 1
, but will not display unparameterizable chemistry in the notebook. 2D depictions and tagged SMILES of unparameterizable chemistry will be written to file in the final cell of the notebook.
This will by default use 75% of the system's CPUs. If this is not desired, manually set num_threads
below.
In [ ]:
import multiprocessing
num_threads = max(1, int(multiprocessing.cpu_count() * 0.75))
def check_molecule(inputs):
mol_idx = inputs[0]
molecule = inputs[1]
forcefield = ForceField('openff-1.0.0-RC1.offxml')
# Prepare a title for this molecule
if molecule.name == '':
mol_name = f'molecule_{mol_idx+1}'
else:
mol_name = molecule.name
print('\n'*3)
print('=' * 60)
print('=' * 60)
print(f'Processing "{mol_name}" with smiles {molecule.to_smiles()}')
print('=' * 60)
print('=' * 60)
# Analyze missing parameters
time_i = time.time()
missing_params = report_missing_parameters(molecule, forcefield)
print(f'Molecule analysis took {time.time()-time_i} seconds')
return (mol_name, missing_params)
start_time = time.time()
p = multiprocessing.Pool(num_threads)
job_args = [(idx, molecule) for idx, molecule in enumerate(molecules)]
result_list = p.map(check_molecule, job_args)
results = dict(result_list)
print(f'Processing {len(molecules)} molecules took {time.time()-start_time} seconds')
Since the results from above are only held in memory during this Python session, it can be helpful to save this analysis to disk.
The following code will write files containing a 2D image and a tagged SMILES for each unparameterizable motif found above. Results for each molecule are saved to different folders. If molecule names were not provided, these folders will be named molecule_N/
, where N is the order in which the molecules were read. A single molecule may have multiple parameterization failures, and each one is written both as an image (eg. molecule_2/Bonds_1-2.png
) and a tagged SMILES (eg. molecule_2/Bonds_1-2.smi
).
Note that this does not clear previous outputs, so it is possible that running this script on several datasets will overwrite or mix data with previous runs. Run `rm -r molecule*` between runs to prevent potential issues._
When a molecule contains even one instance of unparameterizable chemistry, it can result in a large number of ProperTorsions
failures being reported. To help reduce redundancy in these cases, the code below groups ProperTorsions
output such that the atom indices defining the central bond are written first in the file name, followed by the atom indices of the whole torsion. This way, lexical (alphabetical) displays of file names make it easier to identify possibly-redundant outputs.
Concretely, the second molecule in the SMILES set is an example of this issue, as 1-indexed atom in that molecule is an unparameterizable Hg
. This leads to the following output:
molecule_2/ProperTorsions__0-1__2-1-0-23.png
molecule_2/ProperTorsions__0-1__2-1-0-24.png
molecule_2/ProperTorsions__0-1__2-1-0-25.png
molecule_2/ProperTorsions__1-2__0-1-2-3.png
molecule_2/ProperTorsions__1-2__0-1-2-4.png
molecule_2/ProperTorsions__1-2__0-1-2-5.png
Here, listing the central atoms early in the filename makes it easy to see that the 1-indexed atom is likely to be the cause of the error.
When reporting parameterization failures, note that the tagged SMILES contains the full identity of the molecule, and that it is not trivial to extract only the motif which caused the parameterization failure. To report a parameterization failure without revealing the identity of the entire molecule, consider cropping the molecule image to only show the tagged atoms and their first or second neighbors, and uploading it to the openforcefield toolkit issue tracker
In [5]:
# Iterate over all molecules, and create a folder for each
# one that experienced a parameterization failure
for mol_name, result_dict in results.items():
if result_dict == {}:
continue
if not os.path.exists(mol_name):
os.mkdir(mol_name)
# Write each parameterization failure to file
for tagname, missing_parm_dicts in result_dict.items():
elements = []
for missing_parm_dict in missing_parm_dicts:
inds = missing_parm_dict['atom_indices']
inds_str = '-'.join([str(i) for i in inds])
if tagname == 'ProperTorsions':
cent_atom_1 = min(inds[1], inds[2])
cent_atom_2 = max(inds[1], inds[2])
file_prefix = f'{tagname}__{cent_atom_1}-{cent_atom_2}__{inds_str}'
else:
file_prefix = f'{tagname}_{inds_str}'
png_file = os.path.join(mol_name, file_prefix+'.png')
smi_file = os.path.join(mol_name, file_prefix+'.smi')
missing_parm_dict['image'].save(png_file)
with open(smi_file, 'w') as of:
of.write(missing_parm_dict['tagged_smiles'])
In [6]:
! ls molecule_*/*