Check two SMIRNOFFs have the same typing

This notebook was created to check that a change to a smirnoff forcefield doesn't change the way it types molecules. This concern came from switching the decorator 'R' to 'x' so that the SMIRKS in the SMIRNOFF format would be compatible with OEtoolkits and RDKit. See smirnoff99Frosst issue#54

This notebook will:

  1. Convert a specified smirks-frcmod file to SMIRNOFF FFXML (this is the test force field)
  2. Load the current release of smirnoff99Frosst (this is the reference force field)
  3. Generate (or take in) a set of molecules in OpenEye oemol format. Label these molecules with both forcefields.
  4. Identify molecules where parameter assignment doesn't agree
  5. Visualize molecules by force type

Authors:

  • Caitlin C. Bannan (UCI)
  • functions copied from parameter_usage.ipynb written by David L. Mobley (UCI)

In [1]:
# Imports
from __future__ import print_function
from convert_frcmod import *
import openeye.oechem as oechem
import openeye.oeiupac as oeiupac
import openeye.oeomega as oeomega
import openeye.oedepict as oedepict
from IPython.display import display
from openforcefield.typing.engines.smirnoff.forcefield import *
from openforcefield.typing.engines.smirnoff.forcefield_utils import get_molecule_parameterIDs
from openforcefield.utils import *
% matplotlib inline
import matplotlib
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import time
import IPython
import pickle
import glob

Relevant methods


In [2]:
def depictAtomByIdx(mol_copy, atomIdxList, supH = True, width=900, height=500):
    mol = oechem.OEMol(mol_copy)
    OEGenerate2DCoordinates(mol)
    atomBondSet = oechem.OEAtomBondSet()
    for atom in mol.GetAtoms():
        if atom.GetIdx() in atomIdxList:
            atomBondSet.AddAtom( atom)
            for bond in atom.GetBonds():
                nbrAtom = bond.GetNbr(atom)
                nbrIdx = nbrAtom.GetIdx()
                if (nbrIdx in atomIdxList) and nbrIdx>atom.GetIdx():
                    atomBondSet.AddBond( bond)
    from IPython.display import Image
    dopt = oedepict.OEPrepareDepictionOptions()
    dopt.SetDepictOrientation( oedepict.OEDepictOrientation_Horizontal)
    dopt.SetSuppressHydrogens(supH)
    oedepict.OEPrepareDepiction(mol, dopt)
    opts = oedepict.OE2DMolDisplayOptions(width, height, oedepict.OEScale_AutoScale)
    disp = oedepict.OE2DMolDisplay(mol, opts)
    aroStyle = oedepict.OEHighlightStyle_Color
    aroColor = oechem.OEColor(oechem.OEGrey)
    oedepict.OEAddHighlighting(disp, aroColor, aroStyle, 
                               oechem.OEIsAromaticAtom(), oechem.OEIsAromaticBond() )
    hstyle = oedepict.OEHighlightStyle_BallAndStick
    hcolor = oechem.OEColor(oechem.OELightGreen)
    oedepict.OEAddHighlighting(disp, hcolor, hstyle, atomBondSet)
    #ofs = oechem.oeosstream()
    img = oedepict.OEImage(width, height)
    oedepict.OERenderMolecule(img, disp)
    #oedepict.OERenderMolecule(ofs, 'png', disp)
    #ofs.flush()
    #return Image(data = "".join(ofs.str()))
    return Image(oedepict.OEWriteImageToString("png",img))

In [3]:
def getMolParamIDToAtomIndex( oemol, ff):
    """Take an OEMol and a SMIRNOFF forcefield object and return a dictionary,
    keyed by parameter ID, where each entry is a tuple of
    ( smirks, [[atom1, ... atomN], [atom1, ... atomN]) giving the SMIRKS
    corresponding to that parameter ID and a list of the atom groups in that
    molecule that parameter is applied to.

    Parameters
    ----------
    oemol : OEMol
        OpenEye OEMol with the molecule to investigate.
    ff : ForceField
        SMIRNOFF ForceField object (obtained from an ffxml via ForceField(ffxml)) containing FF of interest.

    Returns
    -------
    param_usage : dictionary
        Dictionary, keyed by parameter ID, where each entry is a tuple of
        ( smirks, [[atom1, ... atomN], [atom1, ... atomN]) giving the SMIRKS
        corresponding to that parameter ID and a list of the atom groups in
        that molecule that parameter is applied to.

    """
    labels = ff.labelMolecules([oemol])
    param_usage = {}
    for mol_entry in range(len(labels)):
        for force in labels[mol_entry].keys():
            for (atom_indices, pid, smirks) in labels[mol_entry][force]:
                if not pid in param_usage:
                    param_usage[pid] = (smirks, [atom_indices])
                else:
                    param_usage[pid][1].append( atom_indices )

    return param_usage

In [4]:
def labels_to_pidDict(labels):
    """
    This method takes a set of SMIRNOFF forcefield labels and returns
    a dictionary with information for each molecule at each force type
    in the form:
        { force_type: {mol_index: {(indice tuple): pid, ...}, ... } }
    """
    force_type_dict = dict()
    for idx, mol_dict in enumerate(labels):
        for force_type, label_set in mol_dict.items():
            if not force_type in force_type_dict:
                force_type_dict[force_type] = dict()
            force_type_dict[force_type][idx] = dict()
            
            for (indices, pid, smirks) in label_set:
                force_type_dict[force_type][idx][tuple(indices)] = {'pid': pid, 'smirks':smirks}
    return force_type_dict

1. Convert specified SMIRKS frcmod file to SMIRNOFF FFXML


In [5]:
# Input and output info
#infile = 'example.frcmod' # smirnoffish frcmod file to convert
infile = 'smirnoffishFrcmod.parm99Frosst.txt' # smirnoffish frcmod file to convert
ffxmlFile = 'smirnoff99FrosstFrcmod.offxml'
template = 'template.offxml' # Template FFXML file without parameters (but with remainder of contents)

In [6]:
# Convert
# Already converted
convert_frcmod_to_ffxml( infile, template, ffxmlFile)

In [7]:
# Load SMIRNOFF FFXML
test_ff = ForceField(ffxmlFile) # We will use this below to access details of parameters

2. Load smirnoff99Frosst from current release

This is currently linking to openforcefield/data/test_forcefields/smirnoff99Frosst.offxml which is the current release


In [8]:
ref_ff = ForceField('test_forcefields/smirnoff99Frosst.offxml')

3. Generate or take in a set of molecules in OpenEye OEMol format

Here we will generate a list of molecules. We will label all molecules given. Here we don't care rather the assigned parameters are generic or not just that the typing doesn't change between the two forcefields.

Currently this section expects a relative or absolute path to a single file. The utils.read_molecules function will access /openforcefield/data/molecules/[your path] if there is no file at the relative path.


In [9]:
molecule_file = "DrugBank_tripos.mol2"
molecules = utils.read_molecules(molecule_file)


Loading molecules from '/Users/bannanc/anaconda3/lib/python3.5/site-packages/openforcefield/data/molecules/DrugBank_tripos.mol2'...
5928 molecules read
1.180 s elapsed

In [10]:
init = time.time()
test_labels = test_ff.labelMolecules(molecules)
ref_labels = ref_ff.labelMolecules(molecules)
t = (time.time() - init) / 60.0
print("Typed %i molecules with test and reference force fields in %.2f minutes" % (len(molecules), t))


Typed 5928 molecules with test and reference force fields in 22.57 minutes

4. Identify any molecules not assigned the same parameters by both force fields


In [11]:
# Make dictionary by molecule and tuple indices
init = time.time()
test_dict = labels_to_pidDict(test_labels)
ref_dict = labels_to_pidDict(ref_labels)
t = (time.time() - init) / 60.0
print("created indices tuple to pid dictionaries in %.2f minutes" % t)


created indices tuple to pid dictionaries in 0.06 minutes

In [12]:
# Make a dictionary to store mismatches:
mismatch = dict()
# This will have embedded dictionaries with this form:
# force_type: {mol_idx:{(index tuple): {test_pid, test_smirks, ref_pid, ref_smirks}}}
mismatch_count = dict()

# loop through force types
for force_type, test_mol_dict in test_dict.items():
    if force_type not in mismatch:
        mismatch[force_type] = dict()
    if force_type not in mismatch_count:
        mismatch_count[force_type] = 0
    
    # loop through molecules in each force type
    for mol_idx, test_tuple_dict in test_mol_dict.items():
        if not mol_idx in mismatch[force_type]:
            mismatch[force_type][mol_idx] = dict()
        
        # loop through all atom indice tuples in this molecule
        for indice_tuple, test_info in test_tuple_dict.items():
            
            # compare pid assignment
            test_pid = test_info['pid']
            ref_pid = ref_dict[force_type][mol_idx][indice_tuple]['pid']
            # if they don't match store info in mismatch dictionary and update count
            if test_pid != ref_pid:
                test_smirks = test_info['smirks']
                ref_smirks = ref_dict[force_type][mol_idx][indice_tuple]['smirks']
                mismatch[force_type][mol_idx][indice_tuple] = {'test_pid': test_pid, 'test_smirks': test_smirks,
                                                              'ref_pid': ref_pid, 'ref_smirks': ref_smirks}
                mismatch_count[force_type] +=1

print("%-35s %s" % ("Force Type", "Number mismatches"))
print("-"*55)
for force_type, count in mismatch_count.items():
    print("%-35s %i" % (force_type, count))


Force Type                          Number mismatches
-------------------------------------------------------
NonbondedGenerator                  0
HarmonicBondGenerator               0
PeriodicTorsionGenerator            0
HarmonicAngleGenerator              0

5. Visualize mismatches by force type

Chose a force type and then the molecules for each will be displayed


In [13]:
ForceType = "PeriodicTorsionGenerator"

for mol_idx, tuple_dict in mismatch[ForceType].items():
    # only visualize molecules with mismatch indices
    keys = [k for k in tuple_dict.keys()]
    if len(keys) == 0:
        continue
    
    mol = OEMol(molecules[mol_idx])
    print("Looking at molecule %i" % mol_idx)    
    
    for indice_tuple, pid_info in tuple_dict.items():
        test_pid = pid_info['test_pid']
        test_smirks = pid_info['test_smirks']
        
        ref_pid = pid_info['ref_pid']
        ref_smirks = pid_info['ref_smirks']
        
        print("%-10s %-40s %-40s" % ('', 'test force field', 'reference forcefield'))
        print("%-10s %-40s %-40s" % ('pid'))
        print("%-10s %-30s %-10s %-30s" % (test_pid, test_smirks, ref_pid, ref_smirks))
        display(depictAtomByIdx(mol, indice_tuple, supH = False))
        print("\n")
    print("\n")
    print("-"*100)
    print("\n")

Extra check for R

Since this notebook was made explicitly for the change from 'R' to 'x' I want to make sure that all of the 'R's were replaced


In [ ]:
# loop through force types
for force_type, test_mol_dict in test_dict.items():  
    # loop through molecules in each force type
    for mol_idx, test_tuple_dict in test_mol_dict.items():
        # loop through all atom indice tuples in this molecule
        for indice_tuple, test_info in test_tuple_dict.items():
            # compare pid assignment
            test_pid = test_info['pid']
            test_smirks = test_info['smirks']
            # Check for 'R'
            if 'R' in test_smirks:
                print("Found 'R' in %s (%s)" % )