Create Mini Molecule Set from Large Database

Goal

1) Try to include as many parameter types from smirnoff99Frosst as possible

2) Try to include all parm@Frosst atomtypes from initial set

2) Limit the number of total molecules

Authors:

Caitlin C. Bannan (UCI)
Followed parm@frossty-y to SMIRNOFF script by David L. Mobley (UCI) as example on how to type molecules with smirnoff99Frosst



In [41]:

    
# Imports
from __future__ import print_function
from openeye.oechem import *
from openeye.oeiupac import *
from openeye.oeomega import *
from openeye.oedepict import *
from IPython.display import display
from openforcefield.typing.engines.smirnoff.forcefield import *
from openforcefield.typing.engines.smirnoff.forcefield_utils import get_molecule_parameterIDs
from openforcefield.utils import *
% matplotlib inline
import matplotlib
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import time
import IPython
import pickle
import copy

1. Load Molecules

Load all molecules with parm@frosst atom types and 3D coordinates. We will remove those with repeating SMILES strings.



In [42]:

    
# load molecules with 3D conformers 
inf = get_data_filename("molecules/DrugBank_ff.mol2")
starting_molecules = list()
init = time.time()
smiles = set()
for mol in read_molecules(inf):
    # remove repeating molecules
    smile = OECreateIsoSmiString(mol)
    if smile not in smiles:
        smiles.add(smile)
        starting_molecules.append(OEMol(mol))
end = time.time()
print("Found %i molecules after removing duplicates in \n %s \n it took %.1f seconds" % (len(starting_molecules), inf, end-init))









    



Loading molecules from '/Users/bannanc/anaconda3/lib/python3.5/site-packages/openforcefield/data/molecules/DrugBank_ff.mol2'...
5928 molecules read
1.135 s elapsed
Found 5700 molecules after removing duplicates in 
 /Users/bannanc/anaconda3/lib/python3.5/site-packages/openforcefield/data/molecules/DrugBank_ff.mol2 
 it took 1.8 seconds

1.5 Remove unwanted molecules

Generally speaking this removes molecules smirnoff99Frosst cannot currently type. As this notebook was created to generate a molecule set for smarty/smirky tests it also removes molecules the parm@Frosst forcefield cannot type

No metals
No more than 100 heavy atoms
Grater than 2 heavy atoms
No gg atom types (that is the generic parm@Frosst) atomtype



In [43]:

    
def hasAtomType(mol, typ):
    for atom in mol.GetAtoms():
        if atom.GetType() in typ:
            return True
    return False
    
ff = ForceField('forcefield/smirnoff99Frosst.ffxml')  
labels = ff.labelMolecules(starting_molecules)
molecules = list()
metal_smiles = list()
c = {'metals':0, 'heavy':0, 'light': 0, 'atomtypes': 0, 'n1': 0, 'b1': 0, 'a1': 0, 't1': 0}
for index, mol in enumerate(starting_molecules):
    if OECount(mol, OEIsMetal()) > 0:
        c['metals'] +=1
        metal_smiles.append(OECreateIsoSmiString(mol))
        continue
    if OECount(mol, OEIsHeavy()) > 100:
        c['heavy'] +=1
        continue
    if OECount(mol, OEIsHeavy()) < 4:
        c['light'] +=1 
        continue
    if hasAtomType(mol, ['gg','IM']):
        c['atomtypes'] +=1
        continue
    pids = [pid for force, lists in labels[index].items() for (indices,pid, smarts) in lists]
    if 'n1' in pids:
        c['n1']+=1
        continue
    if 'b1' in pids:
        c['b1']+= 1
        continue
    if 'a1' in pids:
        c['a1']+=1
        continue
    if 't1' in pids:
        c['t1']+=1 
        continue
    molecules.append(OEMol(mol))

print("Keeping %i molecules of the original %i" % (len(molecules), len(starting_molecules)))

for k, e in c.items():
    print("%-15s %i" % (k,e))









    



Keeping 5616 molecules of the original 5700
atomtypes       7
a1              0
light           20
n1              38
t1              5
heavy           1
b1              2
metals          11

2. Load SMIRNOFF99Frosst and label molecules



In [4]:

    
init = time.time()
labels = ff.labelMolecules(molecules)
end = time.time()
print("Took %.1f minutes to label all %i molecules" % ((end-init)/60.0, len(molecules)))









    



Took 12.5 minutes to label all 5616 molecules

3. Store initial information



In [5]:

    
def getInitialInformation(labels, molecules):
    keep_mols = set()
    unused_pids = set()
    dict_mols = dict()
    for i in range(len(labels)):
        dict_mols[i] = set()
    dict_pid = dict()
    # loop through molecules:
    for idx, molDict in enumerate(labels):
        # loop through forcetypes from smirnoff99Frosst label
        for force, idList in molDict.items():
            # loop through pid lists
            for (idices, pid, smirks) in idList:
                if not pid in dict_pid:
                    dict_pid[pid] = set()
                unused_pids.add(pid)
                dict_mols[idx].add(pid)
                dict_pid[pid].add(idx)
        
        # Loop through atoms in molecule treating where
        # atomtypes are the parameter id
        mol = molecules[idx]
        for atom in mol.GetAtoms():
            typ = atom.GetType()
            if not typ in dict_pid:
                dict_pid[typ] = set()
            unused_pids.add(typ)
            dict_mols[idx].add(typ)
            dict_pid[typ].add(idx)
                
    return unused_pids, dict_mols, dict_pid



In [6]:

    
init = time.time()
unused_pids, dict_mols, dict_pid = getInitialInformation(labels, molecules)
all_pids = list(unused_pids)
end = time.time()
print("Created initial dictionaries and lists in %.1f seconds" % (end-init))









    



Created initial dictionaries and lists in 3.8 seconds

4. Remove unnecessary molecules



In [7]:

    
# store molecules (to keep) and pids (to remove)
init = time.time()
keep_mols = set()

length = 2
while len(unused_pids) > 0:
    # Track if a change has been made
    to_remove_pids = set()
    unchanged = True
    
    # For all pids still in the unused list
    for pid in unused_pids:    
        # get the list of molecules with that parameter
        molList = copy.copy(dict_pid[pid])
        
        # Trying to remove the limited number of molecules, so 
        # start by only considering pids with in a small number of molecules, 
        # if no changes are made then the length is increased 
        if len(molList) < length:
            unchanged = False
            for m in molList:
                keep_mols.add(m) # Keep this molecule
                for pid in dict_mols[m]:
                    # remove all pids in this molecule from the unused list
                    to_remove_pids.add(pid) 
                if m in dict_pid[pid]:
                    dict_pid[pid].remove(m) # remove this molecule from pid dict as it has already been stored
            
    # updated unused_pids
    for pid in to_remove_pids:
        if pid in unused_pids:
            unused_pids.remove(pid)
    
    # update length of molList considered
    if unchanged:
        length += 1
    # If A change was made reset length to 2
    else:
        print("the length was %i" % length)
        length = 2

end = time.time()
print("Removing unnecessary molecules took %.1f seconds" % (end-init))









    



the length was 2
the length was 3
the length was 4
the length was 5
the length was 6
the length was 7
the length was 8
the length was 9
the length was 11
the length was 14
the length was 15
the length was 18
the length was 22
the length was 29
the length was 122
Removing unnecessary molecules took 0.0 seconds



In [8]:

    
# Make an updated list with only the stored molecules
new_molecules = list()
for idx in keep_mols:
    new_molecules.append(OEMol(molecules[idx]))
print("There are %i molecules in the final set" % len(new_molecules))









    



There are 371 molecules in the final set

5. Get information about new molecules



In [18]:

    
init = time.time()
new_labels = [labels[idx] for idx in keep_mols]
new_labels = ff.labelMolecules(new_molecules)
new_unused_pids, new_dict_mols, new_dict_pid = getInitialInformation(new_labels, new_molecules)

for pid in all_pids:
    if pid not in new_unused_pids:
        print("pid %s is missing in new_molecules" % pid)
end = time.time()
print("checking new molecules took %.1f seconds" % (end-init))









    



checking new molecules took 48.6 seconds



In [10]:

    
def getAtomtypeCounts(molecules):
    A = dict()
    for mol in new_molecules:
        for atom in mol.GetAtoms():
            a_num = atom.GetAtomicNum()
            a_type = atom.GetType()
            if not a_num in A:
                A[a_num] = set()
            A[a_num].add(a_type)
    return A



In [21]:

    
# print information about smirnoff parameter IDs
print( "Total pids and atom types: %i" % len(all_pids))
n = [a for a in all_pids if a[0]=='n']
print("Vdw %i" % len(n))
b = [a for a in all_pids if a[0]=='b']
print("Bonds %i" % len(b))
aS = [a for a in all_pids if a[0]=='a']
print("Angles %i" % len(aS))
t = [a for a in all_pids if a[0]=='t']
print("Torsions %i" % len(t))

# print information about diversity of atom types
oldA = getAtomtypeCounts(molecules)
newA = getAtomtypeCounts(new_molecules)
print("%20s %20s %20s" % ('AtomicNum', 'in Old', 'in New'))
for k, e in oldA.items():
    new_e = newA[k]
    print("%20s %20s %20s" % (k, len(e), len(new_e)))









    



Total pids and atom types: 320
Vdw 26
Bonds 73
Angles 34
Torsions 136
           AtomicNum               in Old               in New
                  16                    5                    5
                   1                   12                   12
                  35                    1                    1
                  17                    1                    1
                  53                    1                    1
                   6                   12                   12
                   7                    8                    8
                   8                    5                    5
                   9                    1                    1
                  15                    1                    1

6. Store new Molecules



In [26]:

    
import os
ff_file = 'MiniDrugBank_ff.mol2'
tri_file = 'MiniDrugBank_tripos.mol2'

# Following example in Christopher Bayly's oeb-to-FF-and-tripos-mol2.py script
if os.path.exists(ff_file):
    os.system("rm %s" % ff_file)
if os.path.exists(tri_file):
    os.system("rm %s" % tri_file)

# Make a parm@Frosst atomtype mol2 file
ofsff = oechem.oemolostream()
ofsff.SetFlavor( oechem.OEFormat_MOL2, oechem.OEOFlavor_MOL2_Forcefield )
ofsff.open(ff_file)
# and Tripos atomtype mol2 file
ofsTri = oechem.oemolostream()
ofsTri.SetFlavor( oechem.OEFormat_MOL2, oechem.OEOFlavor_MOL2_Forcefield )
ofsTri.open(tri_file)

for index, c_mol in enumerate(new_molecules):
    c_mol.SetTitle("MiniDrugBank_%i" % index)
    mol1 = OEMol(c_mol)
    oechem.OETriposAtomNames(mol1)
    oechem.OEWriteConstMolecule(ofsff, mol1)
    mol2 = OEMol(c_mol)
    oechem.OETriposAtomTypeNames(mol2)
    oechem.OEWriteConstMolecule(ofsTri, mol2)

ofsff.close()
ofsTri.close()

7. Test new MiniDrugBank Set

We will perform the following tests:

check for repeating molecules based on isomeric SMILES
check that the number of different parm@frosst atom types haven't changed
check that the number of different types of smirnoff99Frosst parameter IDs
check for 3D coordinates
check for no implicit hydrogens



In [34]:

    
tripos_molecules = read_molecules(tri_file)
ff_molecules = read_molecules(ff_file)









    



Loading molecules from 'MiniDrugBank_tripos.mol2'...
371 molecules read
0.156 s elapsed
Loading molecules from 'MiniDrugBank_ff.mol2'...
371 molecules read
0.102 s elapsed



In [35]:

    
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)

# Check SMILES strings
smiles = set()
# check for repeating SMILES
for idx, ff_mol in enumerate(ff_mols):
    # get SMILES information
    ff_smile = oechem.OECreateIsoSmiString(ff_mol)
    tri_mol = tripos_mols[idx]
    tri_smile = oechem.OECreateIsoSmiString(tri_mol)

    if ff_smile != tri_smile:
        print("Molecule %i doesn't match\n parm@frosst: %s (%s) \n tripos: %s (%s) \n" %\
              (idx, ff_smile, ff_mol.GetTitle(), tri_smile, tri_mol.GetTitle()))
    
    if ff_smile in smiles:
        print("Found repeating SMILES string for molecule %i %s (%s)" % \
              (idx, ff_smile, ff_mol.GetTitle()))

    # add smiles to the list
    smiles.add(ff_smile)



In [36]:

    
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)

# Check atom types
atom_types = {1: [12, set()], 6: [12, set()], 7: [8, set()],
        8: [5, set()], 9: [1, set()], 15: [1, set()], 16: [5, set()],
        17: [1, set()], 35: [1, set()], 53: [1, set()]}

for mol in ff_mols:
    for atom in mol.GetAtoms():
        # get atomic number
        n = atom.GetAtomicNum()
        t = atom.GetType()
        if n not in atom_types:
            print("Element number %i missing in original list" % n)
        atom_types[n][1].add(t)

# Check that the number of types here match original
for n, [count, s] in atom_types.items():
    if len(s) != count:
        print("Current set has %i atom types for atomic number %i" % (len(s), n))
        print("there were %i in the original set\n" % (count))



In [37]:

    
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)

# Check parameter IDS
pids = {'HarmonicBondGenerator': [73, set()],
        'HarmonicAngleGenerator': [34, set()],
        'PeriodicTorsionGenerator': [136, set()],
        'NonbondedGenerator': [26, set()]}

ff = ForceField("forcefield/smirnoff99Frosst.ffxml")
labels = ff.labelMolecules(ff_mols, verbose = False)
# loop through labels from smirnoff
for force_dict in labels:
    for force, label_list in force_dict.items():
        for (indics, pid, smirks) in label_list:
            # we don't have current counts on impropers
            if pid[0] == 'i':
                continue
            pids[force][1].add(pid)

# Check that the number of types here match original
for force, [count, s] in pids.items():
    if len(s) != count:
        print("Current set has %i types for %s" % (len(s), force))
        print("There were %i in the original set" % count)



In [38]:

    
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)

# Check for 3D coordinates
for idx, ff_mol in enumerate(ff_mols):
    tri_mol = tripos_mols[idx]
    if ff_mol.GetDimension() != 3:
        print("Molecule %i in the parm@frosst set (%s) doesn't have 3D coordinates" % \
             (idx, ff_mol.GetTitle()))
    if tri_mol.GetDimension() != 3:
        print("Molecule %i in the tripos set (%s) doesn't have 3D coordinates" % \
             (idx, tri_mol.GetTitle()))



In [40]:

    
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)

# check for implicit hydrogens
for idx, ff_mol in enumerate(ff_mols):
    for a in ff_mol.GetAtoms():
        if a.GetImplicitHCount() != 0:
            print("Found a %i atom with implicit hydrogens in molecule %i (%s) in the parm@frosst set" \
                 % (a.GetAtomicNum(), idx, ff_mol.GetTitle()))
            
    tri_mol = tripos_mols[idx]
    for a in tri_mol.GetAtoms():
        if a.GetImplicitHCount() != 0:
            print("Found a %i atom with implicit hydrogens in molecule %i (%s) in the tripos set" \
                 % (a.GetAtomicNum(), idx, tri_mol.GetTitle()))



In [ ]: