Goal
1) Try to include as many parameter types from smirnoff99Frosst as possible
2) Try to include all parm@Frosst atomtypes from initial set
2) Limit the number of total molecules
Authors:
In [41]:
# Imports
from __future__ import print_function
from openeye.oechem import *
from openeye.oeiupac import *
from openeye.oeomega import *
from openeye.oedepict import *
from IPython.display import display
from openforcefield.typing.engines.smirnoff.forcefield import *
from openforcefield.typing.engines.smirnoff.forcefield_utils import get_molecule_parameterIDs
from openforcefield.utils import *
% matplotlib inline
import matplotlib
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import time
import IPython
import pickle
import copy
In [42]:
# load molecules with 3D conformers
inf = get_data_filename("molecules/DrugBank_ff.mol2")
starting_molecules = list()
init = time.time()
smiles = set()
for mol in read_molecules(inf):
# remove repeating molecules
smile = OECreateIsoSmiString(mol)
if smile not in smiles:
smiles.add(smile)
starting_molecules.append(OEMol(mol))
end = time.time()
print("Found %i molecules after removing duplicates in \n %s \n it took %.1f seconds" % (len(starting_molecules), inf, end-init))
Generally speaking this removes molecules smirnoff99Frosst cannot currently type. As this notebook was created to generate a molecule set for smarty/smirky tests it also removes molecules the parm@Frosst forcefield cannot type
In [43]:
def hasAtomType(mol, typ):
for atom in mol.GetAtoms():
if atom.GetType() in typ:
return True
return False
ff = ForceField('forcefield/smirnoff99Frosst.ffxml')
labels = ff.labelMolecules(starting_molecules)
molecules = list()
metal_smiles = list()
c = {'metals':0, 'heavy':0, 'light': 0, 'atomtypes': 0, 'n1': 0, 'b1': 0, 'a1': 0, 't1': 0}
for index, mol in enumerate(starting_molecules):
if OECount(mol, OEIsMetal()) > 0:
c['metals'] +=1
metal_smiles.append(OECreateIsoSmiString(mol))
continue
if OECount(mol, OEIsHeavy()) > 100:
c['heavy'] +=1
continue
if OECount(mol, OEIsHeavy()) < 4:
c['light'] +=1
continue
if hasAtomType(mol, ['gg','IM']):
c['atomtypes'] +=1
continue
pids = [pid for force, lists in labels[index].items() for (indices,pid, smarts) in lists]
if 'n1' in pids:
c['n1']+=1
continue
if 'b1' in pids:
c['b1']+= 1
continue
if 'a1' in pids:
c['a1']+=1
continue
if 't1' in pids:
c['t1']+=1
continue
molecules.append(OEMol(mol))
print("Keeping %i molecules of the original %i" % (len(molecules), len(starting_molecules)))
for k, e in c.items():
print("%-15s %i" % (k,e))
In [4]:
init = time.time()
labels = ff.labelMolecules(molecules)
end = time.time()
print("Took %.1f minutes to label all %i molecules" % ((end-init)/60.0, len(molecules)))
In [5]:
def getInitialInformation(labels, molecules):
keep_mols = set()
unused_pids = set()
dict_mols = dict()
for i in range(len(labels)):
dict_mols[i] = set()
dict_pid = dict()
# loop through molecules:
for idx, molDict in enumerate(labels):
# loop through forcetypes from smirnoff99Frosst label
for force, idList in molDict.items():
# loop through pid lists
for (idices, pid, smirks) in idList:
if not pid in dict_pid:
dict_pid[pid] = set()
unused_pids.add(pid)
dict_mols[idx].add(pid)
dict_pid[pid].add(idx)
# Loop through atoms in molecule treating where
# atomtypes are the parameter id
mol = molecules[idx]
for atom in mol.GetAtoms():
typ = atom.GetType()
if not typ in dict_pid:
dict_pid[typ] = set()
unused_pids.add(typ)
dict_mols[idx].add(typ)
dict_pid[typ].add(idx)
return unused_pids, dict_mols, dict_pid
In [6]:
init = time.time()
unused_pids, dict_mols, dict_pid = getInitialInformation(labels, molecules)
all_pids = list(unused_pids)
end = time.time()
print("Created initial dictionaries and lists in %.1f seconds" % (end-init))
In [7]:
# store molecules (to keep) and pids (to remove)
init = time.time()
keep_mols = set()
length = 2
while len(unused_pids) > 0:
# Track if a change has been made
to_remove_pids = set()
unchanged = True
# For all pids still in the unused list
for pid in unused_pids:
# get the list of molecules with that parameter
molList = copy.copy(dict_pid[pid])
# Trying to remove the limited number of molecules, so
# start by only considering pids with in a small number of molecules,
# if no changes are made then the length is increased
if len(molList) < length:
unchanged = False
for m in molList:
keep_mols.add(m) # Keep this molecule
for pid in dict_mols[m]:
# remove all pids in this molecule from the unused list
to_remove_pids.add(pid)
if m in dict_pid[pid]:
dict_pid[pid].remove(m) # remove this molecule from pid dict as it has already been stored
# updated unused_pids
for pid in to_remove_pids:
if pid in unused_pids:
unused_pids.remove(pid)
# update length of molList considered
if unchanged:
length += 1
# If A change was made reset length to 2
else:
print("the length was %i" % length)
length = 2
end = time.time()
print("Removing unnecessary molecules took %.1f seconds" % (end-init))
In [8]:
# Make an updated list with only the stored molecules
new_molecules = list()
for idx in keep_mols:
new_molecules.append(OEMol(molecules[idx]))
print("There are %i molecules in the final set" % len(new_molecules))
In [18]:
init = time.time()
new_labels = [labels[idx] for idx in keep_mols]
new_labels = ff.labelMolecules(new_molecules)
new_unused_pids, new_dict_mols, new_dict_pid = getInitialInformation(new_labels, new_molecules)
for pid in all_pids:
if pid not in new_unused_pids:
print("pid %s is missing in new_molecules" % pid)
end = time.time()
print("checking new molecules took %.1f seconds" % (end-init))
In [10]:
def getAtomtypeCounts(molecules):
A = dict()
for mol in new_molecules:
for atom in mol.GetAtoms():
a_num = atom.GetAtomicNum()
a_type = atom.GetType()
if not a_num in A:
A[a_num] = set()
A[a_num].add(a_type)
return A
In [21]:
# print information about smirnoff parameter IDs
print( "Total pids and atom types: %i" % len(all_pids))
n = [a for a in all_pids if a[0]=='n']
print("Vdw %i" % len(n))
b = [a for a in all_pids if a[0]=='b']
print("Bonds %i" % len(b))
aS = [a for a in all_pids if a[0]=='a']
print("Angles %i" % len(aS))
t = [a for a in all_pids if a[0]=='t']
print("Torsions %i" % len(t))
# print information about diversity of atom types
oldA = getAtomtypeCounts(molecules)
newA = getAtomtypeCounts(new_molecules)
print("%20s %20s %20s" % ('AtomicNum', 'in Old', 'in New'))
for k, e in oldA.items():
new_e = newA[k]
print("%20s %20s %20s" % (k, len(e), len(new_e)))
In [26]:
import os
ff_file = 'MiniDrugBank_ff.mol2'
tri_file = 'MiniDrugBank_tripos.mol2'
# Following example in Christopher Bayly's oeb-to-FF-and-tripos-mol2.py script
if os.path.exists(ff_file):
os.system("rm %s" % ff_file)
if os.path.exists(tri_file):
os.system("rm %s" % tri_file)
# Make a parm@Frosst atomtype mol2 file
ofsff = oechem.oemolostream()
ofsff.SetFlavor( oechem.OEFormat_MOL2, oechem.OEOFlavor_MOL2_Forcefield )
ofsff.open(ff_file)
# and Tripos atomtype mol2 file
ofsTri = oechem.oemolostream()
ofsTri.SetFlavor( oechem.OEFormat_MOL2, oechem.OEOFlavor_MOL2_Forcefield )
ofsTri.open(tri_file)
for index, c_mol in enumerate(new_molecules):
c_mol.SetTitle("MiniDrugBank_%i" % index)
mol1 = OEMol(c_mol)
oechem.OETriposAtomNames(mol1)
oechem.OEWriteConstMolecule(ofsff, mol1)
mol2 = OEMol(c_mol)
oechem.OETriposAtomTypeNames(mol2)
oechem.OEWriteConstMolecule(ofsTri, mol2)
ofsff.close()
ofsTri.close()
We will perform the following tests:
In [34]:
tripos_molecules = read_molecules(tri_file)
ff_molecules = read_molecules(ff_file)
In [35]:
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)
# Check SMILES strings
smiles = set()
# check for repeating SMILES
for idx, ff_mol in enumerate(ff_mols):
# get SMILES information
ff_smile = oechem.OECreateIsoSmiString(ff_mol)
tri_mol = tripos_mols[idx]
tri_smile = oechem.OECreateIsoSmiString(tri_mol)
if ff_smile != tri_smile:
print("Molecule %i doesn't match\n parm@frosst: %s (%s) \n tripos: %s (%s) \n" %\
(idx, ff_smile, ff_mol.GetTitle(), tri_smile, tri_mol.GetTitle()))
if ff_smile in smiles:
print("Found repeating SMILES string for molecule %i %s (%s)" % \
(idx, ff_smile, ff_mol.GetTitle()))
# add smiles to the list
smiles.add(ff_smile)
In [36]:
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)
# Check atom types
atom_types = {1: [12, set()], 6: [12, set()], 7: [8, set()],
8: [5, set()], 9: [1, set()], 15: [1, set()], 16: [5, set()],
17: [1, set()], 35: [1, set()], 53: [1, set()]}
for mol in ff_mols:
for atom in mol.GetAtoms():
# get atomic number
n = atom.GetAtomicNum()
t = atom.GetType()
if n not in atom_types:
print("Element number %i missing in original list" % n)
atom_types[n][1].add(t)
# Check that the number of types here match original
for n, [count, s] in atom_types.items():
if len(s) != count:
print("Current set has %i atom types for atomic number %i" % (len(s), n))
print("there were %i in the original set\n" % (count))
In [37]:
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)
# Check parameter IDS
pids = {'HarmonicBondGenerator': [73, set()],
'HarmonicAngleGenerator': [34, set()],
'PeriodicTorsionGenerator': [136, set()],
'NonbondedGenerator': [26, set()]}
ff = ForceField("forcefield/smirnoff99Frosst.ffxml")
labels = ff.labelMolecules(ff_mols, verbose = False)
# loop through labels from smirnoff
for force_dict in labels:
for force, label_list in force_dict.items():
for (indics, pid, smirks) in label_list:
# we don't have current counts on impropers
if pid[0] == 'i':
continue
pids[force][1].add(pid)
# Check that the number of types here match original
for force, [count, s] in pids.items():
if len(s) != count:
print("Current set has %i types for %s" % (len(s), force))
print("There were %i in the original set" % count)
In [38]:
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)
# Check for 3D coordinates
for idx, ff_mol in enumerate(ff_mols):
tri_mol = tripos_mols[idx]
if ff_mol.GetDimension() != 3:
print("Molecule %i in the parm@frosst set (%s) doesn't have 3D coordinates" % \
(idx, ff_mol.GetTitle()))
if tri_mol.GetDimension() != 3:
print("Molecule %i in the tripos set (%s) doesn't have 3D coordinates" % \
(idx, tri_mol.GetTitle()))
In [40]:
ff_mols = copy.deepcopy(ff_molecules)
tripos_mols = copy.deepcopy(tripos_molecules)
# check for implicit hydrogens
for idx, ff_mol in enumerate(ff_mols):
for a in ff_mol.GetAtoms():
if a.GetImplicitHCount() != 0:
print("Found a %i atom with implicit hydrogens in molecule %i (%s) in the parm@frosst set" \
% (a.GetAtomicNum(), idx, ff_mol.GetTitle()))
tri_mol = tripos_mols[idx]
for a in tri_mol.GetAtoms():
if a.GetImplicitHCount() != 0:
print("Found a %i atom with implicit hydrogens in molecule %i (%s) in the tripos set" \
% (a.GetAtomicNum(), idx, tri_mol.GetTitle()))
In [ ]: