In [1]:
import sys
import os
import re
import glob
from matplotlib import pylab as plt
import pandas as pd
%matplotlib notebook
# %matplotlib inline
%config InlineBackend.figure_format = 'retina'




# curr_ld_lib_path = ''
# os.environ['LD_LIBRARY_PATH'] = curr_ld_lib_path + ':/project/projectdirs/openmsi/jupyterhub_libs/boost_1_55_0/lib' + ':/project/projectdirs/openmsi/jupyterhub_libs/lib'
# import sys
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')

# sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )



sys.path.append('/project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus')
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )

from metatlas import metatlas_objects as metob
from metatlas import h5_query as h5q
from metatlas import mzml_to_hdf


import score_frag_dag

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
# from rdkit.Chem.rdMolDescriptors import ExactMolWt
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import SVG,display

In [ ]:
# These are the tests:
#         if len(Chem.GetMolFrags(mol, sanitizeFrags=False, asMols=False)) != 1:
#             raise TypeError('Molecule must be fully connected by covalent bonds.')

#         if Chem.RemoveHs(mol).GetNumAtoms() != mol.GetNumAtoms():
#             raise TypeError('Molecule must have only implicit H atoms.')

#         if GetFormalCharge(mol) != 0:
#             raise TypeError('Molecule must have overall neutral charge.')

In [2]:
""" contribution from Hans de Winter """
def _InitialiseNeutralisationReactions():
    patts= (
        # Imidazoles
        ('[n+;H]','n'),
        # Amines
        ('[N+;!H0]','N'),
        # Carboxylic acids and alcohols
        ('[$([O-]);!$([O-][#7])]','O'),
        # Thiols
        ('[S-;X1]','S'),
        # Sulfonamides
        ('[$([N-;X2]S(=O)=O)]','N'),
        # Enamines
        ('[$([N-;X2][C,N]=C)]','N'),
        # Tetrazoles
        ('[n-]','[nH]'),
        # Sulfoxides
        ('[$([S-]=O)]','S'),
        # Amides
        ('[$([N-]C=O)]','N'),
        )
    return [(Chem.MolFromSmarts(x),Chem.MolFromSmiles(y,False)) for x,y in patts]

_reactions=None
def NeutraliseCharges(mol, reactions=None):
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions=_InitialiseNeutralisationReactions()
        reactions=_reactions
#     mol = Chem.MolFromSmiles(smiles)
    replaced = False
    for i,(reactant, product) in enumerate(reactions):
        while mol.HasSubstructMatch(reactant):
            replaced = True
            rms = AllChem.ReplaceSubstructs(mol, reactant, product)
            rms_smiles = Chem.MolToSmiles(rms[0])
            mol = Chem.MolFromSmiles(rms_smiles)
    if replaced:
        return (mol, True) #Chem.MolToSmiles(mol,True)
    else:
        return (mol, False)
def drawStructure_ShowingFragment(pactolus_tree,fragment_idx,myMol,myMol_w_Hs):

    drawer = rdMolDraw2D.MolDraw2DSVG(600,300)

    fragment_atoms = np.where(pactolus_tree[fragment_idx]['atom_bool_arr'])[0]
    mark_atoms_no_H = []
    for a_index in fragment_atoms:
        if myMol_w_Hs.GetAtomWithIdx(a_index).GetSymbol() != 'H':
            mark_atoms_no_H.append(a_index)

    rdDepictor.Compute2DCoords(myMol)

    drawer.DrawMolecule(myMol,highlightAtoms=mark_atoms_no_H)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText().replace('svg:','')
    return svg

def drawStructure_Fragment(pactolus_tree,fragment_idx,myMol,myMol_w_Hs):
    fragment_atoms = np.where(pactolus_tree[fragment_idx]['atom_bool_arr'])[0]
    depth_of_hit = np.sum(pactolus_tree[fragment_idx]['bond_bool_arr'])
    mol2 = deepcopy(myMol_w_Hs)
    # Now set the atoms you'd like to remove to dummy atoms with atomic number 0
    fragment_atoms = np.where(pactolus_tree[fragment_idx]['atom_bool_arr']==False)[0]
    for f in fragment_atoms:
        mol2.GetAtomWithIdx(f).SetAtomicNum(0)

    # Now remove dummy atoms using a query
    mol3 = Chem.DeleteSubstructs(mol2, Chem.MolFromSmarts('[#0]'))
    mol3 = Chem.RemoveHs(mol3)
    # You get what you are looking for
    return moltosvg(mol3),depth_of_hit


def moltosvg(mol,molSize=(450,150),kekulize=True):
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except:
            mc = Chem.Mol(mol.ToBinary())
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # It seems that the svg renderer used doesn't quite hit the spec.
    # Here are some fixes to make it work in the notebook, although I think
    # the underlying issue needs to be resolved at the generation step
    return svg.replace('svg:','')

def get_ion_from_fragment(frag_info,spectrum):
    hit_indices = np.where(np.sum(frag_info,axis=1))
    hit = spectrum[hit_indices,:][0]
    return hit,hit_indices

In [4]:
allCompounds = metob.retrieve('Compounds',inchi='InchI%',username='*')
print len(allCompounds)
allCompounds[0]
#11539 trees of depth 5 Curt had before I messed with it.


15616
Out[4]:
{'creation_time': '2015-10-08T17:36:17',
 'description': u'',
 'formula': u'C21H28O2',
 'functional_sets': [],
 'head_id': u'0029ce6ff55b44468275b69597be364c',
 'inchi': u'InChI=1S/C21H28O2/c1-13(22)17-6-7-18-16-5-4-14-12-15(23)8-10-20(14,2)19(16)9-11-21(17,18)3/h9,12,16-18H,4-8,10-11H2,1-3H3/t16?,17?,18?,20-,21+/m0/s1',
 'inchi_key': None,
 'last_modified': '2015-10-08T17:38:10',
 'mono_isotopic_molecular_weight': 312.20892333984375,
 'name': u'pregna-4,9(11)-diene-3,20-dione',
 'neutralized_2d_inchi': None,
 'neutralized_2d_inchi_key': None,
 'neutralized_inchi': None,
 'neutralized_inchi_key': None,
 'number_components': None,
 'permanent_charge': None,
 'prev_uid': u'origin',
 'reference_xrefs': [],
 'synonyms': u'',
 'unique_id': u'0029ce6ff55b44468275b69597be364c',
 'url': u'',
 'username': u'bpb'}

In [6]:
# all_my_h5_files = glob.glob('/global/project/projectdirs/openmsi/projects/ben_trees/*_hdf5_5_*.h5')
# inchi_keys = []
# for f in all_my_h5_files:
#     temp = f.replace('/global/project/projectdirs/openmsi/projects/ben_trees/FragTreeLibrary_test_hdf5_5_','').replace('h5','').replace('.','')
#     inchi_keys.append(temp)
# len(inchi_keys)
# #was 13189 before

In [7]:
# get inchi key for neutralized molecules
ik = []
num_pieces = []
inchi = []
charge = []
compound_list = []
for idx,myCompound in enumerate(allCompounds):
    compound_list.append(myCompound.name)
    
    try:
        myMol = Chem.MolFromInchi(str(myCompound.inchi))
        (myMol, neutralised) = NeutraliseCharges(myMol)

        num_pieces.append(len(Chem.GetMolFrags(myMol, sanitizeFrags=False, asMols=False)))
        charge.append(Chem.GetFormalCharge(myMol))
        i = Chem.MolToInchi(myMol)
        inchi.append(i)
        ik.append(Chem.InchiToInchiKey(i))
        
    except:
        print "Can not parse ", myCompound.name, myCompound.InChI

In [9]:
cols = ['InChI',
 'MonoIsotopic_molecular_weight',
 'formula',
       'InChI-Key',
       'charge',
       'num_pieces',
       'description']
    
    # print myAtlas[0].compound_identifications[0].compound
df = pd.DataFrame(index=compound_list, columns=cols)

df['name'] = compound_list
df.set_index('name',drop=True)
for idx,myCompound in enumerate(allCompounds):
    n = myCompound.name
    df.ix[n,'formula'] = myCompound.formula
    df.ix[n,'Monoisotopic'] = myCompound.mono_isotopic_molecular_weight
    df.ix[n,'num_pieces'] = num_pieces[idx]
    df.ix[n,'Charge'] = charge[idx]
    df.ix[n,'InChI-Key'] = ik[idx]
    df.ix[n,'InChI'] = inchi[idx]
    df.ix[n,'description'] = myCompound.description
df.to_csv('20151214_lookup_table_metatlas_to_inchi_key.csv')

In [ ]: