In [1]:
import sys,os
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from metatlas import metatlas_objects as metob
import csv


curr_ld_lib_path = ''

os.environ['LD_LIBRARY_PATH'] = curr_ld_lib_path + ':/project/projectdirs/openmsi/jupyterhub_libs/boost_1_55_0/lib' + ':/project/projectdirs/openmsi/jupyterhub_libs/lib'
import sys
# sys.path.remove('/anaconda/lib/python2.7/site-packages')
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')
sys.path.insert(0,'/project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus' )

from rdkit import Chem
# from rdkit.Chem.rdMolDescriptors import ExactMolWt
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import Draw

from rdkit import DataStructs
import numpy as np

from matplotlib import pylab as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import networkx as nx

In [2]:
allCompounds = metob.retrieve('Compounds')
print len(allCompounds)


11871

In [11]:
for c in allCompounds:
    if 'AMP' in c.name:
        print c


glycyl-AMP (10f07f882af14292bd6b7e725ac6082e)
L-threonyl-AMP (18b07b153dcf429c8827952edde92b37)
2-hydroxy-dAMP (3b6ea909ce1041ee863c476b9d936534)
AMP-Mor (45449ed9a0c74a07a8a38a703354e8ed)
1-(5-phospho-β-D-ribosyl)-AMP (4faa68f98f3a499c9edde95663f4e2b9)
linoleyl-AMP (60439034c9864043b783aa09676925ea)
lipol-AMP (62ad67ef79b64b7d82ff2edeb17da61e)
2-iodo-L-isoleucine-NHSO<sub>2</sub>-AMP (76c96a776ced401e8176055a3a284b85)
3'-amino-3'-deoxyAMP (7b7981ccd9564aed8ca4af175a1ff4cd)
dAMP (7e89e193408840e4ace0a0ad76555579)
AMP-PCP (7f263779c884467eb80551297b34eaca)
http://metacyc.org/META/NEW-IMAGE?type=COMPOUND&object=AMP (818b5d47fa604be6ba997ecdb7ba27f0)
AMPPNP (8826c772513b414aa5fa039246c65306)
AMP-lysine (a13a2645de4544298b9da5bda09daf87)
8-oxo-dAMP (db4e69dbffea4225889fd922e7bffd22)
acetyl AMP (f5d5b1563e814e89b61230b1300d1bbd)
cyclic-AMP (f67cedae81134c36bbb91907c782b99e)

In [3]:
allCompounds[0]


Out[3]:
{'InChI': u'InChI=1S/C15H24/c1-11(2)13-8-9-14-7-5-6-12(3)15(14,4)10-13/h7,12-13H,1,5-6,8-10H2,2-4H3/t12-,13-,15-/m1/s1',
 'MonoIsotopic_molecular_weight': 204.188,
 'creation_time': '2015-10-08T17:37:27',
 'description': u'',
 'formula': u'C15H24',
 'functional_sets': [],
 'last_modified': '2015-10-08T17:38:13',
 'name': u'(-)-4-epieremophilene',
 'prev_uid': u'',
 'reference_xrefs': [],
 'synonyms': u'',
 'unique_id': u'0005ec9d4a8e4f1c9bd7cdee585a1ebc',
 'url': u'',
 'username': u'bpb'}

In [44]:
#Store the necessary data from the metatlas database for each compound
myNames = [x.name for x in allCompounds]
myMW = [x.MonoIsotopic_molecular_weight for x in allCompounds]
myformula = [x.formula for x in allCompounds]
myID = [x.unique_id for x in allCompounds]

# Convert each InChI to an RDKit molecule
myMols = [Chem.MolFromInchi(str(x.InChI)) for x in allCompounds]

# Calculate Chemical Fingerprints
# fps = [FingerprintMols.FingerprintMol(x) for x in myMols]

# Calculate Chemical Fingerprints Using Fixed BitLength
N = 2048
fps = [FingerprintMols.FingerprintMol(x,minPath=1,maxPath=7,fpSize=N,bitsPerHash=2,useHs=True,tgtDensity=0,minSize=N,branchedPaths=True,
                                     useBondOrder=True,atomInvariants=[],fromAtoms=[],atomBits=[]) for x in myMols]

In [19]:
fp_mat = np.zeros((len(fps),N))
for i,f in enumerate(fps):
    for j in range(N):
        fp_mat[i,j] = float(DataStructs.BitVectToText(fps[i])[j])

In [20]:
print fp_mat.shape


(11862, 2048)

In [24]:
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import fcluster
dists = pdist(fp_mat[:,:],'cityblock')

In [26]:
from scipy.spatial.distance import squareform
pd = squareform(dists)

In [25]:
# Y = linkage(dists, method='average')
# cutoff = 0
# metClusterIDs = fcluster(Y,cutoff,criterion='distance')
# print len(np.unique(metClusterIDs))

In [ ]:
from rdkit.DataManip import Metric

import time
start_time = time.time()
d = np.zeros((len(fps),len(fps)))
for i,fps1 in enumerate(fps):
    for j,fps2 in enumerate(fps):
        if i<j:
            pd[i,j] = DataStructs.FingerprintSimilarity(fps[i],fps[j])

### TO
# there is something that doesn't make sense with this.
# probably row column ordering is different than scipy
# d = Metric.GetTanimotoSimMat(fps) 
# pd = squareform(d)

e_time = time.time() - start_time
print e_time

In [ ]:


In [18]:
idx1 = 2945
idx2 = 168
test_d = DataStructs.FingerprintSimilarity(fps[idx1],fps[idx2])
print myNames[idx1]," ",myNames[idx2], test_d
print pd[idx1,idx2]


3-oxo-(11Z)-eicos-11-enoyl-CoA   chlorophyll b 0.62623120788
1.0

In [46]:
# s = np.sum(pd==0.0,1)
# idx1 = np.argsort(s)[::-1]

# checker = 0+18

# print idx1[checker], myNames[idx1[checker]]
# idx2 = np.argwhere(pd[idx1[checker],:]==0.0)
# for i,idx in enumerate(idx2):
#     print i, idx, myNames[idx], pd[checker,idx],pd[idx1[checker],idx[0]]
# #     idx2 = np.argwhere(pd[i,:]>=1)
# #     for j in idx2:
# #         print i,j,myNames[j],pd[i,j]

Identify sets of compounds that are identical by Tanimoto coefficient


In [142]:
fid = open('MetAtlas_Network_unity.tab','w')
for i in range(pd.shape[0]):
    for j in range(pd.shape[1]):
        if i<j:
            if pd[i,j]==1:
                fid.write('"#%d %s"\t"#%d %s"\t%5.5f\n' % (i,
                                                           myNames[i],
                                                           j,
                                                           myNames[j],
                                                           pd[i,j]))
#         print i, idx, myNames[idx], pd[checker,idx],pd[idx1[checker],idx[0]]
#     pd_row = pd[i,:]
#     m = np.max(pd_row)
#     if m==1:
#     idx = np.argwhere( pd_row == 1.0 )
#     for j in idx:
#         if j!=i:
#             fid.write('"%d%s"\t"%d%s"\t%5.5f\n' % (i,myNames[i],j,myNames[j],pd_row[j]))
fid.close()

Make network of most similar metabolites


In [48]:
fid = open('MetAtlas_Network_Similar.tab','w')
for i in range(pd.shape[0]):
    temp = pd[i,0:i]
    if len(temp)>1:
        m = min(temp)
        if m<100000:
            for j in range(i-1):
                if pd[i,j]==m:
                    fid.write('"#%d %s %5.4f %s"\t"#%d %s %5.4f %s"\t%5.5f\n' % (i,
                                                                                 myformula[i],
                                                                                 myMW[i],
                                                                                 myNames[i],
                                                                                 j,
                                                                                 myformula[j],
                                                                                 myMW[j],
                                                                                 myNames[j],
                                                                                 pd[i,j]))
    
#     m = max(pd[i,:])
# #     if m == 1:
#     for j in range(pd.shape[1]):
#         if i<j:
#             if pd[i,j]==m:
#                 fid.write('"#%d %s %5.4f %s"\t"#%d %s %5.4f %s"\t%5.5f\n' % (i,
#                                                                                  myformula[i],
#                                                                                  myMW[i],
#                                                                                  myNames[i],
#                                                                                  j,
#                                                                                  myformula[j],
#                                                                                  myMW[j],
#                                                                                  myNames[j],
#                                                                                  pd[i,j]))
#         print i, idx, myNames[idx], pd[checker,idx],pd[idx1[checker],idx[0]]
#     pd_row = pd[i,:]
#     m = np.max(pd_row)
#     if m==1:
#     idx = np.argwhere( pd_row == 1.0 )
#     for j in idx:
#         if j!=i:
#             fid.write('"%d%s"\t"%d%s"\t%5.5f\n' % (i,myNames[i],j,myNames[j],pd_row[j]))
fid.close()

In [45]:
fid = open('Tanimoto_2048_MetAtlas_Network_Cutoff_Min_Match_100.tab','w')
for i in range(pd.shape[0]):
    temp = pd[i,0:i]
    if len(temp)>1:
        m = min(temp)
        if m<100:
            for j in range(i-1):
                if pd[i,j]==m:
                    fid.write('"#%d %s %5.4f %s"\t"#%d %s %5.4f %s"\t%5.5f\n' % (i,
                                                                                 myformula[i],
                                                                                 myMW[i],
                                                                                 myNames[i],
                                                                                 j,
                                                                                 myformula[j],
                                                                                 myMW[j],
                                                                                 myNames[j],
                                                                                 pd[i,j]))
#         print i, idx, myNames[idx], pd[checker,idx],pd[idx1[checker],idx[0]]
#     pd_row = pd[i,:]
#     m = np.max(pd_row)
#     if m==1:
#     idx = np.argwhere( pd_row == 1.0 )
#     for j in idx:
#         if j!=i:
#             fid.write('"%d%s"\t"%d%s"\t%5.5f\n' % (i,myNames[i],j,myNames[j],pd_row[j]))
fid.close()

In [ ]:


In [138]:
print i,j
pd.shape[0]
idx2 = np.where(pd[0,:]==1.0)
print idx2
for i in idx2:
    print i
    print myNames[i]
    print myNames[0]


[1522] 650
(array([1522]),)
[1522]
(+)-valencene
(-)-4-epieremophilene

In [54]:
%%javascript
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
var command = "NOTEBOOK_FULL_PATH = '" + nb.base_url + nb.notebook_path + "'";
kernel.execute(command);



In [58]:
filename = os.path.basename(NOTEBOOK_FULL_PATH)
%system cp $filename /project/projectdirs/openmsi/www/
temp = '%s/%s'%('/project/projectdirs/openmsi/www',filename)
%system chmod 775 $temp
print 'http://nbviewer.ipython.org/url/portal.nersc.gov/project/openmsi/%s?flush_cache=true'%filename


http://nbviewer.ipython.org/url/portal.nersc.gov/project/openmsi/Make_Network_of_MetAtlas_Compounds.ipynb?flush_cache=true

In [ ]: