Add a file-level attribute to HDF5 files containing Pactolus trees

The attribute is the InChI string. I'm writing the updated files to a new folder, "pactolus_trees_with_inchi", which is not readable by OpenMSI users but is writable by us.

When you are happy with the result, run:
chgrp -R m1541 "foldername"
chmod -R 750 "foldername"
so that the users have read access to the updated tree files.


In [1]:
# %matplotlib notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pylab as plt
import sys
import glob, os

# Point the dynamic loader and the Python import machinery at the shared
# OpenMSI / pactolus installs. The order of the sys.path edits below matters:
# insert(0, ...) takes precedence over append(...).
# NOTE(review): curr_ld_lib_path is always the empty string, so any
# LD_LIBRARY_PATH already set in the environment is overwritten and the new
# value starts with a leading ':' -- confirm this is intended rather than
# os.environ.get('LD_LIBRARY_PATH', '').
curr_ld_lib_path = ''
os.environ['LD_LIBRARY_PATH'] = curr_ld_lib_path + ':/project/projectdirs/openmsi/jupyterhub_libs/boost_1_55_0/lib' + ':/project/projectdirs/openmsi/jupyterhub_libs/lib'
# NOTE(review): sys is already imported earlier in this cell; this re-import is redundant.
import sys
# sys.path.remove('/anaconda/lib/python2.7/site-packages')
# Fallback location for third-party packages (searched after the existing entries).
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')
# Put the pactolus source directory first so the generate_frag_dag and
# score_frag_dag imports that follow resolve from it.
sys.path.insert(0,'/project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus' )

from generate_frag_dag import *

import score_frag_dag

sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )

from metatlas import metatlas_objects as metob
from metatlas import h5_query as h5q
from metatlas import mzml_to_hdf

import shutil

import numpy as np
import h5py
import tables
import pickle

from rdkit import Chem
# from rdkit.Chem.rdMolDescriptors import ExactMolWt
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D

from copy import deepcopy


from pyteomics import mgf

from rdkit.Chem.Draw import IPythonConsole
from IPython.display import SVG,display

In [2]:
# InChI tables to merge; each file is read as one InChI string per line.
datafile = [
    'inchi_tables/enzo_and_img_abc_inchis.txt',
    'inchi_tables/gnps_molecule_inchis.txt',
    'inchi_tables/neutral_organic_compounds_from_metacyc.txt',
]

inchi = []
for d in datafile:
    with open(d) as fid:
        for line in fid:
            entry = line.strip()
            # Drop blank lines explicitly instead of assuming that the first
            # element of the sorted array is an empty string: if no blank line
            # were present, slicing off element 0 would discard a real InChI.
            if entry:
                inchi.append(entry)

# np.unique removes duplicates across the tables and returns a sorted array.
inchi = np.unique(inchi)

# Each record is (inchi, inchi-key, rdkit-mol, rdkit-mol with explicit Hs).
# TODO: store metatlas compound ID
inchi_with_key = []
unparsed_inchi = []
for chi in inchi:
    myMol = Chem.MolFromInchi(chi)
    if myMol is None:
        # RDKit could not build a molecule from this string; Chem.AddHs(None)
        # would raise, so record the string and keep going.
        unparsed_inchi.append(chi)
        continue
    inchi_with_key.append((chi, Chem.InchiToInchiKey(chi), myMol, Chem.AddHs(myMol)))

print(len(inchi_with_key))
if unparsed_inchi:
    print('Skipped %d InChI strings that RDKit could not parse' % len(unparsed_inchi))
if inchi_with_key:
    print(inchi_with_key[0])


13920
('InChI=1S/2C10H15NO.H2O4S/c2*1-11(2)8-7-9-3-5-10(12)6-4-9;1-5(2,3)4/h2*3-6,12H,7-8H2,1-2H3;(H2,1,2,3,4)', 'PKCUSEDYJXJFJO-UHFFFAOYSA-N', <rdkit.Chem.rdchem.Mol object at 0x7f63584b27d0>, <rdkit.Chem.rdchem.Mol object at 0x7f635c43ebb0>)

In [3]:
# Attach the InChI string as a file-level 'inchi' attribute on every tree file.
# The first top-level member name of each HDF5 file is compared against the
# InChI keys computed in inchi_with_key; files without a match are printed.
path_to_trees = '/project/projectdirs/openmsi/projects/pactolus_trees_with_inchi/'
all_my_h5_files = glob.glob(os.path.join(path_to_trees, '*_hdf5_5_*.h5'))

# InChI-key -> InChI lookup, built once instead of scanning the whole list for
# every file. setdefault keeps the first record seen for a key.
inchi_by_key = {}
for record in inchi_with_key:
    if record[1] is not None:
        inchi_by_key.setdefault(record[1], record[0])

for myFile in all_my_h5_files:
    # The context manager closes the file even if writing the attribute fails.
    with h5py.File(myFile, 'r+') as f:
        # First top-level member name; None when the file has no members.
        tree_key = next(iter(f), None)
        matches = inchi_by_key.get(tree_key)
        if matches is not None and len(matches) > 0:
            f.attrs['inchi'] = matches
        else:
            # No InChI known for this tree: report the file instead of raising.
            print(myFile)

In [50]:
%%javascript
// Define NOTEBOOK_FULL_PATH in the Python kernel as the notebook's base_url
// followed by its notebook_path, by executing an assignment statement there.
var notebook = IPython.notebook;
var assignment = "NOTEBOOK_FULL_PATH = '" + notebook.base_url + notebook.notebook_path + "'";
notebook.kernel.execute(assignment);



In [51]:
# Copy this notebook into the web-served directory and print its nbviewer URL.
# NOTEBOOK_FULL_PATH is set in the kernel by the %%javascript cell above.
# shutil/os calls are used instead of shell commands so that file names with
# spaces or shell metacharacters are handled and failures raise an error.
www_dir = '/project/projectdirs/openmsi/www'
filename = os.path.basename(NOTEBOOK_FULL_PATH)
temp = '%s/%s'%(www_dir,filename)
# The source path is relative, so the notebook is looked up in the kernel's
# current working directory.
shutil.copy(filename, temp)
os.chmod(temp, 0o775)
print('http://nbviewer.ipython.org/url/portal.nersc.gov/project/openmsi/%s?flush_cache=true'%filename)


http://nbviewer.ipython.org/url/portal.nersc.gov/project/openmsi/Add_InChI_To_Pactolus_Trees.ipynb?flush_cache=true

In [ ]: