In [23]:
    
import os
import pandas as pd
import requests
import json
import multiprocessing as mp
from random import randint
from time import sleep
import sys
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from rdkit import Chem
    
In [24]:
    
df = pd.read_pickle('/project/projectdirs/openmsi/projects/ben_run_pactolus/unique_compounds.pkl')
#df = pd.read_csv('/project/projectdirs/openmsi/projects/compound_data/jgi_molecules/new_jgi_compounds.csv')
# df.rename(columns = {'monoisotopoic_mw':'monoisotopic_mw'},inplace=True)
# convert_objects is the legacy pandas API (deprecated in later releases; pd.to_numeric is the replacement)
df = df.convert_objects(convert_numeric=True)
print df.keys()
df.sort_values('mono_isotopic_molecular_weight',inplace=True)
    
    
    
In [25]:
    
df.head(10)
    
    Out[25]:
In [47]:
    
inchis = df[(~ df.inchi.str.contains(r'\.')) & (df.formula.str.contains('C')) & (df.mono_isotopic_molecular_weight < 1200) & (~ pd.isnull(df.inchi))].inchi
inchis = inchis[140000:]
len(inchis)
    
    Out[47]:
In [43]:
    
N = 400
chunks = [inchis[x:x+N] for x in xrange(0, len(inchis), N)]
len(chunks)
    
    Out[43]:
In [49]:
    
print df.shape, len(inchis)
%matplotlib notebook
from matplotlib import pyplot as plt
a = df.hist(column=u'mono_isotopic_molecular_weight',bins=50,range=(0,1200))
# plt.xlim(0,1200)
    
    
    
    
In [ ]:
    
import glob
out_files = glob.glob('/scratch2/scratchdirs/bpb/logs/*_out.txt')
len(out_files)
incomplete_files = []
for i,f in enumerate(out_files):
    with open(f,'r') as fid:
        if len(fid.readlines()) < 3000:
            incomplete_files.append(f)
# sorted(incomplete_files)
    
In [46]:
    
import glob
files = sorted(glob.glob('/global/homes/b/bpb/inchi_metatlas/*.txt'))
# print len(files),files
script_names = []
for f in files:
    with open('make_pactolus_trees_script.sh','r') as fid:
        a_new_line = (fid.read()
                      .replace('/scratch2/scratchdirs/bpb/inchi/inchis_3.txt', f)
                      .replace('job_pactolus_realtime1', os.path.basename(f).split('.')[0])
                      .replace('time=', 'time=3')
                      .replace('realtime', 'regular'))
    out_file = os.path.join('/global/homes/b/bpb/pactolus_scripts/',os.path.basename(f).split('.')[0] + '.sbatch')
    script_names.append(out_file)
    with open(out_file,'w') as out_fid:
        out_fid.write(a_new_line)
with open('/global/homes/b/bpb/pactolus_scripts/level_3_run_all_scripts.sh','w') as fid:
    for i,s in enumerate(script_names):
#         if (i > 149):# & (i < 150):
        fid.write('sbatch %s\n'%s)
    
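The driver script written above is just a list of `sbatch` lines, one per generated batch file. Assuming the notebook is running on a NERSC login node, it could be launched with the same `%system` magic used elsewhere in this notebook (hypothetical invocation, not part of the original run):

In [ ]:
    
%system bash /global/homes/b/bpb/pactolus_scripts/level_3_run_all_scripts.sh
    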
In [ ]:
    
# import glob as glob
# files = sorted(glob.glob('/global/homes/b/bpb/inchi_metatlas/*.txt'))
# # print len(files),files
# script_names = []
# for f in files:
#     with open('make_pactolus_trees_script.sh','r') as fid:
#         a_new_line = fid.read().replace('/scratch2/scratchdirs/bpb/inchi/inchis_3.txt',f).replace('job_pactolus_realtime1',os.path.basename(f).split('.')[0]).replace('realtime','shared').replace('--time=4','--time=36')
#     out_file = os.path.join('/global/homes/b/bpb/pactolus_scripts/',os.path.basename(f).split('.')[0] + '_shared_queue.sbatch')
#     script_names.append(out_file)
#     with open(out_file,'w') as out_fid:
#         out_fid.write(a_new_line)
# with open('/global/homes/b/bpb/pactolus_scripts/run_all_scripts_shared_queue.sh','w') as fid:
#     for s in script_names[650:]:
#         fid.write('sbatch %s\n'%s)
    
In [41]:
    
%system cat /global/homes/b/bpb/pactolus_scripts/inchis_000.sbatch
    
    Out[41]:
In [ ]:
    
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
mols = [Chem.MolFromInchi(m) for m in inchis[-9:]]
    
In [ ]:
    
images = Draw.MolsToGridImage(mols,molsPerRow=3, useSVG=True)
# Draw.MolsToGridImage([f['frag_mol_h'] for f in f_tree_2.fragment_list],molsPerRow=7,useSVG=True)
    
In [ ]:
    
baumin_inchi = df[(df.metatlas_inchi_key.str.contains('JZRMMLYGOBWIGY')) & (~ pd.isnull(df.metatlas_inchi))].metatlas_inchi
with open('/global/homes/b/bpb/inchi/baumin_inchi.txt','w') as fid:
    for my_inchi in baumin_inchi:
        m = Chem.MolFromInchi(my_inchi)
        if m:
            fid.write('%s\n'%my_inchi)
baumin_inchi
    
In [ ]:
    
images.data
    
In [ ]:
    
# cell_out = _97
with open('9 molecules.svg','w') as fid:
    fid.write(str(images.data.replace('font-family:sans-serif;','')))
    
In [ ]:
    
from IPython.display import SVG, display
SVG(data=_97)
    
In [ ]:
    
# note: with useSVG=True (above) the returned SVG object has no .save();
# write images.data to a file instead, as done for the .svg two cells up
images.save('9 big molecules.png')
    
In [ ]:
    
%system mkdir ~/inchi_metatlas
    
In [ ]:
    
# with open('/global/homes/b/bpb/inchi/inchis_all.txt','w') as fid:
#     for my_inchi in inchis:
#         m = Chem.MolFromInchi(my_inchi)
#         if m:
#             fid.write('%s\n'%my_inchi)
    
In [44]:
    
%system rm /global/homes/b/bpb/inchi_metatlas/*.txt
for i,inchi_list in enumerate(chunks):
    with open('/global/homes/b/bpb/inchi_metatlas/inchis_%03d.txt'%i,'w') as fid:
        for my_inchi in inchi_list:
            fid.write('%s\n'%my_inchi)
    
In [ ]:
    
import glob
# out_dir (the PubChem JSON cache directory) and inchi_keys are assumed to be
# defined in an earlier cell that is not shown here
files = glob.glob(os.path.join(out_dir,'*.json'))
done_inchi_key = []
for f in files:
    done_inchi_key.append(os.path.basename(f).split('.')[0])
inchi_keys = list(set(inchi_keys) - set(done_inchi_key))
    
In [ ]:
    
len(inchi_keys)
    
In [ ]:
    
def write_pubchem_info_to_file(inchi_key):
    """Fetch PubChem synonyms for one InChIKey and cache the JSON response in out_dir."""
    suffix = '.json'
    fname = os.path.join(out_dir, inchi_key + suffix)
    if not os.path.isfile(fname):
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/%s/synonyms/json'%inchi_key
        response = requests.get(url)
        try:
            with open(fname, 'w') as fid:
                json.dump(response.json(), fid)
        except:
            # response was not valid JSON (e.g. PubChem returned an error for this key)
            print "could not query", inchi_key
    
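`sleep` and `randint` are imported in the first cell but never used in the cells shown, so pacing the PubChem queries was presumably handled elsewhere. A minimal sketch of how the sequential loop below could be throttled (the `write_pubchem_info_politely` wrapper is hypothetical, not from the original notebook):

In [ ]:
    
# Hypothetical wrapper: pause 1-3 seconds between PubChem requests so the
# sequential loop stays well under PUG REST rate limits.
def write_pubchem_info_politely(inchi_key):
    write_pubchem_info_to_file(inchi_key)
    sleep(randint(1, 3))
    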
In [ ]:
    
for ik in inchi_keys[1:]:
    print ik
    write_pubchem_info_to_file(ik)
    
In [ ]:
    
# pool = mp.Pool(processes=20)
# pool.map(write_pubchem_info_to_file, inchi_keys[1:100])
    
In [ ]:
    
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/%s/synonyms/json'%'CKLJMWTZIZZHCS-REOHCLBHSA-N'
response = requests.get(url)
response.json()
    
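Assuming the request above succeeded and returned the standard PUG REST synonym layout, the names sit under `InformationList.Information[0].Synonym`; a quick sketch of pulling out the CID and the first few synonyms:

In [ ]:
    
# Assumes the standard PUG REST synonym response layout
info = response.json()['InformationList']['Information'][0]
print info['CID'], info['Synonym'][:5]
    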
In [ ]:
    
response.json()
    
In [ ]:
    
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/json'%'Baumin'
response = requests.get(url)
response.json()
    
In [ ]: