In [23]:
import os
import pandas as pd
import requests
import json
import multiprocessing as mp
from random import randint
from time import sleep
import sys
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from rdkit import Chem
In [24]:
df = pd.read_pickle('/project/projectdirs/openmsi/projects/ben_run_pactolus/unique_compounds.pkl')
#df = pd.read_csv('/project/projectdirs/openmsi/projects/compound_data/jgi_molecules/new_jgi_compounds.csv')
# df.rename(columns = {'monoisotopoic_mw':'monoisotopic_mw'},inplace=True)
df = df.convert_objects(convert_numeric=True)
print df.keys()
df.sort_values('mono_isotopic_molecular_weight',inplace=True)
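DataFrame.convert_objects was deprecated in pandas 0.17 and later removed, so the cell above only works against an older pandas (the Python 2.7 metatlas install on the path above). A rough equivalent for newer pandas (a sketch, not run here) coerces each column with pd.to_numeric:
In [ ]:
# sketch for newer pandas where convert_objects no longer exists
df = df.apply(pd.to_numeric, errors='ignore')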
In [25]:
df.head(10)
Out[25]:
In [47]:
inchis = df[(~ df.inchi.str.contains('\.')) & (df.formula.str.contains('C')) & (df.mono_isotopic_molecular_weight < 1200) & (~ pd.isnull(df.inchi))].inchi
inchis = inchis[140000:]
len(inchis)
Out[47]:
In [43]:
N = 400
chunks = [inchis[x:x+N] for x in xrange(0, len(inchis), N)]
len(chunks)
Out[43]:
In [49]:
print df.shape, len(inchis)
%matplotlib notebook
from matplotlib import pyplot as plt
a = df.hist(column=u'mono_isotopic_molecular_weight',bins=50,range=(0,1200))
# plt.xlim(0,1200)
In [ ]:
import glob
out_files = glob.glob('/scratch2/scratchdirs/bpb/logs/*_out.txt')
len(out_files)
incomplete_files = []
for i,f in enumerate(out_files):
    with open(f,'r') as fid:
        # flag log files that look truncated (fewer than 3000 lines)
        if len(fid.readlines()) < 3000:
            incomplete_files.append(f)
# sorted(incomplete_files)
In [46]:
import glob
files = sorted(glob.glob('/global/homes/b/bpb/inchi_metatlas/*.txt'))
# print len(files),files
script_names = []
for f in files:
    # use make_pactolus_trees_script.sh as a template: swap in this chunk's
    # inchi file, job name, walltime, and queue
    with open('make_pactolus_trees_script.sh','r') as fid:
        a_new_line = fid.read().replace('/scratch2/scratchdirs/bpb/inchi/inchis_3.txt',f).replace('job_pactolus_realtime1',os.path.basename(f).split('.')[0]).replace('time=','time=3').replace('realtime','regular')
    out_file = os.path.join('/global/homes/b/bpb/pactolus_scripts/',os.path.basename(f).split('.')[0] + '.sbatch')
    script_names.append(out_file)
    with open(out_file,'w') as out_fid:
        out_fid.write(a_new_line)
# one driver script that sbatch-submits every generated job script
with open('/global/homes/b/bpb/pactolus_scripts/level_3_run_all_scripts.sh','w') as fid:
    for i,s in enumerate(script_names):
        # if (i > 149):# & (i < 150):
        fid.write('sbatch %s\n'%s)
In [ ]:
# import glob
# files = sorted(glob.glob('/global/homes/b/bpb/inchi_metatlas/*.txt'))
# # print len(files),files
# script_names = []
# for f in files:
#     with open('make_pactolus_trees_script.sh','r') as fid:
#         a_new_line = fid.read().replace('/scratch2/scratchdirs/bpb/inchi/inchis_3.txt',f).replace('job_pactolus_realtime1',os.path.basename(f).split('.')[0]).replace('realtime','shared').replace('--time=4','--time=36')
#     out_file = os.path.join('/global/homes/b/bpb/pactolus_scripts/',os.path.basename(f).split('.')[0] + '_shared_queue.sbatch')
#     script_names.append(out_file)
#     with open(out_file,'w') as out_fid:
#         out_fid.write(a_new_line)
# with open('/global/homes/b/bpb/pactolus_scripts/run_all_scripts_shared_queue.sh','w') as fid:
#     for s in script_names[650:]:
#         fid.write('sbatch %s\n'%s)
In [41]:
%system cat /global/homes/b/bpb/pactolus_scripts/inchis_000.sbatch
Out[41]:
In [ ]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
mols = [Chem.MolFromInchi(m) for m in inchis[-9:]]
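Chem.MolFromInchi returns None for any InChI that RDKit cannot parse, and Draw.MolsToGridImage in the next cell may fail on None entries. A defensive filter (a sketch, assuming the mols list built above):
In [ ]:
# drop InChIs that RDKit failed to parse before drawing the grid
mols = [m for m in mols if m is not None]
len(mols)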
In [ ]:
images = Draw.MolsToGridImage(mols,molsPerRow=3, useSVG=True)
# Draw.MolsToGridImage([f['frag_mol_h'] for f in f_tree_2.fragment_list],molsPerRow=7,useSVG=True)
In [ ]:
baumin_inchi = df[(df.metatlas_inchi_key.str.contains('JZRMMLYGOBWIGY')) & (~ pd.isnull(df.metatlas_inchi))].metatlas_inchi
with open('/global/homes/b/bpb/inchi/baumin_inchi.txt','w') as fid:
    for my_inchi in baumin_inchi:
        m = Chem.MolFromInchi(my_inchi)
        if m:
            fid.write('%s\n'%my_inchi)
baumin_inchi
In [ ]:
images.data
In [ ]:
# cell_out = _97
with open('9 molecules.svg','w') as fid:
    fid.write(str(images.data.replace('font-family:sans-serif;','')))
In [ ]:
from IPython.display import SVG, display
SVG(data=_97)
In [ ]:
images.save('9 big molecules.png')
In [ ]:
%system mkdir ~/inchi_metatlas
In [ ]:
# with open('/global/homes/b/bpb/inchi/inchis_all.txt','w') as fid:
#     for my_inchi in inchis:
#         m = Chem.MolFromInchi(my_inchi)
#         if m:
#             fid.write('%s\n'%my_inchi)
In [44]:
%system rm /global/homes/b/bpb/inchi_metatlas/*.txt
for i,inchi_list in enumerate(chunks):
    with open('/global/homes/b/bpb/inchi_metatlas/inchis_%03d.txt'%i,'w') as fid:
        for my_inchi in inchi_list:
            fid.write('%s\n'%my_inchi)
In [ ]:
import glob
# out_dir and inchi_keys are assumed to be defined earlier in the notebook;
# drop any InChIKeys that already have a cached PubChem result on disk
files = glob.glob(os.path.join(out_dir,'*.json'))
done_inchi_key = []
for f in files:
    done_inchi_key.append(os.path.basename(f).split('.')[0])
inchi_keys = list(set(inchi_keys) - set(done_inchi_key))
In [ ]:
len(inchi_keys)
In [ ]:
def write_pubchem_info_to_file(inchi_key):
    # fetch the PubChem synonyms for this InChIKey and cache the JSON
    # response in out_dir; keys that already have a file are skipped
    suffix = '.json'
    fname = os.path.join(out_dir, inchi_key + suffix)
    if not os.path.isfile(fname):
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/%s/synonyms/json'%inchi_key
        response = requests.get(url)
        try:
            with open(fname, 'w') as fid:
                json.dump(response.json(), fid)
        except:
            print "could not query", inchi_key
In [ ]:
for ik in inchi_keys[1:]:
    print ik
    write_pubchem_info_to_file(ik)
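The sleep and randint imports at the top of the notebook suggest throttling these queries; PubChem's PUG REST service rejects clients that submit requests too quickly. A throttled variant of the loop above (a sketch, not run here):
In [ ]:
# same loop as above, but pause 1-3 s between PubChem queries to stay
# under the service's request-rate limits
for ik in inchi_keys[1:]:
    print ik
    write_pubchem_info_to_file(ik)
    sleep(randint(1, 3))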
In [ ]:
# pool = mp.Pool(processes=20)
# pool.map(write_pubchem_info_to_file, inchi_keys[1:100])
In [ ]:
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/%s/synonyms/json'%'CKLJMWTZIZZHCS-REOHCLBHSA-N'
response = requests.get(url)
response.json()
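For reference, a hedged way to pull the synonym list out of the response above (key names follow PubChem's documented JSON layout for the synonyms endpoint; verify against a real response):
In [ ]:
# hypothetical extraction of the synonym list from the PUG REST response
synonyms = response.json()['InformationList']['Information'][0]['Synonym']
synonyms[:10]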
In [ ]:
response.json()
In [ ]:
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/json'%'Baumin'
response = requests.get(url)
response.json()
In [ ]: