Example from justin: http://classyfire.wishartlab.com/entities/HNDVDQJCIGZPNO-YFKPBYRVSA-N (histidine)
Fetch the result of the above query in JSON format.
In [2]:
import urllib2
import json
import jsonpickle
def get_json(url):
    """Fetch ``url`` and return its body decoded as JSON.

    Args:
        url: Address of a resource that serves a JSON document.

    Returns:
        The value produced by ``json.load`` for the response body.

    Raises:
        urllib2.URLError: If the request cannot be completed.
        ValueError: If the response body is not valid JSON.
    """
    response = urllib2.urlopen(url)
    try:
        return json.load(response)
    finally:
        # Release the connection even when decoding fails. urllib2 responses
        # are not context managers in Python 2, so close explicitly.
        response.close()
Let's see what info we get back from ClassyFire.
In [3]:
# Fetch the ClassyFire record for the example InChIKey and show each
# top-level key of the decoded JSON, one per line.
url = 'http://classyfire.wishartlab.com/entities/HNDVDQJCIGZPNO-YFKPBYRVSA-N.json'
data = get_json(url)
for key in data:
    print(key)
We need the kingdom, superclass, class, subclass, intermediate_nodes and direct_parent fields to construct the taxonomy path of this document (InChIKey).
Wrap this nicely as a function. We pass in the inchi key and get back the taxonomy.
In [30]:
def get_taxa_path(inchikey):
    """Return the ClassyFire taxonomy path for a compound.

    The path is read from the entity record served at
    ``http://classyfire.wishartlab.com/entities/<inchikey>.json`` and lists,
    in this order: kingdom, superclass, class, subclass, every intermediate
    node, and finally the direct parent.

    Args:
        inchikey: InChIKey of the compound to look up.

    Returns:
        A list of taxon names. Levels that are missing or null in the record
        are skipped, so the list may be shorter than the full hierarchy.
    """
    url = 'http://classyfire.wishartlab.com/entities/%s.json' % inchikey
    # Reuse the shared fetch helper instead of repeating urlopen/json.load.
    data = get_json(url)

    # store the taxonomy path for this inchikey here
    taxa_path = []

    # add the top-4 taxa
    for key in ('kingdom', 'superclass', 'class', 'subclass'):
        node = data.get(key)
        if node is not None:
            taxa_path.append(node['name'])

    # add all the intermediate taxa > level 4 but above the direct parent
    # (a null or missing list is treated as empty)
    for entry in data.get('intermediate_nodes') or []:
        taxa_path.append(entry['name'])

    # add the direct parent, with the same null guard as the top-4 taxa
    direct_parent = data.get('direct_parent')
    if direct_parent is not None:
        taxa_path.append(direct_parent['name'])

    return taxa_path
# Try the function on the example InChIKey: one taxon per line, in path order.
inchikey = 'HNDVDQJCIGZPNO-YFKPBYRVSA-N'
tp = get_taxa_path(inchikey)
print('\n'.join(tp))
A method to extract the substituents from a query
In [43]:
def get_substituents(inchikey):
    """Return the substituents ClassyFire reports for a compound.

    Args:
        inchikey: InChIKey of the compound to look up.

    Returns:
        The value stored under ``'substituents'`` in the entity record, or
        ``None`` when the record has no such key.
    """
    url = 'http://classyfire.wishartlab.com/entities/%s.json' % inchikey
    # Reuse the shared fetch helper instead of repeating urlopen/json.load.
    data = get_json(url)
    return data.get('substituents')
Now try with some Mass2Motifs from MassBank. First get all the docs above the default doc-topic threshold (0.05). Retrieve the metadata (InChIKey) of each doc and pass it to ClassyFire.
In [31]:
def print_m2m_taxonomy(m2m_id):
    """Print the taxonomy path of each parent document of a Mass2Motif.

    For every metadata record the ms2lda server returns for ``m2m_id``,
    prints the annotation and InChIKey on one line, then one ``- <taxon>``
    line per entry of the taxonomy path, then a blank line.
    """
    host = 'www.ms2lda.org'
    endpoint = 'http://%s/basicviz/get_parents_metadata/%d' % (host, m2m_id)
    for encoded in get_json(endpoint):
        record = jsonpickle.decode(encoded)
        key = record['InChIKey']
        print('%s %s' % (record['annotation'], key))
        for name in get_taxa_path(key):
            print('- %s' % name)
        # blank separator line between documents
        print('')
Print a list of substituents from all of the molecules, ranked by how often they appear
In [24]:
def get_all_substituents(m2m_id):
    """Print the substituents of a Mass2Motif's parent molecules by frequency.

    Fetches the parent-document metadata for ``m2m_id`` from the ms2lda
    server, looks up the substituents of each distinct InChIKey, and prints
    one line per substituent, most frequent first, in the form
    ``<substituent>,<count> (/<number of distinct InChIKeys>)``.

    Args:
        m2m_id: Integer id of the Mass2Motif on the ms2lda server.
    """
    server = 'www.ms2lda.org'
    url = 'http://%s/basicviz/get_parents_metadata/%d' % (server, m2m_id)
    data = get_json(url)

    substituents = {}
    for metadata_str in data:
        doc = jsonpickle.decode(metadata_str)
        inchikey = doc['InChIKey']
        # The dict is keyed by InChIKey, so a repeated key would only
        # overwrite its own entry; skip the redundant network request.
        if inchikey not in substituents:
            substituents[inchikey] = get_substituents(inchikey)

    substituent_counts = {}
    for terms in substituents.values():
        # get_substituents returns None when the record has no
        # 'substituents' entry; treat that as "no terms" instead of crashing.
        for ss in terms or ():
            substituent_counts[ss] = substituent_counts.get(ss, 0) + 1

    ss_c = sorted(substituent_counts.items(), key=lambda x: x[1], reverse=True)
    for ss, count in ss_c:
        print("{},{} (/{})".format(ss, count, len(substituents)))
In [5]:
# Taxonomy paths for the parent documents of Mass2Motif 1083.
print_m2m_taxonomy(1083)
In [27]:
# Substituent frequencies for the parent documents of Mass2Motif 1083.
get_all_substituents(1083)
In [6]:
# Taxonomy paths for the parent documents of Mass2Motif 1367.
print_m2m_taxonomy(1367)
In [26]:
# Substituent frequencies for the parent documents of Mass2Motif 1367.
get_all_substituents(1367)
In [32]:
# Taxonomy paths for the parent documents of Mass2Motif 1430.
print_m2m_taxonomy(1430)
In [33]:
# Substituent frequencies for the parent documents of Mass2Motif 1430.
get_all_substituents(1430)
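It would be useful to tally the substituents over every molecule in the MassBank experiment. First, collect the InChIKeys of all of its documents: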
In [40]:
# Collect the InChIKey from the metadata of every parent document in the
# experiment.
server = 'www.ms2lda.org'
exp_id = 3  # experiment id of massbank
url = 'http://%s/basicviz/get_all_parents_metadata/%d' % (server, exp_id)
data = get_json(url)
inchikeys = [
    jsonpickle.decode(metadata_str)['InChIKey']
    for metadata_str in data
]
In [45]:
# Look up the substituents of every InChIKey, printing the running count
# after every tenth key as a progress indicator.
substituents = {}
n_done = 0
for inchikey in inchikeys:
    try:
        substituents[inchikey] = get_substituents(inchikey)
    except Exception:
        # Best effort: report the key and carry on with the rest. Catching
        # Exception rather than using a bare ``except:`` lets
        # KeyboardInterrupt through, so the loop can still be stopped.
        print("Failed on {}".format(inchikey))
    n_done += 1
    if n_done % 10 == 0:
        print(n_done)
In [49]:
import pickle
# Save the substituent lookups to disk. Pickle output is a byte stream, so
# the file is opened in binary mode.
with open('massbank_substituents.dict', 'wb') as f:
    pickle.dump(substituents, f)
Tally the individual terms
In [47]:
# Count how many times each substituent term occurs across all InChIKeys.
tally = {}
for inchikey in substituents:
    terms = substituents[inchikey]
    # A key whose record had no 'substituents' entry is stored as None.
    if terms is not None:
        for ss in terms:
            tally[ss] = tally.get(ss, 0) + 1
# (term, count) pairs, most frequent first.
ss_c = sorted(tally.items(), key=lambda x: x[1], reverse=True)
In [48]:
# Show the first 100 (term, count) pairs of the ranked list.
for ss, c in ss_c[:100]:
    print('%s %s' % (ss, c))
In [51]:
import plotly as plotly
from plotly.graph_objs import *
# Initialise plotly's offline mode so figures render inline in the notebook.
plotly.offline.init_notebook_mode()
Make a bar plot of the prevalence of different terms
In [66]:
# Bar chart of term prevalence: for each substituent term, the percentage of
# distinct InChIKeys it was counted for, on a log-scaled y axis.
data = []
# zip(*ss_c) yields nothing to unpack when there are no terms at all.
if ss_c:
    ss, c = zip(*ss_c)
else:
    ss, c = (), ()
x = ss
# The tally is built from a dict keyed by InChIKey, so each key contributes
# once; the denominator must therefore count distinct keys, not documents.
n_inchi_keys = len(set(inchikeys))
y = [100.0 * float(count) / float(n_inchi_keys) for count in c]
data.append(
    Bar(
        x=x,
        y=y,
    )
)
layout = Layout(
    xaxis=dict(
        title='substituent term',
    ),
    yaxis=dict(
        title='percentage of inchi keys',
        type='log',
    ),
)
plotly.offline.iplot({'data': data, 'layout': layout})
print("There are {} unique terms in this dataset".format(len(x)))
In [ ]: