In [3]:
import pandas as pd
import json
# Read the tab-separated yeast ID mapping table and preview its first rows.
idmap = pd.read_csv('./yeast_clean4.txt', sep='\t')
idmap.head()
Out[3]:
In [4]:
# Lookup from ORF id (second column of idmap) to its symbol value (first column).
# itertuples() puts the index at position 0, so the columns start at position 1.
# When an ORF id appears more than once, the last row wins.
orf2symbol = {row[2]: row[1] for row in idmap.itertuples()}
In [5]:
# Remote tab-separated file holding the full GO DAG as one relationship per row.
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
# The file is read without a header row; these names are assigned to its columns.
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, sep='\t', names=treeColNames)
tree.tail()
Out[5]:
In [6]:
# Split the rows of `tree` into two structures:
#   term2genes -- parent id -> list of child ids, for rows whose type is 'gene'
#   dag        -- every other row, kept as (parent, child, in_tree)
term2genes = {}
filtered = []
for parent, child, row_type, in_tree in tree.itertuples(index=False, name=None):
    if row_type == 'gene':
        # Gene rows are collected per parent term and kept out of the DAG.
        term2genes.setdefault(parent, []).append(child)
    else:
        filtered.append((parent, child, in_tree))
dag = pd.DataFrame(filtered, columns=['parent', 'child', 'in_tree'])
# Spot check: genes attached directly to one known term.
print(term2genes['GO:0070822'])
dag.head()
Out[6]:
In [7]:
import networkx as nx
# Build a directed graph from `dag` with one edge per row, pointing child -> parent
# and carrying the row's `in_tree` value as an edge attribute.
# networkx 2.x renamed `from_pandas_dataframe` to `from_pandas_edgelist`; use
# whichever this installation provides so the cell runs on both.
_edges_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
godag = _edges_to_graph(
    dag,
    source='child',
    target='parent',
    edge_attr=['in_tree'],
    create_using=nx.DiGraph(),
)
# In networkx 2.x `edges()` returns a view that cannot be indexed, so peek at the
# first edge by iteration (None when the graph has no edges).
next(iter(godag.edges(data=True)), None)
godag.get_edge_data('GO:0019203', 'GO:0016791')
Out[7]:
In [8]:
# Empty network skeleton: a name under 'data' plus empty node and edge lists
# under 'elements'.
subnet = {
    'data': {'name': 'subnet1'},
    'elements': {'nodes': [], 'edges': []},
}
def get_node(id):
    """Return the node element dict for a term or gene id.

    The display name and type are chosen as follows:

    * ``'GO:00SUPER'`` is named "Root" and typed ``'term'``.
    * ids starting with ``'Y'`` or ``'Q'`` are typed ``'gene'``; the name is the
      part of ``orf2symbol[id]`` before the first ``';'``, or the id itself when
      the id is not in ``orf2symbol``.
    * any other id is typed ``'term'`` and named ``obo[id].name``.

    Parameters
    ----------
    id : str
        Node identifier.

    Returns
    -------
    dict
        ``{'data': {'id': ..., 'name': ..., 'type': ...}}``
    """
    if id == 'GO:00SUPER':
        label, kind = "Root", 'term'
    elif id.startswith(('Y', 'Q')):
        kind = 'gene'
        # Only the first ';'-separated entry of the symbol string is shown.
        label = orf2symbol[id].split(';')[0] if id in orf2symbol else id
    else:
        kind = 'term'
        label = obo[id].name

    return {'data': {'id': id, 'name': label, 'type': kind}}
def get_edge(source, target, in_tree):
    """Return the edge element dict linking ``source`` to ``target``.

    Parameters
    ----------
    source, target : str
        Ids of the two end nodes.
    in_tree : object
        Value stored unchanged under the edge's ``'in_tree'`` key.

    Returns
    -------
    dict
        ``{'data': {'source': ..., 'target': ..., 'in_tree': ...}}``
    """
    return {'data': {'source': source, 'target': target, 'in_tree': in_tree}}
In [9]:
from goatools import obo_parser
# Path to the local Gene Ontology OBO file.
oboUrl = './data/go.obo'
# Parse the ontology; get_node() reads obo[term_id].name from the result.
# NOTE(review): optional_attrs=['def'] presumably also loads each term's
# definition text -- confirm against the goatools documentation.
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])
In [12]:
def create_dag(paths, start_terms, goal):
    """Build a Cytoscape.js-style network dict from term paths plus gene annotations.

    Parameters
    ----------
    paths : iterable of list of str
        Node-id paths. Each pair of consecutive ids in a path becomes one
        directed edge; the pair must be an edge of the module-level ``godag``
        because its ``'in_tree'`` attribute is copied onto the output edge.
    start_terms : str or iterable of str
        Term id(s) whose genes (looked up in ``term2genes``) are attached as
        gene nodes, each with a gene -> term edge whose ``in_tree`` is
        ``'gene'``. A single id may be passed as a bare string.
    goal : str
        Used only to label the network (``'Path to <goal>'``).

    Returns
    -------
    dict
        ``{'data': {'name': ...}, 'elements': {'nodes': [...], 'edges': [...]}}``
        with every node id and every (source, target) edge emitted at most once.
    """
    # A bare string would otherwise be iterated character by character and
    # fail on the first term2genes lookup.
    if isinstance(start_terms, str):
        start_terms = [start_terms]

    nodes = set()
    seen_edges = set()
    edges = []

    # Edges between consecutive ids along every path; paths may overlap, so
    # each (source, target) pair is emitted only once.
    for path in paths:
        for source, target in zip(path, path[1:]):
            nodes.add(source)
            nodes.add(target)
            key = (source, target)
            if key in seen_edges:
                continue
            seen_edges.add(key)
            in_tree = godag.get_edge_data(source, target)['in_tree']
            edges.append(get_edge(source, target, in_tree))

    # Gene -> term edges for each start term. A gene shared by several terms
    # gets one edge per term but only one node.
    gene_ids = []
    seen_genes = set()
    for start in start_terms:
        # A term without an entry in term2genes simply contributes no genes.
        genes = term2genes.get(start, [])
        if genes:
            # Make sure the edge target exists even if no path mentions it.
            nodes.add(start)
        for gene in genes:
            key = (gene, start)
            if key not in seen_edges:
                seen_edges.add(key)
                edges.append(get_edge(gene, start, 'gene'))
            if gene not in seen_genes:
                seen_genes.add(gene)
                gene_ids.append(gene)

    # Gene nodes first, then the path nodes (skipping any id already emitted).
    cynodes = [get_node(gene) for gene in gene_ids]
    cynodes.extend(get_node(n) for n in nodes if n not in seen_genes)

    print(nodes)
    return {
        'data': {
            'name': 'Path to ' + goal
        },
        'elements': {
            'nodes': cynodes,
            'edges': edges
        }
    }
In [10]:
# For each term, export every simple path from that term up to the root as a
# .cyjs file under ./data/, named after the term id with the ':' removed.
goal = 'GO:00SUPER'
terms = ['GO:0070822', 'GO:0000812', 'GO:0070143']
for term in terms:
    paths = nx.shortest_simple_paths(godag, source=term, target=goal)
    # create_dag expects a collection of start terms, so wrap the single id.
    # Build the network before opening the file so that a failure does not
    # leave an empty, truncated output file behind.
    network = create_dag(paths, [term], goal)
    with open('./data/' + term.replace(':', '') + '.cyjs', 'w') as outfile:
        json.dump(network, outfile)
For the first "And example", the parent term is: nuclear chromatin (GO:0000790).
The two children are: Sin3-type complex (GO:0070822) and Swr1 complex (GO:0000812).
To verify in Mike's file that GO:0070822 and GO:0000812 are children of GO:0000790, run the command `grep GO:0000790 precollapse.propagated.2016`.
For the second "Or example", the parent term is: mitochondrial respiratory chain (GO:0005746).
The two children are: mitochondrial respiratory chain complex III (GO:0005750) and mitochondrial respiratory chain complex IV (GO:0005751).
For the third example, "GFP-GI", the term is: response to endoplasmic reticulum stress (GO:0034976).
In [13]:
# Test sub-routes: export the paths from two child terms up to their shared
# parent term (instead of all the way to the root) as a single network.
goal1 = 'GO:0000790'
goal2 = 'GO:0005746'
terms1 = ['GO:0070822', 'GO:0000812']
terms2 = ['GO:0005750', 'GO:0005751']
# NOTE(review): goal2 / terms2 are defined here but nothing in this cell
# exports them -- confirm whether the second example should be written too.

# Collect every simple path from each child term to the parent term.
all_paths = []
for term in terms1:
    all_paths.extend(nx.shortest_simple_paths(godag, source=term, target=goal1))

# Build the network before opening the file so that a failure does not leave
# an empty, truncated output file behind.
network = create_dag(all_paths, terms1, 'N/A')
with open('./data/' + goal1.replace(':', '') + '.cyjs', 'w') as outfile:
    json.dump(network, outfile)
In [16]:
# List every shortest path from the parent term up to the root.
# `paths` is a generator, so it is empty once it has been printed.
paths = nx.all_shortest_paths(godag, source='GO:0000790', target='GO:00SUPER')
print([*paths])