In [1]:
import pandas as pd
from goatools import obo_parser
# --- Data source locations ---------------------------------------------------
# Local files, resolved relative to the notebook's working directory.
oboUrl = './data/go.obo'                                # ontology file handed to the OBO parser
yeastAnnotationUrl = './data/gene_association.sgd.gz'   # gzip-compressed, tab-separated annotation table

# Remote resources.
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'
phenotypeUrl = 'http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'
In [2]:
import json
# Parse the .cyjs network file into `original`.  Later cells read
# original['elements']['nodes'] and original['elements']['edges'].
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale.
with open('data/full-go.cyjs', encoding='utf-8') as data_file:
    original = json.load(data_file)
In [29]:
# Show the first node and the first edge to reveal the element schema.
for element_group in ('nodes', 'edges'):
    print(original['elements'][element_group][0])
In [33]:
# Column headers for the annotation table live in a one-column text file.
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols.col_names.tolist()
print(col_names)

# The annotation file is gzip-compressed and tab-separated; lines starting
# with '!' are skipped as comments.
yeastAnnotation = pd.read_csv(
    yeastAnnotationUrl,
    sep='\t',
    comment='!',
    compression='gzip',
    names=col_names,
)

# Missing synonyms become empty strings so that string methods can be
# applied to every row later on.
yeastAnnotation['DB_Object_Synonym'] = yeastAnnotation['DB_Object_Synonym'].fillna('')
yeastAnnotation.head()
Out[33]:
In [11]:
## Load gene count
# Tab-separated file without a header row: one term id and its gene count per line.
df_term_size = pd.read_csv(
    './data/collapsed_go.no_IGI.propagated.term_sizes',
    sep='\t',
    names=['term_id', 'geneCount'],
)
df_term_size.head()
Out[11]:
In [13]:
# Lookup table: term id -> gene count, stored as a plain Python int.
go_map = {
    term_id: int(gene_count)
    for term_id, gene_count in zip(df_term_size['term_id'], df_term_size['geneCount'])
}
In [40]:
# Lookup table keyed by the first '|'-separated entry of the 11th annotation
# column, valued by the 3rd column.  Columns are addressed by position
# (0-based 10 and 2), which corresponds to tuple slots 11 and 3 of
# itertuples(), where slot 0 is the row index.
# NOTE(review): presumably these are DB_Object_Synonym and DB_Object_Symbol;
# confirm against annotation_columns.txt.
gene_map = {
    key_field.split('|')[0]: value_field
    for key_field, value_field in zip(yeastAnnotation.iloc[:, 10], yeastAnnotation.iloc[:, 2])
}
In [20]:
# Parse the ontology file into `obo`.  The node-rebuilding cell tests GO ids
# for membership in it, indexes it by GO id, and reads .name / .namespace
# from the returned term objects.
obo = obo_parser.GODag(oboUrl)
In [47]:
# Rebuild the node list of the original network with display attributes.
#
#   * ids starting with 'GO' that the ontology knows get geneCount, the
#     term's name and its namespace;
#   * every other id is looked up in gene_map and falls back to the raw id.
#
# The fallback also covers ids that start with 'GO' but are absent from the
# ontology (for instance 'GO:00SUPER', which later cells use as the path
# target).  Previously such nodes matched neither branch and were emitted
# without any 'name' attribute.
POSITION_SCALE = 10  # original coordinates are multiplied by this factor

full_go_w_genes = {}
new_nodes = []
new_edges = []

for node in original['elements']['nodes']:
    node_id = node['data']['name']
    attrs = {'id': node_id}

    if node_id.startswith('GO') and node_id in obo:
        # Ontology term.  The gene-count lookup is intentionally strict, as
        # before: a term without a recorded size raises KeyError.
        attrs['geneCount'] = go_map[node_id]
        go = obo[node_id]
        attrs['name'] = go.name
        attrs['namespace'] = go.namespace
    else:
        if node_id.startswith('GO') and node_id in go_map:
            # GO-prefixed id unknown to the ontology: keep its size if recorded.
            attrs['geneCount'] = go_map[node_id]
        attrs['name'] = gene_map.get(node_id, node_id)

    original_pos = node['position']
    new_nodes.append({
        'data': attrs,
        'position': {
            'x': original_pos['x'] * POSITION_SCALE,
            'y': original_pos['y'] * POSITION_SCALE,
        },
    })

# Spot-check two converted nodes; indices beyond the end are skipped instead
# of raising IndexError on smaller networks.
for sample_index in (9000, 9):
    if sample_index < len(new_nodes):
        print(new_nodes[sample_index])
In [48]:
# Hierarchy table: tab-separated, no header row, four columns.
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(
    treeSourceUrl,
    sep='\t',
    header=None,
    names=treeColNames,
)
tree.tail(10)
Out[48]:
In [ ]:
In [148]:
import networkx as nx
# Empty directed graph plus the raw material for it: every id that appears
# in the tree, and the term-to-term links oriented child -> parent.
G = nx.DiGraph()
node_set = set()
edges = []
for parent_id, child_id in zip(tree['parent'], tree['child']):
    node_set.update((parent_id, child_id))
    # Only links where both ends contain 'GO:' become graph edges.
    if "GO:" in parent_id and "GO:" in child_id:
        edges.append((child_id, parent_id))
In [149]:
# Register only the ids containing 'GO:' as graph nodes, then show how many
# edges were collected.
G.add_nodes_from(term_id for term_id in node_set if "GO:" in term_id)
len(edges)
Out[149]:
In [150]:
# Insert all collected (child, parent) pairs as directed edges in one call.
G.add_edges_from(edges)
In [151]:
# Report the graph size.  nx.info() was removed in NetworkX 3.0, so the
# counts are printed directly.
print('Nodes: {}, Edges: {}'.format(G.number_of_nodes(), G.number_of_edges()))

# Confirm that the 'GO:00SUPER' node exists (membership test instead of a
# scan over every node).
if 'GO:00SUPER' in G:
    print('GO:00SUPER')

# Attribute dict of that node.  Graph.node was removed in NetworkX 2.4;
# Graph.nodes[...] is the replacement (requires NetworkX >= 2.0).
root = G.nodes['GO:00SUPER']
In [153]:
# Collect every simple path from one term up to 'GO:00SUPER' and merge the
# paths into a single directed subgraph.
paths = nx.all_simple_paths(G, source='GO:0098799', target='GO:00SUPER')
sg = nx.DiGraph()
ns = set()
for p in paths:
    # Walk consecutive pairs of the path; add_edge() creates missing
    # endpoints itself, so no separate add_node() bookkeeping is needed.
    for s, t in zip(p, p[1:]):
        ns.update((s, t))
        sg.add_edge(s, t)

print(ns)
# nx.info() was removed in NetworkX 3.0; print the counts directly.
print('Nodes: {}, Edges: {}'.format(sg.number_of_nodes(), sg.number_of_edges()))
nx.write_graphml(sg, "test.graphml")
In [102]:
import igraph as ig
# Same term graph in igraph: a directed graph whose vertices are named after
# the ids containing 'GO:'.  Passing a list of strings to add_vertices()
# creates one vertex per entry and stores the string as its 'name'.
g = ig.Graph(directed=True)
g.add_vertices([term_id for term_id in node_set if "GO:" in term_id])
g.summary()
Out[102]:
In [103]:
# Add the collected (child, parent) pairs to the igraph graph in one call.
# The endpoints are id strings, matching the vertex names set when the
# vertices were created.
g.add_edges(edges)
In [104]:
# A bare expression in the middle of a cell is evaluated and discarded, so
# the summary has to be printed explicitly to show up.
print(g.summary())

# List every collected edge that touches GO:0098798 on either end.
for e in edges:
    if 'GO:0098798' in e:
        print(e)
In [117]:
# Vertex-index paths from GO:0098798 to 'GO:00SUPER', following edge
# direction.  Graph.vertex_disjoint_paths() is an alias of
# vertex_connectivity() and returns a number rather than the paths, which
# the path-walking cell below cannot iterate.  get_all_simple_paths()
# returns a list of vertex-index lists and mirrors nx.all_simple_paths()
# used for the networkx graph.
source_idx = g.vs.find('GO:0098798').index
target_idx = g.vs.find('GO:00SUPER').index
paths1 = g.get_all_simple_paths(source_idx, to=target_idx)
In [118]:
# Debug output: the name of vertex 2787, then the computed paths.
# NOTE(review): 2787 is a hard-coded vertex index.  Vertices were created by
# iterating a set of strings, so the vertex behind this index may differ
# between kernel sessions - confirm what this was meant to inspect.
print(g.vs[2787]['name'], paths1, sep='\n')
In [108]:
# Merge the vertex-index paths in `paths1` into one named subgraph.
# The subgraph is directed, like `g` and like the networkx equivalent `sg`;
# an undirected graph would drop the child -> parent orientation.
subg = ig.Graph(directed=True)
n_set = set()
seen_edges = set()  # igraph permits parallel edges, so shared path segments are added once
for p in paths1:
    # Walk consecutive vertex-index pairs of the path.
    for v, w in zip(p, p[1:]):
        s = g.vs[v]['name']
        t = g.vs[w]['name']
        print(s + ' --> ' + t)
        for vertex_name in (s, t):
            if vertex_name not in n_set:
                subg.add_vertex(vertex_name)
                n_set.add(vertex_name)
        if (s, t) not in seen_edges:
            subg.add_edge(source=s, target=t)
            seen_edges.add((s, t))
    print('-----')

# Printed explicitly: a bare expression followed by another statement would
# be evaluated and discarded.
print(subg.summary())
print(n_set)
In [101]:
# Write the igraph subgraph to sub.gml in GML format (relative to the
# current working directory).
subg.save("sub.gml", format="gml")