In [1]:
import pandas as pd
from goatools import obo_parser
# --- Data source locations -------------------------------------------------
# Note: despite the "Url" suffix, oboUrl and yeastAnnotationUrl are local paths.

# Ontology file handed to goatools' obo_parser.GODag.
# (This was previously assigned twice with the same value; the duplicate is removed.)
oboUrl = './data/go.obo'

# Tab-delimited tree table, read with columns parent / child / type / in_tree.
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# Gzip-compressed, tab-delimited annotation table ('!' lines are comments).
yeastAnnotationUrl = './data/gene_association.sgd.gz'

# The three locations below are not referenced by any cell visible in this
# notebook; they are kept so that any other code relying on them still works.
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'
phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'
In [2]:
import json
# Read the source network (a JSON document) into `original`; later cells
# access original['elements']['nodes'], original['elements']['edges'] and
# original['data'].
with open('data/full-go.cyjs') as cyjs_file:
    original = json.loads(cyjs_file.read())
In [3]:
# Show the first node and then the first edge of the loaded network so the
# structure of each element is visible.
for element_group in ('nodes', 'edges'):
    print(original['elements'][element_group][0])
In [4]:
# Column headers for the annotation table are kept in a separate
# one-name-per-line text file.
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

# Read the gzip-compressed, tab-separated annotation table ('!' starts a
# comment) and replace missing synonyms with an empty string.
yeastAnnotation = pd.read_csv(
    yeastAnnotationUrl,
    sep='\t',
    comment='!',
    compression='gzip',
    names=col_names,
).assign(
    DB_Object_Synonym=lambda frame: frame['DB_Object_Synonym'].fillna('')
)
yeastAnnotation.head()
Out[4]:
In [5]:
## Load gene count
# Two-column, tab-separated table: a term id and its gene count.
df_term_size = pd.read_csv(
    './data/collapsed_go.no_IGI.propagated.term_sizes',
    sep='\t',
    names=['term_id', 'geneCount'],
)
df_term_size.head()
Out[5]:
In [6]:
# Lookup table: term id -> gene count (as a plain int).
# If a term id occurs more than once, the last row wins.
go_map = {
    term_id: int(gene_count)
    for term_id, gene_count in zip(df_term_size['term_id'], df_term_size['geneCount'])
}
In [7]:
# Lookup table keyed by the first '|'-separated entry of the 11th annotation
# column, mapping to the value in the 3rd column. Columns are addressed by
# position because their names come from annotation_columns.txt.
# NOTE(review): presumably 11th = DB_Object_Synonym and 3rd = the gene
# symbol -- confirm against annotation_columns.txt.
# If a key occurs more than once, the last row wins.
gene_map = {
    synonyms.split('|')[0]: symbol
    for synonyms, symbol in zip(yeastAnnotation.iloc[:, 10], yeastAnnotation.iloc[:, 2])
}
In [8]:
# Parse the ontology file at `oboUrl` with goatools. The node-conversion cell
# uses the result for membership tests and for obo[term_id] lookups, reading
# each term's .name and .namespace.
obo = obo_parser.GODag(oboUrl)
In [9]:
# Multiplier applied to the x / y coordinates of every node taken from the
# original layout (previously an inline literal 8).
POSITION_SCALE = 8


def _convert_node(node):
    """Convert one node of the original network into the output node format.

    Reads node['data']['name'] and node['position'] and returns a dict of the
    form {'data': {...}, 'position': {'x': ..., 'y': ...}}. The original name
    becomes the new 'id'; the remaining 'data' fields depend on the node kind:

    * name starts with 'GO' and is present in `obo`: GO term, type 't', with
      'geneCount' from `go_map` and 'name' / 'namespace' from the ontology.
      Raises KeyError if the term is missing from `go_map`.
    * name equals 'GO:00SUPER': root node, type 'r', named 'Root'.
    * name does not start with 'GO': gene, type 'g'; 'name' is looked up in
      `gene_map`, falling back to the original name.
    * anything else (a 'GO'-prefixed name unknown to `obo`): only 'id' is set.
      NOTE(review): such nodes reach the output without 'name' or 'type' --
      confirm this is intended.
    """
    node_id = node['data']['name']
    new_data = {'id': node_id}

    if node_id.startswith('GO') and node_id in obo:
        # This is GO
        new_data['geneCount'] = go_map[node_id]
        go = obo[node_id]
        new_data['name'] = go.name
        new_data['namespace'] = go.namespace
        new_data['type'] = 't'
    elif node_id == 'GO:00SUPER':
        # Root node
        new_data['name'] = 'Root'
        new_data['type'] = 'r'
    elif not node_id.startswith('GO'):
        # Gene: use the mapped name when available, otherwise keep the id.
        new_data['name'] = gene_map.get(node_id, node_id)
        new_data['type'] = 'g'

    original_pos = node['position']
    return {
        'data': new_data,
        'position': {
            'x': original_pos['x'] * POSITION_SCALE,
            'y': original_pos['y'] * POSITION_SCALE,
        },
    }


# Not used by any cell visible in this notebook; kept in case other code
# refers to it.
full_go_w_genes = {}
new_nodes = [_convert_node(node) for node in original['elements']['nodes']]
new_edges = []

# Print two sample nodes as a sanity check. The indices are arbitrary, so skip
# any that fall outside the list instead of raising IndexError on a smaller
# network.
for sample_index in (9000, 9):
    if sample_index < len(new_nodes):
        print(new_nodes[sample_index])
In [10]:
# The tree table has no header row; name its four tab-separated columns
# explicitly.
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(
    treeSourceUrl,
    sep='\t',
    names=treeColNames,
)
tree.tail(10)
Out[10]:
In [11]:
# Build the output edge list from the tree table, skipping rows whose
# in_tree flag is 'NOT_TREE'. Each edge points from the child (source) to the
# parent (target) and carries the row's relation type.
#
# The list is rebuilt from scratch here rather than appended to, so re-running
# this cell no longer adds a second copy of every edge to the output.
new_edges = [
    {
        'data': {
            'source': child,
            'target': parent,
            'type': relation
        }
    }
    for parent, child, relation, in_tree in zip(
        tree['parent'], tree['child'], tree['type'], tree['in_tree']
    )
    if in_tree != 'NOT_TREE'
]
print(tree['type'].unique())
In [14]:
# Add path info
import networkx as nx

# Directed graph over the GO-like ids found in the tree table.
G = nx.DiGraph()

# Collect every id appearing as parent or child, and a (child, parent) pair
# for each row in which both ends contain "GO:".
node_set = set()
edges = []
for parent, child in zip(tree['parent'], tree['child']):
    node_set.add(parent)
    node_set.add(child)
    if "GO:" in parent and "GO:" in child:
        edges.append((child, parent))

# Only ids containing "GO:" become graph nodes.
G.add_nodes_from(term for term in node_set if "GO:" in term)
len(edges)
Out[14]:
In [15]:
# Insert all collected (child, parent) pairs as directed edges in one call.
G.add_edges_from(edges)
In [16]:
# All-pairs shortest path lengths, materialised as {source: {target: length}}.
# Called without source/target, NetworkX 2.x and later return an iterator of
# (source, lengths) pairs rather than a dict, which cannot be indexed as
# p[a][b] and is consumed by a single `in` test. Wrapping in dict() gives the
# nested-dict shape that later cells rely on, and is a no-op copy on old
# versions that already return a dict.
p = dict(nx.shortest_path_length(G))
In [21]:
# Spot check: the shortest path length recorded in `p` from the term
# 'GO:0048308' to the root id 'GO:00SUPER'. Raises KeyError if the root is
# not reachable from that term.
p['GO:0048308']['GO:00SUPER']
Out[21]:
In [23]:
# Annotate every output node that has an entry in `p` with 'pLen', its
# shortest path length to the root id 'GO:00SUPER'. Nodes without an entry
# in `p` are left untouched.
for node in new_nodes:
    node_data = node['data']
    node_id = node_data['id']
    if node_id not in p:
        continue
    node_data['pLen'] = p[node_id]['GO:00SUPER']
In [24]:
# Display one converted node as a sample (the index is arbitrary); raises
# IndexError if the network has 9000 or fewer nodes.
new_nodes[9000]
Out[24]:
In [25]:
# Assemble the output document: the network-level 'data' section is carried
# over from the original file, while nodes and edges are the converted lists.
output_elements = {
    'nodes': new_nodes,
    'edges': new_edges
}
final_go_tree = {
    'data': original['data'],
    'elements': output_elements
}

# Serialise the result as JSON.
with open('./data/tree-go-genes.json', 'w') as json_out:
    json.dump(final_go_tree, json_out)