Build full go tree from Cytoscape.js JSON


In [1]:
import pandas as pd
from goatools import obo_parser

oboUrl = './data/go.obo'
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

In [2]:
import json

with open('data/full-go.cyjs') as data_file:    
    original = json.load(data_file)

In [3]:
print(original['elements']['nodes'][0])

print(original['elements']['edges'][0])


{'selected': False, 'position': {'x': 16546.29659293159, 'y': 29466.993279476694}, 'data': {'selected': False, 'name': 'YNL259C', 'SUID': 425956, 'shared_name': 'YNL259C', 'id': '425956'}}
{'selected': False, 'data': {'isTree': 'TREE', 'interaction': 'gene', 'shared_name': 'YNL259C (gene) GO:0016531', 'shared_interaction': 'gene', 'id': '425981', 'selected': False, 'source': '425956', 'target': '12823', 'name': 'YNL259C (gene) GO:0016531', 'SUID': 425981}}

In [4]:
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation['DB_Object_Synonym'] = yeastAnnotation['DB_Object_Synonym'].fillna('')
yeastAnnotation.head()


['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']
Out[4]:
DB DB_Object_ID DB_Object_Symbol Qualifier GO_ID DB:Reference Evidence With_or_From Aspect DB_Object_Name DB_Object_Synonym DB_Object_Type taxon Date Assigned_by Annotation_Extension Gene_Product_Form_ID
0 SGD S000007287 15S_RRNA NaN GO:0005763 SGD_REF:S000073641|PMID:6262728 IDA NaN C Ribosomal RNA of the small mitochondrial ribos... Q0020|14s rRNA|15S_RRNA_2 gene taxon:559292 20150612 SGD NaN NaN
1 SGD S000007287 15S_RRNA NaN GO:0032543 SGD_REF:S000073641|PMID:6262728 IC GO:0005763 P Ribosomal RNA of the small mitochondrial ribos... Q0020|14s rRNA|15S_RRNA_2 gene taxon:559292 20150612 SGD NaN NaN
2 SGD S000007287 15S_RRNA NaN GO:0003735 SGD_REF:S000073641|PMID:6262728 IC GO:0005763 F Ribosomal RNA of the small mitochondrial ribos... Q0020|14s rRNA|15S_RRNA_2 gene taxon:559292 20150612 SGD NaN NaN
3 SGD S000007288 21S_RRNA NaN GO:0005762 SGD_REF:S000073372|PMID:6759872 IDA NaN C Mitochondrial 21S rRNA Q0158|21S_rRNA_3|21S_rRNA_4 gene taxon:559292 20040202 SGD NaN NaN
4 SGD S000007288 21S_RRNA NaN GO:0032543 SGD_REF:S000073372|PMID:6759872 IMP NaN P Mitochondrial 21S rRNA Q0158|21S_rRNA_3|21S_rRNA_4 gene taxon:559292 20100715 SGD NaN NaN

In [5]:
## Load gene count
df_term_size = pd.read_csv('./data/collapsed_go.no_IGI.propagated.term_sizes', delimiter='\t', names=['term_id', 'geneCount'])
df_term_size.head()


Out[5]:
term_id geneCount
0 GO:0000001 27
1 GO:0000002 42
2 GO:0000003 448
3 GO:0000006 1
4 GO:0000007 1

In [6]:
go_map = {}

for row in df_term_size.itertuples():
    go_map[row[1]] = int(row[2])

In [7]:
gene_map = {}

for row in yeastAnnotation.itertuples():
    gene_map[row[11].split('|')[0]] = row[3]

In [8]:
obo = obo_parser.GODag(oboUrl)


load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-08-10) 49,042 GO Terms

In [9]:
full_go_w_genes = {}
new_nodes = []
new_edges = []

for node in original['elements']['nodes']:
    
    data = node['data']
    new_node = {
        'data': {
            'id': data['name']
        },
        'position': {}
    }
    
    data = node['data']
    
    if (node['data']['name'].startswith('GO'))  and (data['name'] in obo.keys()):
        # This is GO
        new_node['data']['geneCount'] = go_map[data['name']]
        go = obo[data['name']]
        new_node['data']['name'] = go.name
        new_node['data']['namespace'] = go.namespace
        new_node['data']['type'] = 't'
    elif data['name'] == 'GO:00SUPER':
        # Root node
        new_node['data']['name'] = 'Root'
        new_node['data']['type'] = 'r'
        
    elif not node['data']['name'].startswith('GO'):
                
        if data['name'] in gene_map.keys():
            new_node['data']['name'] = gene_map[data['name']]
        else:
            new_node['data']['name'] = data['name']
        
        new_node['data']['type'] = 'g' 

    
    original_pos = node['position']
    
    new_node['position']['x'] = original_pos['x']*8
    new_node['position']['y'] = original_pos['y']*8 

    
    new_nodes.append(new_node)

print(new_nodes[9000])
print(new_nodes[9])


{'position': {'x': 269394.57763320964, 'y': 304962.6855812461}, 'data': {'namespace': 'cellular_component', 'type': 't', 'name': 'transferase complex, transferring phosphorus-containing groups', 'geneCount': 167, 'id': 'GO:0061695'}}
{'position': {'x': 165658.29047559967, 'y': 152957.96378916368}, 'data': {'type': 'g', 'name': 'LSM1', 'id': 'YJL124C'}}

In [10]:
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail(10)


Out[10]:
parent child type in_tree
441927 GO:0090150 YHR083W gene NOT_TREE
441928 GO:0005575 YHR083W gene NOT_TREE
441929 GO:0098796 YHR083W gene NOT_TREE
441930 GO:1902589 YHR083W gene NOT_TREE
441931 GO:0044085 YHR083W gene NOT_TREE
441932 GO:0015031 YHR083W gene NOT_TREE
441933 GO:1902582 YHR083W gene NOT_TREE
441934 GO:1902580 YHR083W gene NOT_TREE
441935 GO:0098799 YHR083W gene NOT_TREE
441936 GO:0098798 YHR083W gene NOT_TREE

In [11]:
for row in tree.itertuples():
    
    if row[4] == 'NOT_TREE':
        continue
    
    t = row[1]
    s = row[2]
    
    edge = {
        'data': {
            'source': s,
            'target': t,
            'type': row[3]
        }
    }
    new_edges.append(edge)
    
print(tree['type'].unique())


['is_a' 'regulates' 'part_of' 'negatively_regulates' 'positively_regulates'
 'gene']

In [14]:
# Add path info

import networkx as nx

G=nx.DiGraph()

node_set = set()
edges = []

for row in tree.itertuples():
    node_set.add(row[1])
    node_set.add(row[2])
    if "GO:" in row[1] and "GO:" in row[2]:
        edges.append((row[2], row[1]))
        
for node in node_set:
     if "GO:" in node:
        G.add_node(node)
    
len(edges)


Out[14]:
14528

In [15]:
for e in edges:
    G.add_edge(e[0], e[1])

In [16]:
p=nx.shortest_path_length(G)

In [21]:
p['GO:0048308']['GO:00SUPER']


Out[21]:
5

In [23]:
for node in new_nodes:
    source = node['data']['id']
    if source in p:
        path_len = p[node['data']['id']]['GO:00SUPER']
        node['data']['pLen'] = path_len

In [24]:
new_nodes[9000]


Out[24]:
{'data': {'geneCount': 167,
  'id': 'GO:0061695',
  'name': 'transferase complex, transferring phosphorus-containing groups',
  'namespace': 'cellular_component',
  'pLen': 5,
  'type': 't'},
 'position': {'x': 269394.57763320964, 'y': 304962.6855812461}}

In [25]:
final_go_tree = {
    'data': original['data'],
    'elements': {
        'nodes': new_nodes,
        'edges': new_edges
    }
}

with open('./data/tree-go-genes.json', 'w') as outfile:
    json.dump(final_go_tree, outfile)