Build full go tree from Cytoscape.js JSON



In [1]:

    
import pandas as pd
from goatools import obo_parser

oboUrl = './data/go.obo'
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'



In [2]:

    
import json

with open('data/full-go.cyjs') as data_file:    
    original = json.load(data_file)



In [3]:

    
print(original['elements']['nodes'][0])

print(original['elements']['edges'][0])









    



{'selected': False, 'position': {'x': 16546.29659293159, 'y': 29466.993279476694}, 'data': {'selected': False, 'name': 'YNL259C', 'SUID': 425956, 'shared_name': 'YNL259C', 'id': '425956'}}
{'selected': False, 'data': {'isTree': 'TREE', 'interaction': 'gene', 'shared_name': 'YNL259C (gene) GO:0016531', 'shared_interaction': 'gene', 'id': '425981', 'selected': False, 'source': '425956', 'target': '12823', 'name': 'YNL259C (gene) GO:0016531', 'SUID': 425981}}



In [4]:

    
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation['DB_Object_Synonym'] = yeastAnnotation['DB_Object_Synonym'].fillna('')
yeastAnnotation.head()









    



['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']






    Out[4]:







  
    
      
      DB
      DB_Object_ID
      DB_Object_Symbol
      Qualifier
      GO_ID
      DB:Reference
      Evidence
      With_or_From
      Aspect
      DB_Object_Name
      DB_Object_Synonym
      DB_Object_Type
      taxon
      Date
      Assigned_by
      Annotation_Extension
      Gene_Product_Form_ID
    
  
  
    
      0
      SGD
      S000007287
      15S_RRNA
      NaN
      GO:0005763
      SGD_REF:S000073641|PMID:6262728
      IDA
      NaN
      C
      Ribosomal RNA of the small mitochondrial ribos...
      Q0020|14s rRNA|15S_RRNA_2
      gene
      taxon:559292
      20150612
      SGD
      NaN
      NaN
    
    
      1
      SGD
      S000007287
      15S_RRNA
      NaN
      GO:0032543
      SGD_REF:S000073641|PMID:6262728
      IC
      GO:0005763
      P
      Ribosomal RNA of the small mitochondrial ribos...
      Q0020|14s rRNA|15S_RRNA_2
      gene
      taxon:559292
      20150612
      SGD
      NaN
      NaN
    
    
      2
      SGD
      S000007287
      15S_RRNA
      NaN
      GO:0003735
      SGD_REF:S000073641|PMID:6262728
      IC
      GO:0005763
      F
      Ribosomal RNA of the small mitochondrial ribos...
      Q0020|14s rRNA|15S_RRNA_2
      gene
      taxon:559292
      20150612
      SGD
      NaN
      NaN
    
    
      3
      SGD
      S000007288
      21S_RRNA
      NaN
      GO:0005762
      SGD_REF:S000073372|PMID:6759872
      IDA
      NaN
      C
      Mitochondrial 21S rRNA
      Q0158|21S_rRNA_3|21S_rRNA_4
      gene
      taxon:559292
      20040202
      SGD
      NaN
      NaN
    
    
      4
      SGD
      S000007288
      21S_RRNA
      NaN
      GO:0032543
      SGD_REF:S000073372|PMID:6759872
      IMP
      NaN
      P
      Mitochondrial 21S rRNA
      Q0158|21S_rRNA_3|21S_rRNA_4
      gene
      taxon:559292
      20100715
      SGD
      NaN
      NaN



In [5]:

    
## Load gene count
df_term_size = pd.read_csv('./data/collapsed_go.no_IGI.propagated.term_sizes', delimiter='\t', names=['term_id', 'geneCount'])
df_term_size.head()



In [6]:

    
go_map = {}

for row in df_term_size.itertuples():
    go_map[row[1]] = int(row[2])



In [7]:

    
gene_map = {}

for row in yeastAnnotation.itertuples():
    gene_map[row[11].split('|')[0]] = row[3]



In [8]:

    
obo = obo_parser.GODag(oboUrl)









    



load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-08-10) 49,042 GO Terms



In [9]:

    
full_go_w_genes = {}
new_nodes = []
new_edges = []

for node in original['elements']['nodes']:
    
    data = node['data']
    new_node = {
        'data': {
            'id': data['name']
        },
        'position': {}
    }
    
    data = node['data']
    
    if (node['data']['name'].startswith('GO'))  and (data['name'] in obo.keys()):
        # This is GO
        new_node['data']['geneCount'] = go_map[data['name']]
        go = obo[data['name']]
        new_node['data']['name'] = go.name
        new_node['data']['namespace'] = go.namespace
        new_node['data']['type'] = 't'
    elif data['name'] == 'GO:00SUPER':
        # Root node
        new_node['data']['name'] = 'Root'
        new_node['data']['type'] = 'r'
        
    elif not node['data']['name'].startswith('GO'):
                
        if data['name'] in gene_map.keys():
            new_node['data']['name'] = gene_map[data['name']]
        else:
            new_node['data']['name'] = data['name']
        
        new_node['data']['type'] = 'g' 

    
    original_pos = node['position']
    
    new_node['position']['x'] = original_pos['x']*8
    new_node['position']['y'] = original_pos['y']*8 

    
    new_nodes.append(new_node)

print(new_nodes[9000])
print(new_nodes[9])









    



{'position': {'x': 269394.57763320964, 'y': 304962.6855812461}, 'data': {'namespace': 'cellular_component', 'type': 't', 'name': 'transferase complex, transferring phosphorus-containing groups', 'geneCount': 167, 'id': 'GO:0061695'}}
{'position': {'x': 165658.29047559967, 'y': 152957.96378916368}, 'data': {'type': 'g', 'name': 'LSM1', 'id': 'YJL124C'}}



In [10]:

    
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail(10)









    Out[10]:







  
    
      
      parent
      child
      type
      in_tree
    
  
  
    
      441927
      GO:0090150
      YHR083W
      gene
      NOT_TREE
    
    
      441928
      GO:0005575
      YHR083W
      gene
      NOT_TREE
    
    
      441929
      GO:0098796
      YHR083W
      gene
      NOT_TREE
    
    
      441930
      GO:1902589
      YHR083W
      gene
      NOT_TREE
    
    
      441931
      GO:0044085
      YHR083W
      gene
      NOT_TREE
    
    
      441932
      GO:0015031
      YHR083W
      gene
      NOT_TREE
    
    
      441933
      GO:1902582
      YHR083W
      gene
      NOT_TREE
    
    
      441934
      GO:1902580
      YHR083W
      gene
      NOT_TREE
    
    
      441935
      GO:0098799
      YHR083W
      gene
      NOT_TREE
    
    
      441936
      GO:0098798
      YHR083W
      gene
      NOT_TREE



In [11]:

    
for row in tree.itertuples():
    
    if row[4] == 'NOT_TREE':
        continue
    
    t = row[1]
    s = row[2]
    
    edge = {
        'data': {
            'source': s,
            'target': t,
            'type': row[3]
        }
    }
    new_edges.append(edge)
    
print(tree['type'].unique())









    



['is_a' 'regulates' 'part_of' 'negatively_regulates' 'positively_regulates'
 'gene']



In [14]:

    
# Add path info

import networkx as nx

G=nx.DiGraph()

node_set = set()
edges = []

for row in tree.itertuples():
    node_set.add(row[1])
    node_set.add(row[2])
    if "GO:" in row[1] and "GO:" in row[2]:
        edges.append((row[2], row[1]))
        
for node in node_set:
     if "GO:" in node:
        G.add_node(node)
    
len(edges)









    Out[14]:





14528



In [15]:

    
for e in edges:
    G.add_edge(e[0], e[1])



In [16]:

    
p=nx.shortest_path_length(G)



In [21]:

    
p['GO:0048308']['GO:00SUPER']









    Out[21]:





5



In [23]:

    
for node in new_nodes:
    source = node['data']['id']
    if source in p:
        path_len = p[node['data']['id']]['GO:00SUPER']
        node['data']['pLen'] = path_len



In [24]:

    
new_nodes[9000]









    Out[24]:





{'data': {'geneCount': 167,
  'id': 'GO:0061695',
  'name': 'transferase complex, transferring phosphorus-containing groups',
  'namespace': 'cellular_component',
  'pLen': 5,
  'type': 't'},
 'position': {'x': 269394.57763320964, 'y': 304962.6855812461}}



In [25]:

    
final_go_tree = {
    'data': original['data'],
    'elements': {
        'nodes': new_nodes,
        'edges': new_edges
    }
}

with open('./data/tree-go-genes.json', 'w') as outfile:
    json.dump(final_go_tree, outfile)

	term_id	geneCount
0	GO:0000001	27
1	GO:0000002	42
2	GO:0000003	448
3	GO:0000006	1
4	GO:0000007	1

	DB	DB_Object_ID	DB_Object_Symbol	Qualifier	GO_ID	DB:Reference	Evidence	With_or_From	Aspect	DB_Object_Name	DB_Object_Synonym	DB_Object_Type	taxon	Date	Assigned_by	Annotation_Extension	Gene_Product_Form_ID
0	SGD	S000007287	15S_RRNA	NaN	GO:0005763	SGD_REF:S000073641\|PMID:6262728	IDA	NaN	C	Ribosomal RNA of the small mitochondrial ribos...	Q0020\|14s rRNA\|15S_RRNA_2	gene	taxon:559292	20150612	SGD	NaN	NaN
1	SGD	S000007287	15S_RRNA	NaN	GO:0032543	SGD_REF:S000073641\|PMID:6262728	IC	GO:0005763	P	Ribosomal RNA of the small mitochondrial ribos...	Q0020\|14s rRNA\|15S_RRNA_2	gene	taxon:559292	20150612	SGD	NaN	NaN
2	SGD	S000007287	15S_RRNA	NaN	GO:0003735	SGD_REF:S000073641\|PMID:6262728	IC	GO:0005763	F	Ribosomal RNA of the small mitochondrial ribos...	Q0020\|14s rRNA\|15S_RRNA_2	gene	taxon:559292	20150612	SGD	NaN	NaN
3	SGD	S000007288	21S_RRNA	NaN	GO:0005762	SGD_REF:S000073372\|PMID:6759872	IDA	NaN	C	Mitochondrial 21S rRNA	Q0158\|21S_rRNA_3\|21S_rRNA_4	gene	taxon:559292	20040202	SGD	NaN	NaN
4	SGD	S000007288	21S_RRNA	NaN	GO:0032543	SGD_REF:S000073372\|PMID:6759872	IMP	NaN	P	Mitochondrial 21S rRNA	Q0158\|21S_rRNA_3\|21S_rRNA_4	gene	taxon:559292	20100715	SGD	NaN	NaN

	parent	child	type	in_tree
441927	GO:0090150	YHR083W	gene	NOT_TREE
441928	GO:0005575	YHR083W	gene	NOT_TREE
441929	GO:0098796	YHR083W	gene	NOT_TREE
441930	GO:1902589	YHR083W	gene	NOT_TREE
441931	GO:0044085	YHR083W	gene	NOT_TREE
441932	GO:0015031	YHR083W	gene	NOT_TREE
441933	GO:1902582	YHR083W	gene	NOT_TREE
441934	GO:1902580	YHR083W	gene	NOT_TREE
441935	GO:0098799	YHR083W	gene	NOT_TREE
441936	GO:0098798	YHR083W	gene	NOT_TREE