Build the full GO tree from Cytoscape.js JSON



In [1]:

    
import pandas as pd
from goatools import obo_parser

# --- Local input files -------------------------------------------------------
# Gene Ontology term definitions (OBO file) and the gzipped SGD gene association table.
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'

# --- Remote resources --------------------------------------------------------
# Tab-separated parent/child table describing the collapsed GO hierarchy.
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# External-database-to-GO mapping files hosted on geneontology.org.
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

# Phenotype table hosted on yeastgenome.org.
phenotypeUrl = 'http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'



In [2]:

    
import json

# Load the Cytoscape.js network export; `original` is the parsed JSON document whose
# 'elements' entry holds the 'nodes' and 'edges' lists used by the following cells.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (which is not UTF-8 on every system).
with open('data/full-go.cyjs', encoding='utf-8') as data_file:
    original = json.load(data_file)



In [29]:

    
# Peek at the first node and the first edge to see which fields the export carries.
elements = original['elements']
print(elements['nodes'][0])
print(elements['edges'][0])









    



{'data': {'shared_name': 'YNL259C', 'id': '425956', 'SUID': 425956, 'selected': False, 'name': 'YNL259C'}, 'position': {'y': 29466.993279476694, 'x': 16546.29659293159}, 'selected': False}
{'data': {'source': '425956', 'shared_name': 'YNL259C (gene) GO:0016531', 'selected': False, 'interaction': 'gene', 'target': '12823', 'isTree': 'TREE', 'name': 'YNL259C (gene) GO:0016531', 'id': '425981', 'shared_interaction': 'gene', 'SUID': 425981}, 'selected': False}



In [33]:

    
# Column headers for the annotation table are kept in a side file
# (presumably one name per line -- the association file itself has no header row).
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

# Read the gzipped, tab-separated association file, skipping '!' comment lines.
# Missing synonyms become empty strings so later string operations on the column
# never see NaN.
yeastAnnotation = pd.read_csv(
    yeastAnnotationUrl,
    sep='\t',
    comment='!',
    compression='gzip',
    names=col_names,
).assign(DB_Object_Synonym=lambda frame: frame['DB_Object_Synonym'].fillna(''))
yeastAnnotation.head()









    



['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']






    Out[33]:







  
    
      
      DB
      DB_Object_ID
      DB_Object_Symbol
      Qualifier
      GO_ID
      DB:Reference
      Evidence
      With_or_From
      Aspect
      DB_Object_Name
      DB_Object_Synonym
      DB_Object_Type
      taxon
      Date
      Assigned_by
      Annotation_Extension
      Gene_Product_Form_ID
    
  
  
    
      0
      SGD
      S000007287
      15S_RRNA
      NaN
      GO:0005763
      SGD_REF:S000073641|PMID:6262728
      IDA
      NaN
      C
      Ribosomal RNA of the small mitochondrial ribos...
      Q0020|14s rRNA|15S_RRNA_2
      gene
      taxon:559292
      20150612
      SGD
      NaN
      NaN
    
    
      1
      SGD
      S000007287
      15S_RRNA
      NaN
      GO:0032543
      SGD_REF:S000073641|PMID:6262728
      IC
      GO:0005763
      P
      Ribosomal RNA of the small mitochondrial ribos...
      Q0020|14s rRNA|15S_RRNA_2
      gene
      taxon:559292
      20150612
      SGD
      NaN
      NaN
    
    
      2
      SGD
      S000007287
      15S_RRNA
      NaN
      GO:0003735
      SGD_REF:S000073641|PMID:6262728
      IC
      GO:0005763
      F
      Ribosomal RNA of the small mitochondrial ribos...
      Q0020|14s rRNA|15S_RRNA_2
      gene
      taxon:559292
      20150612
      SGD
      NaN
      NaN
    
    
      3
      SGD
      S000007288
      21S_RRNA
      NaN
      GO:0005762
      SGD_REF:S000073372|PMID:6759872
      IDA
      NaN
      C
      Mitochondrial 21S rRNA
      Q0158|21S_rRNA_3|21S_rRNA_4
      gene
      taxon:559292
      20040202
      SGD
      NaN
      NaN
    
    
      4
      SGD
      S000007288
      21S_RRNA
      NaN
      GO:0032543
      SGD_REF:S000073372|PMID:6759872
      IMP
      NaN
      P
      Mitochondrial 21S rRNA
      Q0158|21S_rRNA_3|21S_rRNA_4
      gene
      taxon:559292
      20100715
      SGD
      NaN
      NaN



In [11]:

    
## Load gene count
# Headerless two-column, tab-separated table: GO term id and its gene count.
df_term_size = pd.read_csv(
    './data/collapsed_go.no_IGI.propagated.term_sizes',
    sep='\t',
    names=['term_id', 'geneCount'],
)
df_term_size.head()



In [13]:

    
# Lookup table: GO term id -> gene count (as a plain Python int).
go_map = {
    term_id: int(gene_count)
    for term_id, gene_count in zip(df_term_size['term_id'], df_term_size['geneCount'])
}



In [40]:

    
# Lookup table: first '|'-separated entry of DB_Object_Synonym -> DB_Object_Symbol.
# Later rows overwrite earlier ones that share the same first synonym; rows whose
# synonym field was empty all land on the '' key.
gene_map = {
    synonyms.split('|')[0]: symbol
    for synonyms, symbol in zip(
        yeastAnnotation['DB_Object_Synonym'],
        yeastAnnotation['DB_Object_Symbol'],
    )
}



In [20]:

    
# Parse the local OBO file with goatools; `obo` is then indexed by GO term id
# (later cells read `.name` and `.namespace` from its entries).
obo = obo_parser.GODag(oboUrl)









    



load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-08-10) 49,042 GO Terms



In [47]:

    
# Factor applied to the x/y layout coordinates taken from the source network.
POSITION_SCALE = 10


def _convert_node(node, obo, go_map, gene_map, scale=POSITION_SCALE):
    """Convert one Cytoscape.js node into the trimmed node record used downstream.

    Parameters
    ----------
    node : dict
        Source node with a ``data`` dict (its ``name`` is used as the new ``id``)
        and a ``position`` dict with ``x`` and ``y``.
    obo : mapping
        GO term id -> term object exposing ``name`` and ``namespace``.
    go_map : dict
        GO term id -> gene count.
    gene_map : dict
        Gene identifier -> display symbol.
    scale : number
        Multiplier applied to both coordinates.

    Returns
    -------
    dict
        ``{'data': {...}, 'position': {'x': ..., 'y': ...}}``.
    """
    node_id = node['data']['name']
    node_data = {'id': node_id}

    if node_id.startswith('GO'):
        if node_id in obo:
            term = obo[node_id]
            node_data['name'] = term.name
            node_data['namespace'] = term.namespace
        else:
            # Identifiers that look like GO terms but are absent from the OBO file
            # previously ended up with no 'name' at all; fall back to the id.
            node_data['name'] = node_id
        # A term without a term-size row used to raise KeyError and abort the
        # whole conversion; such terms now simply carry no 'geneCount'.
        if node_id in go_map:
            node_data['geneCount'] = go_map[node_id]
    else:
        # Gene node: show the mapped symbol when known, otherwise the raw identifier.
        node_data['name'] = gene_map.get(node_id, node_id)

    source_pos = node['position']
    return {
        'data': node_data,
        'position': {
            'x': source_pos['x'] * scale,
            'y': source_pos['y'] * scale,
        },
    }


# Containers initialised here; only `new_nodes` is filled in this cell.
full_go_w_genes = {}
new_edges = []
new_nodes = [
    _convert_node(node, obo, go_map, gene_map)
    for node in original['elements']['nodes']
]

# Spot-check two converted nodes.
print(new_nodes[9000])
print(new_nodes[9])









    



{'data': {'geneCount': 167, 'id': 'GO:0061695', 'namespace': 'cellular_component', 'name': 'transferase complex, transferring phosphorus-containing groups'}, 'position': {'y': 381203.35697655764, 'x': 336743.222041512}}
{'data': {'id': 'YJL124C', 'name': 'LSM1'}, 'position': {'y': 191197.4547364546, 'x': 207072.8630944996}}



In [48]:

    
# The tree table has no header row; each line is parent, child, relation type and
# a TREE / NOT_TREE style flag.
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(
    treeSourceUrl,
    sep='\t',
    names=treeColNames,
)
tree.tail(10)









    Out[48]:







  
    
      
      parent
      child
      type
      in_tree
    
  
  
    
      441927
      GO:0090150
      YHR083W
      gene
      NOT_TREE
    
    
      441928
      GO:0005575
      YHR083W
      gene
      NOT_TREE
    
    
      441929
      GO:0098796
      YHR083W
      gene
      NOT_TREE
    
    
      441930
      GO:1902589
      YHR083W
      gene
      NOT_TREE
    
    
      441931
      GO:0044085
      YHR083W
      gene
      NOT_TREE
    
    
      441932
      GO:0015031
      YHR083W
      gene
      NOT_TREE
    
    
      441933
      GO:1902582
      YHR083W
      gene
      NOT_TREE
    
    
      441934
      GO:1902580
      YHR083W
      gene
      NOT_TREE
    
    
      441935
      GO:0098799
      YHR083W
      gene
      NOT_TREE
    
    
      441936
      GO:0098798
      YHR083W
      gene
      NOT_TREE



In [ ]:



In [148]:

    
import networkx as nx

G = nx.DiGraph()

# Every identifier seen in the tree table (GO terms and genes alike).
node_set = set()
# Term-to-term links only, stored as (child, parent) so edges point toward the parent.
edges = []

for parent, child in zip(tree['parent'], tree['child']):
    node_set.add(parent)
    node_set.add(child)
    if "GO:" in parent and "GO:" in child:
        edges.append((child, parent))



In [149]:

    
# Register only the GO-term identifiers as graph nodes; gene identifiers are skipped.
G.add_nodes_from(name for name in node_set if "GO:" in name)

len(edges)









    Out[149]:





14528



In [150]:

    
# Insert all (child, parent) links into the directed graph in one call.
G.add_edges_from(edges)



In [151]:

    
# Print a short summary of the graph. nx.info() no longer exists in NetworkX 3.x,
# so fall back to the graph's own string form when it is missing.
if hasattr(nx, 'info'):
    print(nx.info(G))
else:
    print(G)

# Confirm that the 'GO:00SUPER' node made it into the graph.
if 'GO:00SUPER' in G:
    print('GO:00SUPER')

# Attribute dict of the 'GO:00SUPER' node. `G.node` was removed in NetworkX 2.4 in
# favour of `G.nodes`; the legacy attribute is still preferred when present so older
# installations behave exactly as before.
root = (G.node if hasattr(G, 'node') else G.nodes)['GO:00SUPER']









    



Name: 
Type: DiGraph
Number of nodes: 6618
Number of edges: 14528
Average in degree:   2.1952
Average out degree:   2.1952
GO:00SUPER



In [153]:

    
# Enumerate every simple path from GO:0098799 to GO:00SUPER and merge them into
# one directed subgraph.
paths = nx.all_simple_paths(G, source='GO:0098799', target='GO:00SUPER')

sg = nx.DiGraph()
ns = set()  # names of all nodes that appear on at least one path

for p in paths:
    # Walk consecutive (current, next) pairs along the path.
    for s, t in zip(p, p[1:]):
        ns.add(s)
        ns.add(t)
        # add_edge creates any endpoint that is not in the graph yet.
        sg.add_edge(s, t)

print(ns)
print(nx.info(sg))

nx.write_graphml(sg, "test.graphml")









    



{'GO:0098798', 'GO:0044455', 'GO:0044425', 'GO:0019867', 'GO:0098588', 'GO:0044422', 'GO:0098805', 'GO:0044464', 'GO:0032991', 'GO:0044444', 'GO:0005740', 'GO:0044446', 'GO:0044424', 'GO:0043231', 'GO:0043234', 'GO:0043226', 'GO:0043227', 'GO:0005737', 'GO:0098799', 'GO:0016020', 'GO:0044429', 'GO:0031966', 'GO:00SUPER', 'GO:0043229', 'GO:0031090', 'GO:0031967', 'GO:0005741', 'GO:0005739', 'GO:0005622', 'GO:0005575', 'GO:0098796', 'GO:0031968'}
Name: 
Type: DiGraph
Number of nodes: 32
Number of edges: 63
Average in degree:   1.9688
Average out degree:   1.9688



In [102]:

    
import igraph as ig

# Directed igraph graph with one vertex per GO-term identifier; each vertex's
# 'name' attribute holds the identifier.
g = ig.Graph(directed=True)
go_vertex_names = [name for name in node_set if "GO:" in name]
g.add_vertices(go_vertex_names)

g.summary()









    Out[102]:





'IGRAPH DN-- 6618 0 -- \n+ attr: name (v)'



In [103]:

    
# Add all (child, parent) pairs collected in `edges`; the endpoints are given as
# vertex names (strings) rather than integer vertex indices.
g.add_edges(edges)



In [104]:

    
# The summary is computed but not shown because it is not the last expression.
g.summary()

# List every edge that has GO:0098798 as either endpoint.
term_of_interest = 'GO:0098798'
for child, parent in edges:
    if term_of_interest in (child, parent):
        print((child, parent))









    



('GO:0098798', 'GO:0005739')
('GO:0098798', 'GO:0044429')
('GO:0098798', 'GO:0043234')
('GO:0017087', 'GO:0098798')
('GO:0098800', 'GO:0098798')
('GO:0098799', 'GO:0098798')
('GO:0030062', 'GO:0098798')



In [117]:

    
# Look up both terms by vertex name and ask igraph about vertex-disjoint paths
# from GO:0098798 to GO:00SUPER.
# NOTE(review): python-igraph documents vertex_disjoint_paths() as returning a count,
# yet the subgraph cell below iterates `paths1` as a list of vertex-index paths.
# The execution counts suggest that cell ran against an earlier definition of
# `paths1` -- confirm which call is intended here.
paths1 = g.vertex_disjoint_paths(g.vs.find('GO:0098798').index, target=g.vs.find('GO:00SUPER').index)



In [118]:

    
# Debug output: the name of the vertex at index 2787, then the raw `paths1` value.
# NOTE(review): 2787 is a hard-coded vertex index. Vertex order comes from iterating
# the `node_set` set, so the same index may refer to a different term in another
# kernel session -- consider looking the vertex up by name instead.
print(g.vs[2787]['name'])
print(paths1)



In [108]:

    
# Merge the vertex-index paths in `paths1` into a separate igraph graph keyed by
# term name. The graph is directed, matching `g` and the networkx subgraph `sg`;
# an undirected graph would drop the child -> parent orientation of every edge.
subg = ig.Graph(directed=True)
n_set = set()  # names of vertices already added to `subg`

for p in paths1:
    # Translate vertex indices of `g` into term names once per path.
    path_names = [g.vs[v]['name'] for v in p]

    for s, t in zip(path_names, path_names[1:]):
        print(s + ' --> ' + t)
        # Endpoints must exist before an edge can reference them by name.
        for name in (s, t):
            if name not in n_set:
                subg.add_vertex(name)
                n_set.add(name)
        subg.add_edge(source=s, target=t)
    print('-----')

# The summary is printed explicitly: a bare call in the middle of a cell shows nothing.
print(subg.summary())
print(n_set)









    



GO:0098798 --> GO:0043234
GO:0043234 --> GO:0032991
GO:0032991 --> GO:0005575
GO:0005575 --> GO:00SUPER
-----
{'GO:0032991', 'GO:0043234', 'GO:0098798', 'GO:00SUPER', 'GO:0005575'}



In [101]:

    
# Write the path subgraph to disk in GML format.
subg.save("sub.gml", format="gml")

	term_id	geneCount
0	GO:0000001	27
1	GO:0000002	42
2	GO:0000003	448
3	GO:0000006	1
4	GO:0000007	1

	DB	DB_Object_ID	DB_Object_Symbol	Qualifier	GO_ID	DB:Reference	Evidence	With_or_From	Aspect	DB_Object_Name	DB_Object_Synonym	DB_Object_Type	taxon	Date	Assigned_by	Annotation_Extension	Gene_Product_Form_ID
0	SGD	S000007287	15S_RRNA	NaN	GO:0005763	SGD_REF:S000073641\|PMID:6262728	IDA	NaN	C	Ribosomal RNA of the small mitochondrial ribos...	Q0020\|14s rRNA\|15S_RRNA_2	gene	taxon:559292	20150612	SGD	NaN	NaN
1	SGD	S000007287	15S_RRNA	NaN	GO:0032543	SGD_REF:S000073641\|PMID:6262728	IC	GO:0005763	P	Ribosomal RNA of the small mitochondrial ribos...	Q0020\|14s rRNA\|15S_RRNA_2	gene	taxon:559292	20150612	SGD	NaN	NaN
2	SGD	S000007287	15S_RRNA	NaN	GO:0003735	SGD_REF:S000073641\|PMID:6262728	IC	GO:0005763	F	Ribosomal RNA of the small mitochondrial ribos...	Q0020\|14s rRNA\|15S_RRNA_2	gene	taxon:559292	20150612	SGD	NaN	NaN
3	SGD	S000007288	21S_RRNA	NaN	GO:0005762	SGD_REF:S000073372\|PMID:6759872	IDA	NaN	C	Mitochondrial 21S rRNA	Q0158\|21S_rRNA_3\|21S_rRNA_4	gene	taxon:559292	20040202	SGD	NaN	NaN
4	SGD	S000007288	21S_RRNA	NaN	GO:0032543	SGD_REF:S000073372\|PMID:6759872	IMP	NaN	P	Mitochondrial 21S rRNA	Q0158\|21S_rRNA_3\|21S_rRNA_4	gene	taxon:559292	20100715	SGD	NaN	NaN

	parent	child	type	in_tree
441927	GO:0090150	YHR083W	gene	NOT_TREE
441928	GO:0005575	YHR083W	gene	NOT_TREE
441929	GO:0098796	YHR083W	gene	NOT_TREE
441930	GO:1902589	YHR083W	gene	NOT_TREE
441931	GO:0044085	YHR083W	gene	NOT_TREE
441932	GO:0015031	YHR083W	gene	NOT_TREE
441933	GO:1902582	YHR083W	gene	NOT_TREE
441934	GO:1902580	YHR083W	gene	NOT_TREE
441935	GO:0098799	YHR083W	gene	NOT_TREE
441936	GO:0098798	YHR083W	gene	NOT_TREE