CLIXO Tree builder

Clean up the tree data created with Cytoscape and link annotations from the mapping file.



In [30]:

    
import json
import pandas as pd


# Load mapping: read the CLIXO mapping JSON and report its number of entries.
with open('./data/clixo-mapping.json') as mapping_file:
    clixo_map = json.load(mapping_file)

print(len(clixo_map))



In [32]:

    
# Read the Cytoscape.js export of the CLIXO tree layout.
with open('./data/clixo-tree-layout.cyjs') as layout_file:
    clixo = json.load(layout_file)

# Node records live under elements -> nodes; show how many there are.
nodes = clixo['elements']['nodes']
len(nodes)









    Out[32]:





4805



In [34]:

    
# Peek at one raw Cytoscape node record to see which fields it carries.
print(nodes[1])









    



{'selected': False, 'position': {'x': 15220.0, 'y': 25564.0}, 'data': {'shared_name': 'CLIXO:8037', 'name': 'CLIXO:8037', 'SUID': 24102, 'selected': False, 'geneCount': 3.0, 'id_original': 'CLIXO:8037', 'id': '24102'}}



In [33]:

    
from goatools import obo_parser

# Annotation file for the CLIXO terms
# (despite the variable name, this is a local file path, not a URL)
oboUrl = './data/go.obo'

# Parse the ontology file, additionally requesting the optional 'def' attribute.
# NOTE(review): presumably 'def' exposes term definitions on each term - confirm in the goatools docs.
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])









    



load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-01-13) 47,943 GO Terms



In [37]:

    
# Build the compact node list for the CLIXO tree.
#
# Every Cytoscape node is reduced to its original CLIXO id, a display name,
# its gene count and its layout position.  When the mapping file links the
# CLIXO term to a GO term, the GO term's name becomes the display name;
# otherwise the CLIXO id itself is kept as the name.
new_nodes = []

id2clixo = {}       # Cytoscape node id -> original CLIXO id
gcounts = []        # gene count of every node, in node order
unresolved_go = []  # (CLIXO id, GO id) pairs whose GO id is not in the ontology

for n in nodes:
    d = n['data']
    original_id = d['id_original']

    id2clixo[d['id']] = original_id
    # The mapping is keyed by the part after the colon (e.g. 'CLIXO:8037' -> '8037').
    clixo_id = original_id.split(':')[1]
    name = original_id
    # geneCount is stored as a float in the export (e.g. 3.0); keep it as int.
    genes = int(d['geneCount'])

    gcounts.append(genes)

    if clixo_id in clixo_map:
        goid = clixo_map[clixo_id]['go']
        if goid in obo:
            name = obo[goid].name
        else:
            # A mapped GO id that the loaded ontology does not contain used to
            # abort the whole cell with a KeyError.  Keep the CLIXO id as the
            # name instead and remember the pair so it can be inspected.
            unresolved_go.append((original_id, goid))

    data = {
        'id': original_id,
        'name': name,
        'geneCount': genes
    }

    new_nodes.append({
            'data': data,
            'position': n['position']
        })

if unresolved_go:
    print('GO ids missing from the ontology: {}'.format(len(unresolved_go)))



In [38]:

    
# Number of input nodes.
# NOTE(review): this re-checks the input list; len(new_nodes) may have been intended - confirm.
len(nodes)









    Out[38]:





4805



In [39]:

    
# Largest gene count found among the CLIXO nodes.
max(gcounts)









    Out[39]:





5886



In [40]:

    
# Cleanup edges: keep only the two endpoints of each edge, translated from
# Cytoscape's node ids to the original CLIXO ids via id2clixo.
edges = clixo['elements']['edges']
new_edges = [
    {
        'data': {
            'source': id2clixo[edge['data']['source']],
            'target': id2clixo[edge['data']['target']],
        }
    }
    for edge in edges
]



In [41]:

    
# Assemble the slimmed-down CLIXO network and write it out as JSON.
clixo_elements = {'nodes': new_nodes, 'edges': new_edges}
clixo_compact = {
    'data': {'name': 'CLIXO Tree'},
    'elements': clixo_elements,
}

with open('./data/clixo-compact.json', 'w') as compact_file:
    json.dump(clixo_compact, compact_file)

The same for GO

Remove unnecessary data fields and create a minimized version of the JSON.



In [1]:

    
# Download the original GO layout (.cyjs) and save it as ./data/go-original.cyjs.
# Needs network access and the wget command-line tool; the URL is plain HTTP.
!wget 'http://chianti.ucsd.edu/~kono/ci/data/deep-cell/go-sparse_original.cyjs' -O ./data/go-original.cyjs









    



--2017-01-19 11:03:26--  http://chianti.ucsd.edu/~kono/ci/data/deep-cell/go-sparse_original.cyjs
Resolving chianti.ucsd.edu... 169.228.38.202
Connecting to chianti.ucsd.edu|169.228.38.202|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10376908 (9.9M) [text/plain]
Saving to: ‘./data/go-original.cyjs’

./data/go-original. 100%[===================>]   9.90M  --.-KB/s    in 0.1s    

2017-01-19 11:03:26 (68.1 MB/s) - ‘./data/go-original.cyjs’ saved [10376908/10376908]



In [6]:

    
import json
import pandas as pd
import math

# Parse the downloaded GO layout file into a Python structure.
with open('./data/go-original.cyjs') as go_layout_file:
    gotree = json.load(go_layout_file)



In [4]:

    
import pandas as pd

# Tab-separated, header-less table of term sizes; name the two columns explicitly.
df = pd.read_csv(
    './data/collapsed_go.no_IGI.propagated.term_sizes',
    sep='\t',
    names=['id', 'genes'],
)
df.head(10)



In [7]:

    
# Scratch check: exp(largest term size / 1000).
# NOTE(review): presumably exploring a size-scaling function - confirm intent.
math.exp(max(df['genes'])/1000)









    Out[7]:





613.3894179106748



In [8]:

    
# Map every GO id to its gene count as a native Python number, so the values
# can later be written with the json module (which rejects numpy integers).
#
# Series.tolist() returns Python scalars regardless of the pandas version.
# Calling .item() on the values yielded by itertuples() raises AttributeError
# on pandas versions where those values are already plain Python ints.
go2genes = dict(zip(df['id'], df['genes'].tolist()))



In [9]:

    
# Node records of the GO tree.  Note: this rebinds `nodes`, which earlier
# held the CLIXO node list.
nodes = gotree['elements']['nodes']
# Show one raw record to see the available fields.
nodes[1]









    Out[9]:





{'data': {'AverageShortestPathLength': 17.18280147,
  'BetweennessCentrality': 0.00015342,
  'ClosenessCentrality': 0.05819773,
  'ClusteringCoefficient': 0.0,
  'Degree': 2,
  'Eccentricity': 26,
  'EdgeCount': 2,
  'Indegree': 1,
  'IsSingleNode': False,
  'NeighborhoodConnectivity': 2.0,
  'NumberOfDirectedEdges': 2,
  'NumberOfUndirectedEdges': 0,
  'Outdegree': 1,
  'PartnerOfMultiEdgedNodePairs': 0,
  'Radiality': 0.47797415,
  'SUID': 13097,
  'SelfLoops': 0,
  'Stress': 26070,
  'TopologicalCoefficient': 0.5,
  'id': '13097',
  'id_original': 'GO:0016973',
  'name': 'poly(A)+ mRNA export from nucleus',
  'namespace': 'biological_process',
  'selected': False,
  'shared_name': 'poly(A)+ mRNA export from nucleus',
  'type': ''},
 'position': {'x': 18278.151549335544, 'y': 4994.833690588063},
 'selected': False}



In [10]:

    
# Cytoscape node id -> GO id, used to rewrite edge endpoints.
id2go = {node['data']['id']: node['data']['id_original'] for node in nodes}

# Slim node records: GO id, term name, gene count (looked up in go2genes),
# namespace and the layout position.
new_nodes = [
    {
        'data': {
            'id': node['data']['id_original'],
            'name': node['data']['name'],
            'geneCount': go2genes[node['data']['id_original']],
            'namespace': node['data']['namespace'],
        },
        'position': node['position'],
    }
    for node in nodes
]



In [11]:

    
# Spot-check one of the slimmed-down node records.
new_nodes[1]









    Out[11]:





{'data': {'geneCount': 23,
  'id': 'GO:0016973',
  'name': 'poly(A)+ mRNA export from nucleus',
  'namespace': 'biological_process'},
 'position': {'x': 18278.151549335544, 'y': 4994.833690588063}}



In [12]:

    
# Scratch calculation: exp(6000/1000), i.e. e**6.
# NOTE(review): not used by any later cell - presumably exploratory; confirm or remove.
math.exp(6000/1000)









    Out[12]:





403.4287934927351



In [13]:

    
# Reduce every GO edge to its endpoints (as GO ids, via id2go) and its branch.
edges = gotree['elements']['edges']
new_edges = [
    {
        'data': {
            'source': id2go[edge['data']['source']],
            'target': id2go[edge['data']['target']],
            'branch': edge['data']['branch'],
        }
    }
    for edge in edges
]



In [14]:

    
# Spot-check one of the slimmed-down edge records.
new_edges[1]









    Out[14]:





{'data': {'branch': 'BP', 'source': 'GO:0043648', 'target': 'GO:0043650'}}



In [15]:

    
# Assemble the slimmed-down GO network and write it out as JSON.
go_elements = {'nodes': new_nodes, 'edges': new_edges}
go_compact = {
    'data': {'name': 'GO Tree'},
    'elements': go_elements,
}

with open('./data/go-sparse-compact.json', 'w') as compact_file:
    json.dump(go_compact, compact_file)

	id	genes
0	GO:0000001	27
1	GO:0000002	42
2	GO:0000003	448
3	GO:0000006	1
4	GO:0000007	1
5	GO:0000009	9
6	GO:0000010	1
7	GO:0000011	18
8	GO:0000014	7
9	GO:0000015	5