CLIXO Tree builder

Clean up the tree data created with Cytoscape and link annotations from the mapping file.


In [30]:
import json
import pandas as pd


# Load the CLIXO mapping: a JSON object whose keys are looked up later
# by the numeric part of each CLIXO id.

with open('./data/clixo-mapping.json', 'r') as mapping_file:
    clixo_map = json.load(mapping_file)

# Number of entries in the mapping.
print(len(clixo_map))


1811

In [32]:
# Read the Cytoscape.js export of the CLIXO tree layout.
with open('./data/clixo-tree-layout.cyjs', 'r') as layout_file:
    clixo = json.load(layout_file)

# Pull out the node list and display how many nodes it holds.
clixo_elements = clixo['elements']
nodes = clixo_elements['nodes']
len(nodes)


Out[32]:
4805

In [34]:
# Peek at one raw node record to see which fields the Cytoscape export carries.
print(nodes[1])


{'selected': False, 'position': {'x': 15220.0, 'y': 25564.0}, 'data': {'shared_name': 'CLIXO:8037', 'name': 'CLIXO:8037', 'SUID': 24102, 'selected': False, 'geneCount': 3.0, 'id_original': 'CLIXO:8037', 'id': '24102'}}

In [33]:
from goatools import obo_parser

# OBO file that provides the annotation (term names) for the CLIXO terms
oboUrl = './data/go.obo'

# Parse the ontology, requesting the optional 'def' attribute for each term.
obo = obo_parser.GODag(
    oboUrl,
    optional_attrs=['def'],
)


load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-01-13) 47,943 GO Terms

In [37]:
# Rebuild the CLIXO node list, keeping only id, display name, gene count
# and layout position for each node.
new_nodes = []

id2clixo = {}   # Cytoscape node id -> original CLIXO id (needed to rewrite edges)
gcounts = []    # gene count of every node, in input order

for node in nodes:
    attrs = node['data']
    original_id = attrs['id_original']

    id2clixo[attrs['id']] = original_id

    # The mapping is keyed by the part of the id after the ':' separator.
    clixo_id = original_id.split(':')[1]

    gene_total = int(attrs['geneCount'])
    gcounts.append(gene_total)

    # Mapped terms are labelled with the GO term name; all other nodes keep
    # their CLIXO id as the label.
    if clixo_id in clixo_map:
        goid = clixo_map[clixo_id]['go']
        label = obo[goid].name
    else:
        label = original_id

    new_nodes.append({
        'data': {
            'id': original_id,
            'name': label,
            'geneCount': gene_total,
        },
        'position': node['position'],
    })

In [38]:
# Size of the input node list (one cleaned node is produced per input node).
len(nodes)


Out[38]:
4805

In [39]:
# Largest gene count among all CLIXO nodes.
max(gcounts)


Out[39]:
5886

In [40]:
# Clean up edges: keep only the two endpoints, translated from Cytoscape
# node ids to the original CLIXO ids.

edges = clixo['elements']['edges']
new_edges = [
    {
        'data': {
            'source': id2clixo[edge['data']['source']],
            'target': id2clixo[edge['data']['target']],
        }
    }
    for edge in edges
]

In [41]:
# Assemble the slimmed-down CLIXO network and write it to disk as JSON.
clixo_compact = {
    'data': {'name': 'CLIXO Tree'},
    'elements': {'nodes': new_nodes, 'edges': new_edges},
}

with open('./data/clixo-compact.json', 'w') as compact_file:
    json.dump(clixo_compact, compact_file)

And the same for GO

Remove unnecessary data fields and create a minimized version of the JSON


In [1]:
# Download the original GO tree export and save it as ./data/go-original.cyjs.
!wget 'http://chianti.ucsd.edu/~kono/ci/data/deep-cell/go-sparse_original.cyjs' -O ./data/go-original.cyjs


--2017-01-19 11:03:26--  http://chianti.ucsd.edu/~kono/ci/data/deep-cell/go-sparse_original.cyjs
Resolving chianti.ucsd.edu... 169.228.38.202
Connecting to chianti.ucsd.edu|169.228.38.202|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10376908 (9.9M) [text/plain]
Saving to: ‘./data/go-original.cyjs’

./data/go-original. 100%[===================>]   9.90M  --.-KB/s    in 0.1s    

2017-01-19 11:03:26 (68.1 MB/s) - ‘./data/go-original.cyjs’ saved [10376908/10376908]


In [6]:
import json
import pandas as pd
import math

# Load the downloaded GO tree (Cytoscape.js JSON) into memory.
with open('./data/go-original.cyjs', 'r') as go_file:
    gotree = json.load(go_file)

In [4]:
import pandas as pd

# Tab-separated table of term sizes; the columns are named 'id' and 'genes'.
df = pd.read_csv(
    './data/collapsed_go.no_IGI.propagated.term_sizes',
    sep='\t',
    names=['id', 'genes'],
)
df.head(10)


Out[4]:
id genes
0 GO:0000001 27
1 GO:0000002 42
2 GO:0000003 448
3 GO:0000006 1
4 GO:0000007 1
5 GO:0000009 9
6 GO:0000010 1
7 GO:0000011 18
8 GO:0000014 7
9 GO:0000015 5

In [7]:
# exp(largest gene count / 1000)
largest_gene_count = max(df['genes'])
math.exp(largest_gene_count / 1000)


Out[7]:
613.3894179106748

In [8]:
# Map each GO term id to its gene count.
#
# Series.tolist() converts the counts to native Python scalars, so they can be
# passed to json.dump later. The previous row[2].item() call relied on
# itertuples() yielding NumPy scalars; on recent pandas versions the values are
# already plain Python ints, which have no .item() method.
# If an id occurs more than once, the last row wins (same as a row-by-row loop).
go2genes = dict(zip(df['id'], df['genes'].tolist()))

In [9]:
# Node records of the GO tree; display one to inspect the available fields.
go_elements = gotree['elements']
nodes = go_elements['nodes']
nodes[1]


Out[9]:
{'data': {'AverageShortestPathLength': 17.18280147,
  'BetweennessCentrality': 0.00015342,
  'ClosenessCentrality': 0.05819773,
  'ClusteringCoefficient': 0.0,
  'Degree': 2,
  'Eccentricity': 26,
  'EdgeCount': 2,
  'Indegree': 1,
  'IsSingleNode': False,
  'NeighborhoodConnectivity': 2.0,
  'NumberOfDirectedEdges': 2,
  'NumberOfUndirectedEdges': 0,
  'Outdegree': 1,
  'PartnerOfMultiEdgedNodePairs': 0,
  'Radiality': 0.47797415,
  'SUID': 13097,
  'SelfLoops': 0,
  'Stress': 26070,
  'TopologicalCoefficient': 0.5,
  'id': '13097',
  'id_original': 'GO:0016973',
  'name': 'poly(A)+ mRNA export from nucleus',
  'namespace': 'biological_process',
  'selected': False,
  'shared_name': 'poly(A)+ mRNA export from nucleus',
  'type': ''},
 'position': {'x': 18278.151549335544, 'y': 4994.833690588063},
 'selected': False}

In [10]:
# Reduce every GO node to id, name, gene count, namespace and position.
id2go = {}      # Cytoscape node id -> GO term id (needed to rewrite edges)
new_nodes = []

for node in nodes:
    attrs = node['data']
    go_id = attrs['id_original']
    id2go[attrs['id']] = go_id

    new_nodes.append({
        'data': {
            'id': go_id,
            'name': attrs['name'],
            'geneCount': go2genes[go_id],   # raises KeyError if the term has no size entry
            'namespace': attrs['namespace'],
        },
        'position': node['position'],
    })

In [11]:
# Inspect one cleaned node to confirm only the kept fields remain.
new_nodes[1]


Out[11]:
{'data': {'geneCount': 23,
  'id': 'GO:0016973',
  'name': 'poly(A)+ mRNA export from nucleus',
  'namespace': 'biological_process'},
 'position': {'x': 18278.151549335544, 'y': 4994.833690588063}}

In [12]:
# exp(6000 / 1000), i.e. e**6.
# NOTE(review): 6000 is presumably a round upper bound for gene counts — confirm.
math.exp(6000/1000)


Out[12]:
403.4287934927351

In [13]:
# Rewrite GO edges: endpoints become GO term ids and only the 'branch'
# attribute is carried over.
edges = gotree['elements']['edges']
new_edges = [
    {
        'data': {
            'source': id2go[edge['data']['source']],
            'target': id2go[edge['data']['target']],
            'branch': edge['data']['branch'],
        }
    }
    for edge in edges
]

In [14]:
# Inspect one cleaned edge (source, target and branch only).
new_edges[1]


Out[14]:
{'data': {'branch': 'BP', 'source': 'GO:0043648', 'target': 'GO:0043650'}}

In [15]:
# Assemble the slimmed-down GO network and write it to disk as JSON.
go_compact = {
    'data': {'name': 'GO Tree'},
    'elements': {'nodes': new_nodes, 'edges': new_edges},
}

with open('./data/go-sparse-compact.json', 'w') as compact_file:
    json.dump(go_compact, compact_file)