In [1]:
# Record the interpreter version this notebook was run with (shell escape).
!python --version


Python 3.5.2 :: Anaconda custom (x86_64)

CLIXO Ontology Tree Generator

This notebook generates a tree data file from the original table and annotations.

This is the final version of the script, which creates a Cytoscape.js file with gene counts.

Requirements

  • DAG file for CLIXO
  • Term to gene assignment file
  • GO alignment file

CLIXO TERM COUNT = 4805


In [51]:
# Load data sets
import pandas as pd

# Input files, all relative to the notebook's working directory.
treeSourceUrl = './data/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.small_parent_tree'
geneCountFile = './data/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.term_sizes'
alignmentFile = './data/alignments_FDR_0.1_t_0.1'
geneAssignment = './data/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.mapping'

# Read the tab-separated tree table. Column names are supplied explicitly,
# so the first line of the file is treated as data rather than as a header.
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, sep='\t', header=None, names=treeColNames)
tree.tail(5)


Out[51]:
parent child type in_tree
136996 10051 YHR083W gene NOT_TREE
136997 10704 YHR083W gene NOT_TREE
136998 10699 YHR083W gene NOT_TREE
136999 10705 YHR083W gene NOT_TREE
137000 9816 YHR083W gene NOT_TREE

In [55]:
# Gene-to-term assignment table (tab-separated, column names supplied here).
assignment = pd.read_csv(
    geneAssignment,
    sep='\t',
    header=None,
    names=['gene', 'clixo'],
)
# Shape of the array of distinct values in the 'clixo' column.
print(assignment['clixo'].unique().shape)
assignment.head(5)


(4805,)
Out[55]:
gene clixo
0 YGR220C 10000
1 YPR100W 10000
2 YNR022C 10000
3 YNL252C 10000
4 YCR046C 10000

In [56]:
# Alignment table (tab-separated, column names supplied here).
al = pd.read_csv(
    alignmentFile,
    sep='\t',
    header=None,
    names=['clixo', 'go', 'similarity', 'fdr', 'genes'],
)
al.head(5)


Out[56]:
clixo go similarity fdr genes
0 8607 GO:0031417 0.919767 0.0 3
1 7720 GO:0004354 0.911533 0.0 2
2 8636 GO:0036437 0.911062 0.0 3
3 9228 GO:0004574 0.909649 0.0 5
4 9773 GO:0019773 0.908339 0.0 7

In [57]:
# Map each CLIXO term ID (as a string) to its alignment record.
# If a term occurs in several rows of `al`, the last row wins.
#
# The third key was previously spelled 'dfr'; it stores the `fdr` column,
# so it is now named 'fdr' to match.
mapping = {
    str(row.clixo): {
        'go': row.go,
        'score': row.similarity,
        'fdr': row.fdr,
    }
    for row in al.itertuples(index=False)
}

In [58]:
# Term-size table (tab-separated, column names supplied here).
geneCounts = pd.read_csv(geneCountFile, names=['clixo', 'count'], sep='\t')

# Map term ID (as a string) -> count as a native Python scalar, so the value
# can be serialised by `json`. `Series.tolist()` performs the numpy -> Python
# conversion; calling `.item()` on each iterated value fails on pandas
# versions where iteration already yields plain Python ints.
term2count = {
    str(term): count
    for term, count in zip(geneCounts['clixo'].tolist(),
                           geneCounts['count'].tolist())
}

In [59]:
# Collect the distinct IDs (as strings) that appear as parent or child in
# every row whose type does not start with 'gene'.
clixo_terms = set()

for row in tree.itertuples():
    if row.type.startswith('gene'):
        continue
    clixo_terms.update((str(row.parent), str(row.child)))

print(len(clixo_terms))


4805

Build Base CyJS Network


In [60]:
import json

# Empty network skeleton: a name plus node and edge lists that later cells fill in.
clixoTree = dict(
    data=dict(name='CLIXO Tree'),
    elements=dict(nodes=[], edges=[]),
)

print(json.dumps(clixoTree, indent=4))


{
    "data": {
        "name": "CLIXO Tree"
    },
    "elements": {
        "edges": [],
        "nodes": []
    }
}

In [63]:
def get_node(id, count):
    """Build a node element for the network.

    Args:
        id: Value stored under the node's 'id' key.
        count: Value stored under the node's 'geneCount' key.

    Returns:
        A dict of the form {'data': {'id': id, 'geneCount': count}}.
    """
    return {'data': {'id': id, 'geneCount': count}}

def get_edge(source, target):
    """Build an edge element for the network.

    The two arguments are stored under the opposite keys: `target` goes to
    the 'source' key and `source` goes to the 'target' key.

    NOTE(review): the caller in this notebook passes (parent, child), so the
    emitted edge runs child -> parent. Confirm that this direction is intended.

    Args:
        source: Value stored under the edge's 'target' key.
        target: Value stored under the edge's 'source' key.

    Returns:
        A dict of the form {'data': {'source': target, 'target': source}}.
    """
    return {'data': {'source': target, 'target': source}}

In [65]:
PREFIX = 'CLIXO:'

# One edge per row, skipping rows whose type starts with 'gene' and rows
# flagged 'NOT_TREE'. Both endpoints get the PREFIX namespace.
edges = [
    get_edge(PREFIX + str(row.parent), PREFIX + str(row.child))
    for row in tree.itertuples()
    if not (row.type.startswith('gene') or row.in_tree == 'NOT_TREE')
]

print(len(edges))


4804

In [66]:
# One node per term, carrying its count from `term2count`.
# - Terms are visited in sorted order so the node list (and the file written
#   from it) is the same on every run; iterating the set directly gives an
#   order that can change between kernel sessions.
# - The loop variable is `term_id` rather than `id`, so the builtin `id()`
#   is not overwritten in the notebook namespace.
# A term missing from `term2count` raises KeyError.
nodes = [
    get_node(PREFIX + term_id, term2count[term_id])
    for term_id in sorted(clixo_terms)
]

print(len(nodes))


4805

In [67]:
# Attach the generated node and edge lists to the network skeleton.
clixoTree['elements'].update(nodes=nodes, edges=edges)

# Serialise the whole network as JSON.
with open('./data/clixo-tree.cyjs', 'w') as cyjs_file:
    json.dump(clixoTree, cyjs_file)

Layout with networkx


In [29]:
import networkx as nx

# Build a directed graph from the node and edge elements.
DG = nx.DiGraph()
DG.add_nodes_from(node['data']['id'] for node in nodes)
DG.add_edges_from(
    (edge['data']['source'], edge['data']['target']) for edge in edges
)

In [30]:
import matplotlib.pyplot as plt

In [31]:
# Draw the directed graph using networkx's circular layout.
nx.draw_circular(DG)

In [33]:
# pos = nx.nx_pydot.pydot_layout(DG)