In [1]:

    
!python --version









    



Python 3.5.2 :: Anaconda custom (x86_64)

CLIXO Ontology Tree Generator

This notebook generates the tree data file from the original table and its annotations.

This is the final version of the script; it creates a Cytoscape.js file that includes gene counts.

Requirements

DAG file for CLIXO
Term to gene assignment file
GO alignment file

CLIXO TERM COUNT = 4805



In [51]:

    
# Load data sets
import pandas as pd

# Paths of the input files read by this notebook.
treeSourceUrl = './data/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.small_parent_tree'
geneCountFile = './data/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.term_sizes'
alignmentFile = './data/alignments_FDR_0.1_t_0.1'
geneAssignment = './data/preds_yeastnet_no_gi_0.04_0.5.txt.propagate.mapping'

# Read the tab-separated parent/child table. Column names are supplied
# explicitly, so the first line of the file is treated as data.
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, sep='\t', header=None, names=treeColNames)

# Show the last rows as a quick sanity check of the load.
tree.tail()









    Out[51]:






  
    
      
      parent
      child
      type
      in_tree
    
  
  
    
      136996
      10051
      YHR083W
      gene
      NOT_TREE
    
    
      136997
      10704
      YHR083W
      gene
      NOT_TREE
    
    
      136998
      10699
      YHR083W
      gene
      NOT_TREE
    
    
      136999
      10705
      YHR083W
      gene
      NOT_TREE
    
    
      137000
      9816
      YHR083W
      gene
      NOT_TREE



In [55]:

    
# Read the tab-separated gene/term assignment table; the first line is data.
assignment = pd.read_csv(
    geneAssignment,
    delimiter='\t',
    header=None,
    names=['gene', 'clixo'],
)

# Shape tuple of the array of distinct values in the 'clixo' column.
print(assignment['clixo'].unique().shape)
assignment.head()



In [56]:

    
# Read the tab-separated alignment table; the first line is data.
al = pd.read_csv(
    alignmentFile,
    delimiter='\t',
    header=None,
    names=['clixo', 'go', 'similarity', 'fdr', 'genes'],
)
al.head()



In [57]:

    
# Index the alignment table by term ID (as a string). Each entry holds the
# aligned GO ID, the similarity score and the FDR value of that row.
# If a term ID appears in several rows, the last row wins.
# The FDR key was previously misspelled 'dfr'; it is now 'fdr', matching the
# name of the column it is read from.
mapping = {
    str(clixo): {'go': go, 'score': similarity, 'fdr': fdr}
    for clixo, go, similarity, fdr in zip(
        al['clixo'], al['go'], al['similarity'], al['fdr']
    )
}



In [58]:

    
# Read the tab-separated term-size table; the first line is data.
geneCounts = pd.read_csv(geneCountFile, names=['clixo', 'count'], sep='\t')

# Map each term ID (as a string) to its gene count.
# Series.tolist() yields native Python scalars on every pandas version, which
# the json module can serialize. The earlier per-row `.item()` call only
# worked when iteration produced numpy scalars and fails on plain ints.
# If a term ID appears more than once, the last row wins.
term2count = {
    str(term): count
    for term, count in zip(geneCounts['clixo'], geneCounts['count'].tolist())
}



In [59]:

    
# Collect the distinct term IDs (as strings) found on either end of every
# row whose type does not start with 'gene'.
clixo_terms = set()

for parent, child, etype, _in_tree in tree.itertuples(index=False, name=None):
    if etype.startswith('gene'):
        continue
    clixo_terms.update((str(parent), str(child)))

print(len(clixo_terms))

Build Base CyJS Network



In [60]:

    
import json

# Empty network skeleton: network-level attributes under 'data' and the
# node/edge lists under 'elements' (filled in by a later cell).
clixoTree = dict(
    data=dict(name='CLIXO Tree'),
    elements=dict(nodes=[], edges=[]),
)

# Pretty-print the skeleton to confirm its structure.
print(json.dumps(clixoTree, indent=4))









    



{
    "data": {
        "name": "CLIXO Tree"
    },
    "elements": {
        "edges": [],
        "nodes": []
    }
}



In [63]:

    
def get_node(id, count):
    """Return a node element dict for the network.

    Args:
        id: Identifier stored under the 'id' key.
        count: Value stored under the 'geneCount' key.

    Returns:
        A dict of the form {'data': {'id': id, 'geneCount': count}}.
    """
    return {'data': {'id': id, 'geneCount': count}}

def get_edge(source, target):
    """Return an edge element dict for the network.

    The direction is reversed on purpose in the stored element: the `target`
    argument is written under the 'source' key and the `source` argument
    under the 'target' key.

    Args:
        source: Value stored under the 'target' key.
        target: Value stored under the 'source' key.

    Returns:
        A dict of the form {'data': {'source': target, 'target': source}}.
    """
    return {'data': {'source': target, 'target': source}}



In [65]:

    
PREFIX = 'CLIXO:'

# One edge per row of the tree table, keeping only rows whose type does not
# start with 'gene' and whose in_tree flag is not 'NOT_TREE'. Both endpoint
# IDs are prefixed before being handed to get_edge.
edges = [
    get_edge(PREFIX + str(parent), PREFIX + str(child))
    for parent, child, etype, in_tree in tree.itertuples(index=False, name=None)
    if not etype.startswith('gene') and in_tree != 'NOT_TREE'
]

print(len(edges))



In [66]:

    
# One node per collected term, carrying its prefixed ID and gene count.
# Terms are visited in sorted order so the node list (and the file written
# from it) is the same on every run; iterating the set directly gives an
# arbitrary order. The loop variable is named `term` rather than `id` so the
# builtin `id` is not shadowed in the notebook namespace.
# A term missing from term2count raises KeyError, as before.
nodes = [
    get_node(PREFIX + term, term2count[term])
    for term in sorted(clixo_terms)
]

print(len(nodes))



In [67]:

    
# Attach the generated node and edge lists to the network skeleton.
clixoTree['elements'].update(nodes=nodes, edges=edges)

# Serialize the whole network as JSON to the output file.
with open('./data/clixo-tree.cyjs', 'w') as cyjs_file:
    json.dump(clixoTree, cyjs_file)

Layout with networkx



In [29]:

    
import networkx as nx

# Directed graph with one node per node element and one edge per edge
# element, using the 'source' and 'target' values stored in each element.
DG = nx.DiGraph()
DG.add_nodes_from(node['data']['id'] for node in nodes)
DG.add_edges_from(
    (edge['data']['source'], edge['data']['target']) for edge in edges
)



In [30]:

    
import matplotlib.pyplot as plt



In [31]:

    
# Draw the directed graph using networkx's circular layout.
nx.draw_circular(DG)



In [33]:

    
# pos = nx.nx_pydot.pydot_layout(DG)

	gene	clixo
0	YGR220C	10000
1	YPR100W	10000
2	YNR022C	10000
3	YNL252C	10000
4	YCR046C	10000

	clixo	go	similarity	genes
0	8607	GO:0031417	0.919767	3
1	7720	GO:0004354	0.911533	2
2	8636	GO:0036437	0.911062	3
3	9228	GO:0004574	0.909649	5
4	9773	GO:0019773	0.908339	7

	parent	child	type	in_tree
136996	10051	YHR083W	gene	NOT_TREE
136997	10704	YHR083W	gene	NOT_TREE
136998	10699	YHR083W	gene	NOT_TREE
136999	10705	YHR083W	gene	NOT_TREE
137000	9816	YHR083W	gene	NOT_TREE