In [1]:
from bioservices.kegg import KEGG
import numpy as np
import matplotlib.pyplot as plt
import readline
import random
# FDR
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
# Hypergeom
from scipy.stats import hypergeom
# LogNorm color scheme
import matplotlib.colors as colors
# Combinations
from itertools import combinations

# ORA_msc
import ora_msc

Preparation for the analysis


In [2]:
# Stating the annotation files & modzscore files
DATA_PATH = '/data/zx2313/Zamboni/'

pos_annot = DATA_PATH + 'annotation_pos.txt'
pos_mod = DATA_PATH + 'modzscore_pos_annotated.tsv'
neg_annot = DATA_PATH + 'annotation_neg.txt'
neg_mod = DATA_PATH + 'modzscore_neg_annotated.tsv'
# Initialise KEGG instance
kegg = KEGG()
kegg.organism = "eco"
# Initialise both backgrounds
test_compounds = ora_msc.get_all_compounds('eco')
zamboni_bg = ora_msc.loadTsv(DATA_PATH + 'annotation_all.txt')
zamboni_bg = zamboni_bg & test_compounds
# build {pathway: compounds} dictionary for E.coli
ecoli_pathways = kegg.pathwayIds
pathway_2_compounds = dict()
for pathway in ecoli_pathways:
    parsed_output = kegg.parse(kegg.get(pathway)) # parsed_ouput has lots of information about the pathway
    try:
        compounds = set(parsed_output['COMPOUND'].keys())
        pathway_2_compounds[pathway] = compounds
    except KeyError: # Some pathways do not have defined compounds
        #name = parsed_output['NAME']
        #print(pathway, name)
        pass

In [ ]:
# Translate KO number to gene name
sample_id_all = DATA_PATH + 'sample_id_modzscore.tsv'
all_knockouts = []# End product
fh_sample_id_all = open(sample_id_all, 'r')
for knockout in fh_sample_id_all:
    all_knockouts.append(knockout.rstrip())
fh_sample_id_all.close()
#print(all_knockouts)

Making null models


In [6]:
for i in range(0, 100):
    filename = ('./null_models/nonoverlap/' + 
                'model' + str(i) + '.tsv')
    null_model = ora_msc.make_null_model(pathway_2_compounds, list(test_compounds), False)
    ora_msc.save_null_model(null_model, filename)
    #print(null_model['path:eco00010'])

Getting results from the null models


In [ ]:
sig_count = 0
for ko_number in range(0, len(all_knockouts)):
    nullmod_pval, nullmod_pathway_id, nullmod_sizes = oras_ko(ko_number, ecoli_pathways, zamboni_bg, null_model, 
                        pos_annot, pos_mod, neg_annot, neg_mod, 2, False, False, 0, [])
    for i in nullmod_pval:
        if i < 0.05:
            sig_count += 1
print(sig_count)