In [2]:
from bioservices.kegg import KEGG
import numpy as np
import matplotlib.pyplot as plt
import readline
import random
# FDR
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
# Hypergeom
from scipy.stats import hypergeom
# LogNorm color scheme
import matplotlib.colors as colors
# Combinations
from itertools import combinations
In [3]:
def read_annotations(annotation_file):
    annotation_fh = open(annotation_file, 'r')
    annotations = annotation_fh.readlines()
    annotations = list(map(str.rstrip, annotations))
    annotation_fh.close()
    return annotations

def filter_zscores(ko_index, modzscore_file, annotations, zscore_threshold):
    '''
    Due to RAM restrictions, the input list is generated on the fly.
    '''
    metabolite_hits = []
    modz_fh = open(modzscore_file, 'r')
    modz_lines = modz_fh.readlines()
    modz_fh.close()
    for index in range(0, len(modz_lines)):
        line = modz_lines[index]
        zscores = line.rstrip().split()
        exp_zscore = float(zscores[ko_index])  # The z-score of that knockout
        if abs(exp_zscore) >= zscore_threshold:
            kegg_id = annotations[index]
            if kegg_id.startswith('C'):  # Verifying KEGG ids
                metabolite_hits.append(kegg_id)
    metabolite_hits = set(metabolite_hits)
    return metabolite_hits

def build_metabo_input(ko_index,
                       pos_annotation_file, pos_modzscore_file,
                       neg_annotation_file, neg_modzscore_file,
                       zscore_threshold):
    pos_annotations = read_annotations(pos_annotation_file)
    neg_annotations = read_annotations(neg_annotation_file)
    pos_hits = filter_zscores(ko_index, pos_modzscore_file, pos_annotations, zscore_threshold)
    neg_hits = filter_zscores(ko_index, neg_modzscore_file, neg_annotations, zscore_threshold)
    metabolite_hits = pos_hits | neg_hits
    return metabolite_hits
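As a quick sanity check, here is a minimal sketch of the z-score filter on hand-made input; the temporary files, compound ids and z-scores below are invented purely for illustration.
In [ ]:
import os
import tempfile

# Hypothetical toy data: two annotated ions, one knockout column of modified z-scores
tmpdir = tempfile.mkdtemp()
annot_path = os.path.join(tmpdir, 'annotation_toy.txt')
modz_path = os.path.join(tmpdir, 'modzscore_toy.tsv')
with open(annot_path, 'w') as fh:
    fh.write('C00022\nC00031\n')   # KEGG compound ids, one per ion
with open(modz_path, 'w') as fh:
    fh.write('0.5\n6.0\n')         # one z-score column (knockout index 0)

# Only the ion whose |z-score| >= threshold survives the filter
print(filter_zscores(0, modz_path, read_annotations(annot_path), 2))  # {'C00031'}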
In [4]:
# Get ALL the compounds within a species's pathway db
def get_all_compounds(species):
    # Initiate KEGG instance
    kegg_inst = KEGG()
    kegg_inst.organism = species
    # Get all compounds
    all_compounds = set()
    species_pathways = kegg_inst.pathwayIds
    for pathway in species_pathways:
        parsed_output = kegg_inst.parse(kegg_inst.get(pathway))  # parsed_output holds the full pathway record
        try:
            compounds = set(parsed_output['COMPOUND'].keys())
            all_compounds = all_compounds | compounds
        except KeyError:  # Some pathways do not have defined compounds
            pass
    return all_compounds
In [5]:
def loadTsv(filename):
    fh = open(filename, 'r')
    outset = set()
    for line in fh:
        if line.startswith('C'):
            outset = outset | set([line.rstrip()])
    fh.close()
    return outset
In [6]:
# ORA (only for E.coli)
def ora(in_metabolites, pathway_id, bg_metabolites, pathway_2_compounds, least_num_metabolites=1):
    '''
    Looking compounds up in a pre-built, organism-specific {pathway: compounds} dictionary
    is far faster than creating a KEGG instance and querying it on the fly.
    '''
    # Get compounds
    try:
        compounds = pathway_2_compounds[pathway_id]
    except KeyError:
        return 'No compounds (DB)'
    # Background filtering
    test_pathway_compounds = compounds & bg_metabolites
    if len(test_pathway_compounds) == 0:
        return 'No compounds (BG)'
    # Hypergeometric test
    test_in_metabolites = in_metabolites & test_pathway_compounds
    if len(test_in_metabolites) < least_num_metabolites:
        return 'Low metabolites'
    hyperg_test = hypergeom(len(bg_metabolites), len(test_pathway_compounds), len(in_metabolites & bg_metabolites))
    #print(len(in_metabolites), len(in_metabolites & bg_metabolites))
    #print(in_metabolites - bg_metabolites)
    ora_raw_pval = 1 - hyperg_test.cdf(len(test_in_metabolites)) + hyperg_test.pmf(len(test_in_metabolites))
    #print(hyperg_test.cdf(len(test_in_metabolites)))
    #print(hyperg_test.pmf(len(test_in_metabolites)))
    return ora_raw_pval, pathway_id, len(test_pathway_compounds)
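The right-tail p-value above is computed as 1 - CDF(k) + PMF(k) = P(X >= k). A quick, self-contained check (with arbitrary toy counts) that this matches scipy's survival function evaluated at k - 1:
In [ ]:
from scipy.stats import hypergeom

# Toy counts, chosen arbitrarily for illustration:
# 200 background metabolites, 15 of them in the pathway, 30 metabolites in the input list
rv = hypergeom(200, 15, 30)
k_obs = 4                                      # observed overlap between input and pathway
p_manual = 1 - rv.cdf(k_obs) + rv.pmf(k_obs)   # P(X >= k), as in ora()
p_sf = rv.sf(k_obs - 1)                        # the same right-tail probability via the survival function
print(p_manual, p_sf)                          # the two agree up to floating-point error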
In [7]:
def misidentify(in_metabolites, times, pool_metabolites, return_changes=False):
    '''
    in_metabolites: set of input metabolites
    times: number of misidentifications to introduce
    pool_metabolites: pool of metabolites from which the new identifications are drawn
    '''
    in_metabolites = list(in_metabolites)
    pool_metabolites = list(pool_metabolites)
    # Adjusting times so that times <= length of in_metabolites (and <= the spare pool)
    if times > len(in_metabolites):
        times = len(in_metabolites)
    elif times > (len(pool_metabolites) - len(in_metabolites)):
        times = (len(pool_metabolites) - len(in_metabolites))
    if times == 0:
        # Nothing to swap; keep the return type consistent with the normal path
        if return_changes:
            return set(in_metabolites), [], []
        return set(in_metabolites)
    # Generating new identifications
    new_ident = []
    for i in range(0, times):
        random.shuffle(pool_metabolites)
        new_metabolite = pool_metabolites[0]
        # If the new metabolite is already in the input list, discard it and draw again
        while new_metabolite in in_metabolites:
            random.shuffle(pool_metabolites)
            new_metabolite = pool_metabolites[0]
        new_ident.append(new_metabolite)
    # Swapping existing metabolites for the new metabolites
    old_ident = []
    random.shuffle(in_metabolites)
    for i in range(0, times):
        old_ident.append(in_metabolites.pop())
    in_metabolites += new_ident
    out_metabolites = set(in_metabolites)
    # Optionally return the identity changes as well
    if return_changes:
        return out_metabolites, old_ident, new_ident
    else:
        return out_metabolites
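A small illustration with made-up compound ids of what a single call does; with return_changes=True the swapped-out and swapped-in identities come back alongside the mutated set.
In [ ]:
toy_in = {'C00022', 'C00031', 'C00041'}
toy_pool = ['C00022', 'C00031', 'C00041', 'C00064', 'C00073', 'C00078']
mutated, old, new = misidentify(toy_in, 2, toy_pool, return_changes=True)
print(mutated)     # the input set with two identities swapped out
print(old, new)    # which ids were removed and which were drawn in from the pool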
In [8]:
# Run ORAs for all pathways in a single knockout
def oras_ko(ko_number, testing_pathways, background_metabolites, pathway_2_compounds,
            pos_annotation_file, pos_modzscore_file,
            neg_annotation_file, neg_modzscore_file,
            zscore_threshold,
            multiple_testing_correction, minus_log_trans, max_mutation, mutation_pool):
    ko_metabolites = build_metabo_input(ko_number,
                                        pos_annotation_file, pos_modzscore_file,
                                        neg_annotation_file, neg_modzscore_file,
                                        zscore_threshold)
    # Random mutation (misidentification) of the background
    if max_mutation:
        tmp_metabolites = ko_metabolites  # Keep a reference to the original hits
        (background_metabolites, old_met, new_met) = misidentify(background_metabolites,
                                                                 max_mutation, mutation_pool, True)
        met_translate = dict(zip(old_met, new_met))  # Make a dictionary for translation
        ko_metabolites = set()  # Rebuild ko_metabolites from scratch
        for met in tmp_metabolites:  # Translate each original hit and add it back to ko_metabolites
            ko_metabolites.add(met_translate.get(met, met))
    # Generating raw p values
    pval = []
    pathwayid = []
    pathwaysize = []
    for pathway in testing_pathways:
        ora_res = ora(ko_metabolites, pathway, background_metabolites, pathway_2_compounds)
        if len(ora_res) == 3:  # ora() returned (raw p-value, pathway id, pathway size) rather than an error string
            pval.append(ora_res[0])
            pathwayid.append(ora_res[1])
            pathwaysize.append(ora_res[2])
    # Multiple testing correction
    if multiple_testing_correction:
        pval = list(importr('stats').p_adjust(FloatVector(pval), method='BH'))
    # -log10 transformation
    if minus_log_trans:
        pval = list(map(np.log10, pval))
        pval = list(map(np.negative, pval))
    return pval, pathwayid, pathwaysize
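The correction step delegates to R's p.adjust via rpy2. For reference only, a pure-NumPy Benjamini-Hochberg sketch that should give the same adjusted values (it is not used by oras_ko):
In [ ]:
def bh_adjust(pvals):
    '''Benjamini-Hochberg adjustment, equivalent to R's p.adjust(method='BH').'''
    pvals = np.asarray(pvals, dtype=float)
    m = len(pvals)
    order = np.argsort(pvals)                        # ascending p-values
    ranked = pvals[order] * m / np.arange(1, m + 1)  # p * m / rank
    # Enforce monotonicity from the largest p-value downwards
    adjusted = np.minimum.accumulate(ranked[::-1])[::-1]
    adjusted = np.minimum(adjusted, 1.0)
    out = np.empty(m)
    out[order] = adjusted
    return out

print(bh_adjust([0.01, 0.04, 0.03, 0.20]))  # compare with p.adjust(c(0.01, 0.04, 0.03, 0.20), 'BH') in R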
In [22]:
# Generating null models
def make_null_model(original_model, metabolite_pool, overlap=False):
    '''
    original_model -- dictionary with pathways as keys and sets of metabolites as values
    metabolite_pool -- collection of metabolites the null model can draw from
    overlap -- True: the overlap between pathways is preserved (only metabolite labels are shuffled);
               False: each pathway is an independent random draw, so the overlap is randomised
    '''
    if overlap:
        # If keeping the metabolite overlap, we only shuffle the labels of the metabolites
        # Getting the original metabolites
        original_metabolites = set()
        for value in original_model.values():
            original_metabolites = original_metabolites | value
        original_metabolites = list(original_metabolites)
        random.shuffle(metabolite_pool)
        metabolite_translate = dict(zip(original_metabolites, metabolite_pool[:len(original_metabolites)]))
    new_model = dict()
    for pathway in original_model:
        pathwaysize = len(original_model[pathway])
        if overlap:
            new_model[pathway] = set()
            for metabolite in original_model[pathway]:
                new_model[pathway].add(metabolite_translate[metabolite])
        else:
            # A single shuffle is enough to draw a random set of the same size
            random.shuffle(metabolite_pool)
            new_model[pathway] = set(metabolite_pool[:pathwaysize])
    return new_model
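A toy illustration (invented pathway names and compound labels) of the two modes: with overlap=True the intersection sizes between pathways are preserved, with overlap=False they are not.
In [ ]:
toy_model = {'pA': {'C1', 'C2', 'C3'}, 'pB': {'C2', 'C3', 'C4'}}
toy_pool = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']

relabelled = make_null_model(toy_model, list(toy_pool), overlap=True)
randomised = make_null_model(toy_model, list(toy_pool), overlap=False)
print(len(relabelled['pA'] & relabelled['pB']))  # always 2, matching the original overlap
print(len(randomised['pA'] & randomised['pB']))  # anywhere between 0 and 3, since each pathway is drawn independently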
In [10]:
# Save null models and load null models
def save_null_model(model, filename):
    '''
    model is a dictionary:
    keys = pathway name
    values = metabolites (set)
    '''
    with open(filename, 'w') as fh:
        for pathway in model:
            metabolites = list(model[pathway])
            fh.write(pathway + '\t')
            fh.write(','.join(metabolites))
            fh.write('\n')

def load_null_model(filename):
    model = dict()
    with open(filename, 'r') as fh:
        for line in fh.readlines():
            fields = line.rstrip().split('\t')
            pathway = fields[0]
            metabolites = set(fields[1].split(','))
            model[pathway] = metabolites
    return model
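A quick round-trip check on a temporary file (toy model, hypothetical path) confirming that save and load are inverses of each other.
In [ ]:
import os
import tempfile

toy_model = {'path:eco00010': {'C00022', 'C00031'}, 'path:eco00030': {'C00117'}}
tmp_path = os.path.join(tempfile.mkdtemp(), 'toy_model.tsv')
save_null_model(toy_model, tmp_path)
assert load_null_model(tmp_path) == toy_model  # sets compare equal regardless of write order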
In [24]:
for i in range(0, 100):
    filename = ('/home/zxu/Documents/mscbioinfo/Data Project/scripts/Jupyter/null_models/random/' +
                'model' + str(i) + '.tsv')
    null_model = make_null_model(pathway_2_compounds, list(test_compounds), True)
    #save_null_model(null_model, filename)
    print(null_model['path:eco00010'])
In [60]:
print(len(null_model['path:eco00010'] & null_model['path:eco00030']))
print(len(pathway_2_compounds['path:eco00010'] & pathway_2_compounds['path:eco00030']))
In [80]:
null_jaccard = []
kegg_jaccard = []
for i in combinations(pathway_2_compounds, 2):
    null_jaccard.append(len(null_model[i[0]] & null_model[i[1]]) / len(null_model[i[0]] | null_model[i[1]]))
    kegg_jaccard.append(len(pathway_2_compounds[i[0]] & pathway_2_compounds[i[1]]) / len(pathway_2_compounds[i[0]] | pathway_2_compounds[i[1]]))
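The quantity computed for each pathway pair above is the Jaccard index |A ∩ B| / |A ∪ B|. A tiny helper (not used above, only to make the formula explicit) and a toy check:
In [ ]:
def jaccard(a, b):
    '''Jaccard index of two sets: |A intersection B| / |A union B|.'''
    return len(a & b) / len(a | b)

print(jaccard({'C1', 'C2', 'C3'}, {'C2', 'C3', 'C4'}))  # 2 shared out of 4 total -> 0.5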
In [79]:
np.var(null_jaccard)
Out[79]:
In [81]:
bins = np.arange(0, 0.5, 0.01)
hist, bin_edges = np.histogram(null_jaccard, bins=bins)
plt.clf()
plt.bar(bins[:-1],hist,width=np.diff(bins))
plt.show()
In [82]:
bins = np.arange(0, 0.5, 0.01)
hist, bin_edges = np.histogram(kegg_jaccard, bins=bins)
plt.clf()
plt.bar(bins[:-1],hist,width=np.diff(bins))
plt.show()
In [24]:
sig_count = 0
for ko_number in range(0, len(all_knockouts)):
    nullmod_pval, nullmod_pathway_id, nullmod_sizes = oras_ko(ko_number, ecoli_pathways, zamboni_bg, null_model,
                                                              pos_annot, pos_mod, neg_annot, neg_mod, 2, False, False, 0, [])
    for i in nullmod_pval:
        if i < 0.05:
            sig_count += 1
print(sig_count)
In [58]:
'C00186' in zamboni_bg
Out[58]:
In [57]:
build_metabo_input(2281, pos_annot, pos_mod, neg_annot, neg_mod, 5)
Out[57]:
In [54]:
fh = open('/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/modzscore_neg_annotated.tsv', 'r')
lines = fh.readlines()
ko_scores = []
for line in lines:
    #print(len(line.rstrip().split('\t')))
    score = float(line.rstrip().split('\t')[2281])  # convert so the scatter gets numeric y values
    ko_scores.append(score)
plt.scatter(np.arange(len(ko_scores)), ko_scores)
plt.show()
In [44]:
all_knockouts.index('ybjO')
Out[44]:
In [17]:
build_metabo_input(1541, pos_annot, pos_mod, neg_annot, neg_mod, 10)
Out[17]:
In [8]:
for ko_number in range(0, len(all_knockouts)):
    fh = open('./Backgrounds/KO' + str(ko_number) + '.tsv', 'r')
    nobg_pval = []
    nobg_pathways = []
    nobg_size = []
    zamboni_pval = []
    lines = fh.readlines()
    for line in lines:
        fields = line.rstrip().split('\t')
        nobg_pval.append(float(fields[1]))
        nobg_pathways.append(fields[0])
        nobg_size.append(fields[2])
        zamboni_pval.append(float(fields[3]))
    fh.close()
    if len(nobg_pval) == 0:
        continue
    elif max(nobg_pval) < 1.30 and max(zamboni_pval) < 1.30:  # 1.30 ~ -log10(0.05)
        continue
    print('<OPTION VALUE="{}">{}</option>'.format(ko_number, all_knockouts[ko_number]), end='')
In [11]:
# Stating the annotation files & modzscore files
pos_annot = '/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/annotation_pos.txt'
pos_mod = '/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/modzscore_pos_annotated.tsv'
neg_annot = '/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/annotation_neg.txt'
neg_mod = '/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/modzscore_neg_annotated.tsv'
# Initialise KEGG instance
k = KEGG()
k.organism = "eco"
# Initialise both backgrounds
test_compounds = get_all_compounds('eco')
zamboni_bg = loadTsv('/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/annotation_all.txt')
zamboni_bg = zamboni_bg & test_compounds
# build {pathway: compounds} dictionary for E.coli
ecoli_pathways = k.pathwayIds
pathway_2_compounds = dict()
for pathway in ecoli_pathways:
    parsed_output = k.parse(k.get(pathway))  # parsed_output holds the full pathway record
    try:
        compounds = set(parsed_output['COMPOUND'].keys())
        pathway_2_compounds[pathway] = compounds
    except KeyError:  # Some pathways do not have defined compounds
        #name = parsed_output['NAME']
        #print(pathway, name)
        pass
In [12]:
# Translate KO number to gene name
sample_id_all = '/home/zxu/Documents/mscbioinfo/Data Project/Zamboni/sample_id_modzscore.tsv'
all_knockouts = []  # End product
fh_sample_id_all = open(sample_id_all, 'r')
for knockout in fh_sample_id_all:
    all_knockouts.append(knockout.rstrip())
fh_sample_id_all.close()
#print(all_knockouts)
In [30]:
fh = open('rabinowitz.txt', 'r')
rabinowitz_lines = fh.readlines()
for line in rabinowitz_lines[80:85]:
    compound = line.rstrip()
    if compound.endswith(')'):
        compound = compound.split(' (')[0]
    print(compound)
    print(k.find('compound', compound))
    print('=' * 20)
fh.close()
In [10]:
met_conc = {}
with open('/home/zxu/Documents/mscbioinfo/Bioinfo Project/rabinowitz_conc.csv', 'r') as fh:
    lines = fh.readlines()
    for line in lines:
        fields = line.rstrip().split('\t')
        # Converting the concentration to an actual number (the exponent follows a Unicode minus sign)
        conc = fields[0]
        value = conc.split(' ')[0]
        power = conc.split('− ')[1]
        number = float(value) * 10 ** (-int(power))
        metabolite = fields[1]
        met_conc[metabolite] = number
# Might be useful if we want to cut off based on concentrations
rab_met = sorted(met_conc, key=met_conc.get, reverse=True)
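For clarity, a sketch of the parsing logic above on a single made-up line; the exact formatting of rabinowitz_conc.csv is assumed here from the split strings used in the code.
In [ ]:
# Hypothetical line: concentration first, metabolite name second, tab-separated;
# the exponent is assumed to be preceded by a Unicode minus sign and a space.
toy_line = '9.6 × 10− 2\tglutamate\n'
fields = toy_line.rstrip().split('\t')
conc = fields[0]
value = conc.split(' ')[0]     # '9.6'
power = conc.split('− ')[1]    # '2'
print(float(value) * 10 ** (-int(power)), fields[1])  # 0.096 glutamate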
In [11]:
for ko_number in range(0, len(all_knockouts)):
    rabbg_pval, rabbg_pathway_id, rabbg_sizes = oras_ko(ko_number, ecoli_pathways, zamboni_bg & set(rab_met), pathway_2_compounds,
                                                        pos_annot, pos_mod, neg_annot, neg_mod, 2, True, False, 0, [])
    with open('./rabinowitz/full/KO' + str(ko_number) + '.tsv', 'w') as fh:
        for i in range(0, len(rabbg_pval)):
            fh.write(rabbg_pathway_id[i][5:] + '\t' + str(rabbg_pval[i]) + '\n')
In [13]:
zamboni_pval = []
rabinowitz_pval = []
for ko_number in range(0, len(all_knockouts)):
    ora_results = {}
    rabinowitz_results = './rabinowitz/full/KO' + str(ko_number) + '.tsv'
    zamboni_results = './allresult/KO' + str(ko_number) + '.tsv'
    with open(rabinowitz_results, 'r') as rabinowitz_fh:
        for line in rabinowitz_fh.readlines():
            fields = line.rstrip().split('\t')
            pathname = fields[0]
            pathpval = float(fields[1])
            ora_results[pathname] = pathpval
    with open(zamboni_results, 'r') as zamboni_fh:
        for line in zamboni_fh.readlines():
            fields = line.rstrip().split('\t')
            pathname = fields[0]
            pathpval = float(fields[1])
            try:
                rabinowitz_pval.append(ora_results[pathname])
                zamboni_pval.append(pathpval)
            except KeyError:
                pass
print(len(zamboni_pval), len(rabinowitz_pval))
In [16]:
zamboni_pval = list(map(np.log10, zamboni_pval))
zamboni_pval = list(map(np.negative, zamboni_pval))
rabinowitz_pval = list(map(np.log10, rabinowitz_pval))
rabinowitz_pval = list(map(np.negative, rabinowitz_pval))
In [27]:
xedges = np.arange(0, 2, 0.05)
yedges = np.arange(0, 2, 0.05)
heatmap, xedges, yedges = np.histogram2d(zamboni_pval, rabinowitz_pval, bins=(xedges, yedges))
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.plot([0,10], [0,10], color="Black", label='y=x')
plt.imshow(heatmap.T, extent=extent, origin='lower', norm=colors.LogNorm(), aspect='auto')
plt.set_cmap('rainbow')
plt.colorbar(orientation="horizontal", pad=0.20)
plt.ylabel('Rabinowitz Background')
plt.xlabel('Zamboni Background')
plt.title('-log(P) in two backgrounds')
plt.legend(bbox_to_anchor=(1, 1.22))
plt.savefig('minuslog_zamboni_rabinowitz', transparent=False)
#plt.scatter(nobg, zamboni)
#plt.plot([0,10], [0,10])
#plt.xlim([0, 20])
#plt.ylim([0, 10])
#plt.show()
Verdict:
1) A more specific background tends to make p-values less significant
2) Some KOs survived the multiple testing correction
In [32]:
path_2_pathname = {}
for path in ecoli_pathways:
    pathname = path[5:]
    path_2_pathname[pathname] = k.parse(k.get(pathname))['NAME'][0][:-31]  # drop the ' - Escherichia coli K-12 MG1655' suffix
In [33]:
path_2_pathname
Out[33]:
In [40]:
out_fh = open('datajs2.js', 'w')
for ko_number in range(0, 3717):
    fh = open('./Backgrounds/KO' + str(ko_number) + '.tsv', 'r')
    nobg_pval = []
    nobg_pathways = []
    nobg_size = []
    zamboni_pval = []
    lines = fh.readlines()
    for line in lines:
        fields = line.rstrip().split('\t')
        nobg_pval.append(float(fields[1]))
        nobg_pathways.append(fields[0])
        nobg_size.append(fields[2])
        zamboni_pval.append(float(fields[3]))
    fh.close()
    if len(nobg_pval) == 0:
        continue
    elif max(nobg_pval) < 1.30 and max(zamboni_pval) < 1.30:  # 1.30 ~ -log10(0.05)
        continue
    out_fh.write('else if(selVal == \"' + str(ko_number) + '\")')
    out_fh.write('{options.series = [{data: [')
    # Write one data point per pathway that is significant in at least one background
    for i in range(0, len(nobg_pval)):
        if nobg_pval[i] > 1.30 or zamboni_pval[i] > 1.30:
            name = path_2_pathname[nobg_pathways[i]]
            out_fh.write('{ ' +
                         'x: {}, y: {}, z: {}, name: "{}", country: "{}"'.format(nobg_pval[i], zamboni_pval[i], nobg_size[i],
                                                                                 nobg_pathways[i], name) +
                         ' }')
            if i == len(nobg_pval)-1:
                pass
            else:
                out_fh.write(',')
    #name = path_2_pathname[nobg_pathways[-1]]
    #out_fh.write('{ ' +
    #             'x: {}, y: {}, z: {}, name: "{}", country: "{}"'.format(nobg_pval[-1], zamboni_pval[-1], nobg_size[-1],
    #                                                                     nobg_pathways[-1], name) +
    #             ' }')
    out_fh.write(']}], options.title = {text: \'OverRepresentation Analysis of ' + all_knockouts[ko_number] + '\'}}' + '\n')
out_fh.close()
In [10]:
# Background Analysis
for ko_number in range(2406, 3717):
    nobg_pval, nobg_pathway_id, nobg_sizes = oras_ko(ko_number, ecoli_pathways, test_compounds, pathway_2_compounds,
                                                     pos_annot, pos_mod, neg_annot, neg_mod, 2, True, True, 0, [])
    zamboni_pval, zamboni_pathway_id, zamboni_sizes = oras_ko(ko_number, ecoli_pathways, zamboni_bg, pathway_2_compounds,
                                                              pos_annot, pos_mod, neg_annot, neg_mod, 2, True, True, 0, [])
    result_file = './Backgrounds/KO' + str(ko_number) + '.tsv'
    fh = open(result_file, 'w')
    # NOTE: writing by index assumes both calls return results for the same pathways in the same order
    for i in range(0, len(nobg_pathway_id)):
        fh.write('{}\t{}\t{}\t{}\t{}\n'.format(nobg_pathway_id[i][5:], nobg_pval[i], nobg_sizes[i], zamboni_pval[i], zamboni_sizes[i]))
    fh.close()
In [54]:
print(len(fp), len(fn))
In [45]:
nobg = []
zamboni = []
fp = []
fn = []
for ko_number in range(0, 3717):
    result_file = './Backgrounds/KO' + str(ko_number) + '.tsv'
    fh = open(result_file, 'r')
    lines = fh.readlines()
    for line in lines:
        fields = line.rstrip().split('\t')
        nobg_pval = float(fields[1])
        zamboni_pval = float(fields[3])
        nobg.append(nobg_pval)
        zamboni.append(zamboni_pval)
        if (nobg_pval > zamboni_pval) and (nobg_pval > 1.301) and (zamboni_pval < 1.301):  # false positive (1.301 ~ -log10(0.05))
            fp.append(1)
        elif (nobg_pval < zamboni_pval) and (zamboni_pval > 1.301) and (nobg_pval < 1.301):  # false negative
            fn.append(1)
        else:
            pass
    fh.close()
In [77]:
nobg
Out[77]:
In [146]:
xedges = np.arange(0, 6, 0.05)
yedges = np.arange(0, 3, 0.05)
heatmap, xedges, yedges = np.histogram2d(nobg, zamboni, bins=(xedges, yedges))
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.plot([0,10], [0,10], color="Black", label='y=x')
plt.imshow(heatmap.T, extent=extent, origin='lower', norm=colors.LogNorm(), aspect='auto')
plt.set_cmap('rainbow')
plt.colorbar(orientation="horizontal", pad=0.20)
plt.ylabel('Specified Background')
plt.xlabel('Unspecified Background')
plt.title('-log(P) in two backgrounds')
plt.legend(bbox_to_anchor=(1, 1.22))
plt.savefig('contour', transparent=True)
#plt.scatter(nobg, zamboni)
#plt.plot([0,10], [0,10])
#plt.xlim([0, 20])
#plt.ylim([0, 10])
#plt.show()
In [39]:
for met in build_metabo_input(0, pos_annot, pos_mod, neg_annot, neg_mod, 2):
    print(met)
Verdict:
1) The p-value distribution is essentially another view of Analysis 1 (background metabolites), so the same verdicts apply
2) The pathway size distribution shows a shift once the background is applied
In [ ]:
# pval distribution
for ko_number in range(0, 101):
    ko_gene = all_knockouts[ko_number]
    ko_metabolites = loadTsv('/home/zxu/Documents/mscbioinfo/Data Project/scripts/metaboAnalystqueries/maKO' +
                             str(ko_number) + 'Cutoff2.tsv')
    ecoli_pathways = k.pathwayIds
    nobg_pval = []
    zamboni_pval = []
    for pathway_index in range(0, len(ecoli_pathways)):
        pathway = ecoli_pathways[pathway_index]
        #print(pathway_index)
        nobg_ora_res = ora(ko_metabolites, pathway, test_compounds, pathway_2_compounds)
        if len(nobg_ora_res) == 3:  # ora() returned (p-value, pathway id, pathway size) rather than an error string
            nobg_pval.append(nobg_ora_res[0])
            zamboni_pval.append(ora(ko_metabolites, pathway, zamboni_bg, pathway_2_compounds)[0])
    '''
    nobg_pval = list(map(np.log10, nobg_pval))
    zamboni_pval = list(map(np.log10, zamboni_pval))
    nobg_pval = list(map(np.negative, nobg_pval))
    zamboni_pval = list(map(np.negative, zamboni_pval))
    '''
    # Plotting
    fig, ax = plt.subplots(nrows=2, ncols=1)
    fig.subplots_adjust(bottom=-0.5)
    binsize = 0.05
    ax[0].hist(nobg_pval, bins=np.arange(0, 1+binsize, binsize))
    ax[1].hist(zamboni_pval, bins=np.arange(0, 1+binsize, binsize))
    ax[0].set_title('P-value distribution (all compounds) for knockout ' + ko_gene)
    ax[1].set_title('P-value distribution (Zamboni background) for knockout ' + ko_gene)
    ax[0].get_xaxis().set_visible(False)
    for axe in ax:
        axe.patches[0].set_color('r')
    #ax.set_xlabel('No specified background')
    #ax.set_ylabel('Zamboni background')
    plt.tight_layout()
    fig.savefig('./pvaldist/KO' + str(ko_number) + '.png')
    #plt.show()
    fig.clf()
In [10]:
size_dist = []
for pathway in pathway_2_compounds:
    #if len(pathway_2_compounds[pathway]) == 1:
    #    print(pathway)
    size_dist.append(len(pathway_2_compounds[pathway]))
In [11]:
zamboni_size_dist = []
for pathway in pathway_2_compounds:
    compounds = pathway_2_compounds[pathway]
    cmpd_count = 0
    for compound in compounds:
        if compound in zamboni_bg:
            cmpd_count += 1
    zamboni_size_dist.append(cmpd_count)
In [12]:
print(min(zamboni_size_dist), max(zamboni_size_dist))
plt.hist(zamboni_size_dist, bins=range(0, 145, 5))
plt.ylim(0, 40)
plt.xlabel('Pathway size')
plt.ylabel('Number of pathways')
plt.title('Pathway size distribution (Zamboni background)')
plt.show()
In [13]:
print(min(size_dist))
print(max(size_dist))
plt.hist(size_dist, bins=range(0, 145, 5))
plt.ylim(0, 40)
plt.xlabel('Pathway size')
plt.ylabel('Number of pathways')
plt.title('Pathway size distribution (all compounds)')
plt.show()
In [10]:
random_knockouts = np.random.randint(3717, size=50)
In [12]:
random_knockouts = np.array([2673, 470, 3457, 859, 2461, 2776, 514, 1537, 3114, 2120, 2880,
1312, 484, 3494, 110, 29, 1514, 791, 1925, 1131, 2776, 1274,
1342, 875, 2235, 2938, 1460, 2957, 718, 1214, 3058, 509, 3215,
2066, 2598, 3622, 3627, 436, 2223, 2691, 2442, 3439, 2490, 1223,
90, 1902, 1893, 929, 3349, 746])
In [25]:
print(zero, one, two, more)
print(20)
In [27]:
print(zero, one, two, more)
print(10)
In [26]:
import os
zero = {}
one = {}
two = {}
more = {}
mis_dir = './mis_ident50/3more'  # directory of misidentification results to aggregate
for file in os.listdir(mis_dir):
    path = mis_dir + '/' + file
    fh = open(path, 'r')
    lines = fh.readlines()
    for line in lines:
        fields = line.rstrip().split('\t')
        total_hits = int(fields[0])
        false_pos = int(fields[1])
        false_neg = int(fields[2])
        if total_hits == 0:
            zero['fp'] = zero.get('fp', 0) + false_pos
            zero['fn'] = zero.get('fn', 0) + false_neg
        elif total_hits == 1:
            one['fp'] = one.get('fp', 0) + false_pos
            one['fn'] = one.get('fn', 0) + false_neg
        elif total_hits == 2:
            two['fp'] = two.get('fp', 0) + false_pos
            two['fn'] = two.get('fn', 0) + false_neg
        elif total_hits >= 3:
            more['fp'] = more.get('fp', 0) + false_pos
            more['fn'] = more.get('fn', 0) + false_neg
    fh.close()
In [13]:
# Random metabolite mutation
mutation_rate = 0.5
filename = './mis_ident/new/mrate50.tsv'
fh = open(filename, 'w')
for ko_number in random_knockouts:
    fp = 0
    fn = 0
    ora_results = []
    for i in range(0, 51):
        ora_results.append([])
    (pvals, pathwayids, junk) = oras_ko(ko_number, ecoli_pathways, zamboni_bg, pathway_2_compounds,
                                        pos_annot, pos_mod, neg_annot, neg_mod, 2,
                                        True, False, 0, [])
    for ind in range(0, len(pvals)):
        if pvals[ind] < 0.05:
            ora_results[0].append(pathwayids[ind])
    for rep in range(0, 50):  # Number of mutated replicates per KO ('rep' avoids shadowing the KEGG instance k)
        #print(rep)
        (pvals_mut, pathwayids_mut, junk) = oras_ko(ko_number, ecoli_pathways, zamboni_bg, pathway_2_compounds,
                                                    pos_annot, pos_mod, neg_annot, neg_mod, 2,
                                                    True, False, int(mutation_rate * 413), test_compounds)
        for ind in range(0, len(pvals_mut)):
            if pvals_mut[ind] < 0.05:
                ora_results[rep+1].append(pathwayids_mut[ind])
    # Write ora_results to the file: true hits, accumulated false positives/negatives, KO number
    for i in range(1, len(ora_results)):
        result = ora_results[i]
        fp += len(set(result) - set(ora_results[0]))
        fn += len(set(ora_results[0]) - set(result))
    fh.write('\t'.join([str(len(ora_results[0])), str(fp), str(fn), str(ko_number)]))
    fh.write('\n')
fh.close()
In [28]:
fh = open('./mis_ident/new/mrate50.tsv', 'r')
lines = fh.readlines()
fp_all = []
fn_all = []
for line in lines:
    fields = line.rstrip().split('\t')
    tp = int(fields[0])
    fp = int(fields[1])
    fn = int(fields[2])
    fp_all.append(fp)
    if tp != 0:
        fn_all.append(fn)
fh.close()
print(np.mean(fp_all)/50, np.mean(fn_all)/50, sum(fn_all)/2500)
In [80]:
count = -1  # start at -1 so ora_results[0] matching itself is not counted
for result in ora_results:  # 'result' avoids shadowing the KEGG instance k
    if result == ora_results[0]:
        count += 1
print(count)
In [14]:
eco_enzymes
Out[14]:
In [12]:
eco_enzymes = []
for i in range(0, len(all_knockouts)):
    print(i)
    try:
        if 'Enzymes' in k.parse(k.get(k.find('eco', all_knockouts[i])[:9])).get('BRITE', []):
            eco_enzymes.append(i)
    except AttributeError:
        pass
In [21]:
outfile = './locality/allKO.tsv'
fh = open(outfile, 'w')
for ko_number in eco_enzymes:
    print(ko_number)
    ko_gene = all_knockouts[ko_number]
    gene_id = k.find("eco", ko_gene)[:9]
    try:
        gene_pathways = k.parse(k.get(gene_id))['PATHWAY'].keys()
    except KeyError:
        gene_pathways = []
    except TypeError:
        gene_pathways = []
    if len(gene_pathways) > 0:
        fh.write(str(ko_number) + '\t' + ko_gene + '\t')
        fh.write(' '.join(list(gene_pathways)))
        fh.write('\n')
fh.close()
In [37]:
from operator import itemgetter
# Get KO number
# Load rankings for that KO
# Get significant/not related/average rank
kegg_results = './locality/allKO.tsv'
fh = open(kegg_results, 'r')
result_lines = fh.readlines()
fh.close()
out_fh = open('./locality/outresult_sig.tsv', 'w')
out_fh.write('KO_number\tGene\tSigpathways\tNot_KEGG\tSigKEGG\tRank\tPath_count\n')
for line in result_lines:
    fields = line.rstrip().split('\t')
    ko_number = fields[0]
    ko_name = fields[1]
    ko_kegg_pathways = fields[2].split()
    ko_ora_results = './allresult/KO' + ko_number + '.tsv'
    fh = open(ko_ora_results, 'r')
    ora_lines = fh.readlines()
    fh.close()
    ora_pvals = []
    ora_pathways = []
    ora_sigpathways = []
    for oraline in ora_lines:
        ora_pathway_result = oraline.rstrip().split('\t')
        ora_pvals.append(float(ora_pathway_result[1]))  # convert to float so the ranking is numeric, not lexicographic
        ora_pathways.append(ora_pathway_result[0])
        if float(ora_pathway_result[1]) < 0.05:
            ora_sigpathways.append(ora_pathway_result[0])
    pathway_rank = dict(zip(ora_pathways, list(dup_argsort(ora_pvals))))
    # Significant pathways vs the KEGG-annotated pathways of the gene
    not_related_pathways = len(set(ora_sigpathways) - set(ko_kegg_pathways))
    missing_pathways = len(set(ko_kegg_pathways) - set(ora_sigpathways))
    # Average rank of the KEGG-annotated pathways
    ranksum = 0
    path_count = 0
    for path in ko_kegg_pathways:
        try:
            rank = pathway_rank[path]
            ranksum += rank
            path_count += 1
        except KeyError:
            pass
    if path_count != 0:
        rankavg = ranksum/path_count
    else:
        rankavg = 'N'
    output_str = '\t'.join([ko_number, ko_name,
                            str(len(ora_sigpathways)), str(not_related_pathways), str(len(ora_sigpathways) - not_related_pathways), str(rankavg), str(path_count)])
    if len(ora_sigpathways) > 0:
        out_fh.write(output_str+'\n')
out_fh.close()
In [25]:
a2 = np.array([4, 2, 1, 1, 2])
def dup_argsort(in_val):
    # Rank each element by the number of strictly smaller elements, so tied values share the same rank
    u, v = np.unique(in_val, return_inverse=True)
    out_ind = (np.cumsum(np.concatenate(([0], np.bincount(v)))))[v]
    return out_ind
dup_argsort(a2)
Out[25]:
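A quick check of the ranking behaviour on the array above and on an already-sorted input:
In [ ]:
print(dup_argsort(np.array([4, 2, 1, 1, 2])))           # [4 2 0 0 2]: the two 1s share rank 0, the two 2s share rank 2
print(dup_argsort(np.array([0.01, 0.05, 0.05, 0.2])))   # [0 1 1 3]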
In [51]:
fh = open('./2281.tsv', 'w')
fh.write('Gene\tKegg\tKegg_path\tSig_path(0.1)\tMinPval\n')
for ko_number in range(2281, 2282):
    print(ko_number)
    ko_gene = all_knockouts[ko_number]
    gene_id = k.find("eco", ko_gene)[:9]
    try:
        gene_pathways = k.parse(k.get(gene_id))['PATHWAY'].keys()
    except KeyError:
        gene_pathways = []
    except TypeError:
        gene_pathways = []
    if len(gene_pathways) >= 0:
        #print(ko_gene, gene_id)
        #print(list(gene_pathways))
        significant_pathways = []
        (pvals, pathwayids, junk) = oras_ko(ko_number, ecoli_pathways, zamboni_bg, pathway_2_compounds,
                                            pos_annot, pos_mod, neg_annot, neg_mod, 2,
                                            False, False, 0, [])
        for ind in range(0, len(pvals)):
            if pvals[ind] < 0.1:
                significant_pathways.append(pathwayids[ind])
        fh.write('{}\t{}\t{}\t{}\t{}\n'.format(ko_gene, gene_id, list(gene_pathways), significant_pathways, min(pvals)))
fh.close()
In [43]:
for metabolite in build_metabo_input(0, pos_annot, pos_mod, neg_annot, neg_mod, 2):
    print(metabolite)
In [46]:
(pval, pathwayid, pathwaysize) = oras_ko(0, ecoli_pathways, zamboni_bg, pathway_2_compounds,
                                         pos_annot, pos_mod, neg_annot, neg_mod, 2, False, False, 0, [])
In [77]:
def kegg_2_name(kegg_id, kegg_instance):
    return kegg_instance.parse(kegg_instance.get(kegg_id))['NAME'][0].split(' - ')[0]
In [110]:
kegg_2_name('path:eco00330', k)
Out[110]:
In [112]:
len(k.parse(k.get('path:eco00401'))['COMPOUND'])
Out[112]:
In [109]:
pval = np.array(pval)
pvalind = np.argsort(pval)
for i in pvalind:
    size = pathwaysize[i]
    ptw = pathwayid[i]
    ptwname = kegg_2_name(ptw, k)
    print(ptw, size, ptwname, pval[i])
In [74]:
for ko_number in range(0, 3717):
    result_file = './Backgrounds/KO' + str(ko_number) + '.tsv'
    in_fh = open(result_file, 'r')
    lines = in_fh.readlines()
    out_fh = open('./allresult/KO' + str(ko_number) + '.tsv', 'w')
    for line in lines:
        fields = line.rstrip().split('\t')
        pathway_id = fields[0]
        zamboni_pval = float(fields[3])
        pvalue = 10**(np.negative(zamboni_pval))  # undo the -log10 transform to recover the adjusted p-value
        out_fh.write('{}\t{}\n'.format(pathway_id, pvalue))
    out_fh.close()
    in_fh.close()
In [10]:
number_of_hits = {}
for ko_number in range(0, 3717):
    result_file = './allresult/KO' + str(ko_number) + '.tsv'
    fh = open(result_file, 'r')
    lines = fh.readlines()
    sig_pathway = 0
    for line in lines:
        fields = line.rstrip().split('\t')
        pvalue = float(fields[1])
        if pvalue < 0.05:
            sig_pathway += 1
    fh.close()
    number_of_hits[ko_number] = sig_pathway
In [82]:
oras_ko(333, ecoli_pathways, zamboni_bg, pathway_2_compounds, pos_annot, pos_mod, neg_annot, neg_mod, 2, True, False, 0, [])
Out[82]:
In [11]:
no_hits = []
for ko in number_of_hits:
    if number_of_hits[ko] == 0:
        no_hits.append(ko)
In [21]:
no_hits
Out[21]:
In [89]:
total = 0
for ko in number_of_hits:
    total += number_of_hits[ko]
total
Out[89]: