In [5]:
class MetabolomicsSample:
    """A single metabolomics sample: study metadata plus a metabolite->abundance map.

    Abundances are accumulated with addOmicsData and averaged across
    replicates with avgOmicsData.
    """

    def __init__(self, sample_id, liver_fat, infusion, serum, patient_id):
        '''
        id, liver fat, insulin infusion, serum type, patient, compound:abundance
        '''
        self.sample_id = sample_id
        self.liver_fat = liver_fat    # liver fat percentage
        self.infusion = infusion      # insulin infusion state (e.g. 'Basal'/'Insulin')
        self.serum = serum            # serum type
        self.patient_id = patient_id
        self._data = dict()           # metabolite id -> accumulated abundance

    def __repr__(self):
        return self.sample_id

    def getSampleInfo(self):
        """Print a human-readable summary of the sample metadata."""
        print(self.sample_id)
        print("Liver fat(percentage):", self.liver_fat)
        print("Insulin infusion: " + self.infusion)
        print("Serum type: " + self.serum)
        print("Patient id: " + self.patient_id)

    def addOmicsData(self, metabolite_id, value):
        '''
        Add `value` to the metabolite's existing abundance; if no entry
        exists yet the metabolite starts from 0.

        Values may be int or float (was float-only, which silently rejected
        integer abundances); they are stored as float.  Non-numeric values
        (including bool, rejected as before) are reported and 0 is returned.
        '''
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            print("Data values must be numbers")
            return 0
        self._data[metabolite_id] = self._data.get(metabolite_id, 0) + float(value)

    def getOmicsData(self):
        """Return the metabolite -> abundance dict (the live internal dict)."""
        return self._data

    def clearOmicsData(self):
        """Drop all accumulated omics data."""
        self._data = dict()

    def avgOmicsData(self, replicate):
        '''
        Average the omics data from two replicates.
        Return a new MetabolomicsSample carrying the averaged data.
        ***If one replicate has 0 for a metabolite, the average will be 0.
        Metabolites missing from `replicate` are treated as 0 (previously a
        KeyError).  Returns 0 (unchanged behaviour) when `replicate` is not
        a MetabolomicsSample or its study metadata does not match.
        '''
        if not isinstance(replicate, MetabolomicsSample):
            print("Replicate must be a MetabolomicsSample sample")
            return 0
        if (replicate.liver_fat != self.liver_fat or
                replicate.infusion != self.infusion or
                replicate.serum != self.serum or
                replicate.patient_id != self.patient_id):
            print("Study info do not match")
            return 0
        mean_study = MetabolomicsSample(self.sample_id, self.liver_fat,
                                        self.infusion, self.serum, self.patient_id)
        rep_data = replicate.getOmicsData()
        for metabolite, own_abundance in self.getOmicsData().items():
            rep_abundance = rep_data.get(metabolite, 0)  # missing -> treat as 0
            # A zero on either side marks a missing measurement, so the
            # average is forced to zero rather than halved.
            if own_abundance == 0 or rep_abundance == 0:
                avg_abundance = 0.0
            else:
                avg_abundance = (own_abundance + rep_abundance) / 2
            mean_study.addOmicsData(metabolite, avg_abundance)
        return mean_study
In [1]:
import itertools
In [39]:
# Average Jaccard coefficient
def mean_jaccard_coeff(in_list):
all_jaccard_coeff = []
for combination in itertools.combinations(in_list, 2):
(pathway1, pathway2) = combination
#print(pathway1, pathway2)
intersect_size = len(pathway1 & pathway2)
union_size = len(pathway1 | pathway2)
jaccard_coeff = intersect_size / union_size
all_jaccard_coeff.append(jaccard_coeff)
all_jaccard_coeff = np.mean(all_jaccard_coeff)
return all_jaccard_coeff
In [6]:
from bioservices import KEGG, ChEBI
import ora_msc
In [7]:
# ChEBI web-service client, used by conv_chebi_kegg below.
ch = ChEBI()
In [8]:
def conv_chebi_kegg(chebi_id, chebi_instance):
    """Map a ChEBI identifier to a KEGG compound identifier.

    Fetches the complete ChEBI entry and scans its database cross-links for
    a 'KEGG COMPOUND accession'; the first match is returned.  When the
    entry carries no database links (AttributeError on access), the entry's
    ASCII name is returned instead.
    """
    entry = chebi_instance.getCompleteEntity(chebi_id)
    try:
        for link in entry.DatabaseLinks:
            if link.type == 'KEGG COMPOUND accession':
                return link.data
    except AttributeError:
        # Entry has no DatabaseLinks attribute at all.
        pass
    return entry.chebiAsciiName
def avg_replicates_study(repstudies):
    '''
    Collapse consecutive pairs of replicate studies into averaged studies.

    Only works when there are exactly two replicates per patient.
    Must sort the input list by patient id first, so replicate pairs are
    adjacent!!!

    Raises ValueError on an odd-length input (previously this surfaced as
    an opaque IndexError on the last pair).
    '''
    if len(repstudies) % 2 != 0:
        raise ValueError("repstudies must contain an even number of studies "
                         "(exactly two replicates per patient)")
    avg_studies = []
    for i in range(0, len(repstudies), 2):
        # Each study averages itself with the replicate that follows it.
        avg_studies.append(repstudies[i].avgOmicsData(repstudies[i + 1]))
    return avg_studies
In [9]:
# Location of the MTBLS298 study files (maf table + sample metadata).
directory = '/data/zx2313/MTBLS298/'
#directory = '/home/zxu/Documents/mscbioinfo/data/MTBLS298/'
_MAF_NAME = 'm_catheterization_study_metabolite_profiling_mass_spectrometry_v2_maf.tsv'
_SAMPLE_INFO_NAME = 's_Catheterization study.txt'
maf = directory + _MAF_NAME              # metabolite assignment file
sample_info = directory + _SAMPLE_INFO_NAME  # per-sample study metadata
In [10]:
# Initialise metabolomics samples
# One MetabolomicsSample per row of the sample metadata table.
met_studies = []
with open(sample_info, 'r') as fh:
    for line in fh.readlines()[1:]:  # skip the header row
        fields = line.rstrip().split('\t')
        # Values are quoted in the file; [1:-1] strips the quotes.
        # Column positions assume the MTBLS298 s_ file layout — TODO confirm.
        sample_id = fields[8][1:-1]
        liver_fat = int(fields[9][1:-1])
        infusion = fields[16][1:-1]
        serum = fields[19][1:-1]
        patient_id = fields[22][1:-1]
        met_studies.append(MetabolomicsSample(sample_id, liver_fat, infusion, serum, patient_id))
In [11]:
# Parse the maf file: map each abundance column to its sample, then add each
# CHEBI-annotated metabolite's abundance to the matching sample.
with open(maf, 'r') as fh:
    study_indices = []
    lines = fh.readlines()
    for line in lines[0:1]:  # header row only
        fields = line.rstrip().split('\t')
        #print(fields[0], fields[4], fields[11], fields[14], fields[16], fields[17], fields[21:81])# 0 4 11 14 16 17 21-80
        for field in fields[21:81]:  # the 60 abundance columns
            # [1:-5] drops the leading quote and a 4-char tail plus quote —
            # presumably a column suffix after the sample id; TODO confirm.
            sample_id = field[1:-5]
            for index in range(0, len(met_studies)):
                study = met_studies[index]
                if sample_id == study.sample_id:
                    study_indices.append(index)
    for line in lines[1:]:  # data rows, one metabolite each
        fields = line.rstrip().split('\t')
        database_id = fields[0][1:-1]  # strip surrounding quotes
        if not database_id.startswith('CHEBI'):
            continue  # If the metabolite is not chebi, go to the next line
        else:
            converted_id = conv_chebi_kegg(database_id, ch)
            if not (converted_id.startswith('C') and len(converted_id) == 6):
                continue  # If the converted id is not kegg (C#####), go to the next line
        for index in range(21, 81):  #fields[21:81]:
            # study_indices[i] is the met_studies position of column 21+i.
            study_index = study_indices[index - 21]
            omics_value = float(fields[index][1:-1])
            met_studies[study_index].addOmicsData(converted_id, omics_value)
In [ ]:
# Sort by patient id so replicate pairs are adjacent (avg_replicates_study requires this).
met_studies.sort(key = lambda x: x.patient_id)
In [ ]:
# Partition the studies by insulin-infusion state; other labels are dropped.
basal_studies = [study for study in met_studies if study.infusion == 'Basal']
insulin_studies = [study for study in met_studies if study.infusion == 'Insulin']
In [ ]:
# Collapse each replicate pair into one averaged study per patient.
avg_basal_studies = avg_replicates_study(basal_studies)
avg_insulin_studies = avg_replicates_study(insulin_studies)
In [ ]:
# Drop entries 8-9 — presumably patients without a usable insulin counterpart; TODO confirm.
avg_basal_studies = avg_basal_studies[:8] + avg_basal_studies[10:]
In [ ]:
# Keep a hand-picked subset so the insulin list pairs up with the basal list — TODO confirm selection.
avg_insulin_studies = avg_insulin_studies[:5] + avg_insulin_studies[6:7] + avg_insulin_studies[8:9] + avg_insulin_studies[10:11] + avg_insulin_studies[12:]
In [ ]:
# Sanity check: patient ids of the averaged basal studies, in order.
for study in avg_basal_studies:
    print(study.patient_id)
In [ ]:
# Sanity check: patient ids of the averaged insulin studies — should match the basal list.
for study in avg_insulin_studies:
    print(study.patient_id)
In [ ]:
import scipy.stats
# FDR
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
In [ ]:
# go through one metabolite in all samples
# do the same thing with the other condition
# if the paired t-test is significant, add the metabolite to the DE list
# if any zero is present in one sample, remove the related data in the other condition
# NOTE(review): p_cutoff = 1 accepts every tested metabolite — looks like a
# placeholder/debug value; confirm before treating de_metabolites as significant.
p_cutoff = 1
de_metabolites = []
all_metabolites = list(avg_basal_studies[0].getOmicsData().keys())
for metabolite in all_metabolites:
    b_study_paired_abun = []
    i_study_paired_abun = []
    # Assumes avg_basal_studies[i] and avg_insulin_studies[i] belong to the
    # same patient (both lists sorted/trimmed above) — TODO confirm.
    for index in range(0, len(avg_basal_studies)):
        b_study = avg_basal_studies[index]
        i_study = avg_insulin_studies[index]
        if b_study.getOmicsData()[metabolite] != 0.0 and i_study.getOmicsData()[metabolite] != 0.0:
            # if neither of the abundance is zero, add the data points to the paired list
            b_study_paired_abun.append(b_study.getOmicsData()[metabolite])
            i_study_paired_abun.append(i_study.getOmicsData()[metabolite])
    #print(len(b_study_paired_abun), len(i_study_paired_abun))
    if len(b_study_paired_abun) == 1 or len(i_study_paired_abun) == 1:  # Skip this met if DOF too small
        continue
    p_val = scipy.stats.ttest_rel(b_study_paired_abun, i_study_paired_abun)[1]
    if p_val < p_cutoff:
        #print(metabolite, p_val)
        de_metabolites.append(metabolite)
In [ ]:
de_metabolites  # notebook display of the DE metabolite list
In [12]:
import random
In [13]:
# KEGG web-service client scoped to human (hsa) pathways.
kegg = KEGG()
kegg.organism = 'hsa'
In [14]:
# Map every human KEGG pathway id to its set of compound ids (one web
# request per pathway, so this cell is slow).
hsa_pathways = kegg.pathwayIds
pathway_2_compounds = dict()
for pathway in hsa_pathways:
    parsed_output = kegg.parse(kegg.get(pathway))  # parsed_output has lots of information about the pathway
    try:
        compounds = set(parsed_output['COMPOUND'].keys())
        pathway_2_compounds[pathway] = compounds
    except KeyError:  # Some pathways do not have defined compounds
        #name = parsed_output['NAME']
        #print(pathway, name)
        pass
In [15]:
# Simulating DE metabolites
def simulateDEmet(dataset_paths, sigpaths_num,
                  pathway_2_compounds, background_met,
                  verbose=False):
    """Simulate a set of differentially-expressed (DE) metabolites.

    Picks `sigpaths_num` pathways uniformly at random — independently, so
    the same pathway can be picked more than once, exactly as the original
    shuffle-and-take-first code did — then marks ~60% of each picked
    pathway's detectable metabolites as DE.

    Parameters
    ----------
    dataset_paths : sequence of candidate pathway ids.  NOT mutated (the
        original shuffled the caller's list in place).
    sigpaths_num : number of "significant" pathways to simulate.
    pathway_2_compounds : dict, pathway id -> set of compound ids.
    background_met : set of detectable metabolite ids.
    verbose : print the picked pathways and the final DE set.

    Returns
    -------
    set of DE metabolite ids.
    """
    augmented_paths = []
    for _ in range(sigpaths_num):  # Select significant pathways
        # random.choice is the same uniform draw as shuffling the whole
        # list and taking element 0, without the side effect on the input.
        chosen = random.choice(dataset_paths)
        augmented_paths.append(chosen)
        if verbose:
            print(chosen)
    de_metabolites = set()
    for pathway in augmented_paths:  # Make DE metabolites
        # Background mets which existed in the KEGG pathway DB.
        pathway_met = list(pathway_2_compounds[pathway] & background_met)
        trim = int(len(pathway_met) * 0.6) + 1  # ~60% of the pathway, at least 1
        random.shuffle(pathway_met)
        de_metabolites = de_metabolites.union(pathway_met[:trim])
    if verbose:
        print(de_metabolites)
    return de_metabolites
In [33]:
# Mean pairwise overlap of the detectable pathway compound sets (notebook display).
mean_jaccard_coeff(new_p2c.values())
Out[33]:
In [32]:
# Restrict each dataset pathway to the metabolites detectable in this study.
new_p2c = {
    pathway: pathway_2_compounds[pathway] & background_met
    for pathway in dataset_paths
}
In [30]:
# ORA of the background against itself; element [1] is presumably the list of
# testable pathway ids — confirm against ora_msc.oras_allpaths' return order.
dataset_paths = ora_msc.oras_allpaths(background_met, hsa_pathways, background_met, pathway_2_compounds, True, False, 0, [])[1]
In [31]:
# Simulate DE metabolites from 2 random pathways (verbose output).
de_metabolites = simulateDEmet(dataset_paths, 2, pathway_2_compounds, background_met, True)
In [22]:
import numpy as np
import operator
In [26]:
# Collect, per metabolite, every non-zero abundance seen across all studies.
mets = list(met_studies[0].getOmicsData().keys())
met_abundance = dict()
for study in met_studies:
    omics = study.getOmicsData()
    for metabolite in mets:
        abundance = omics[metabolite]
        if abundance == 0:
            # Zero marks a missing measurement; leave it out of the stats.
            continue
        met_abundance.setdefault(metabolite, []).append(abundance)
In [27]:
# Change all abundance to a mean abundance
# (replace each metabolite's abundance list with its mean, in place).
for metabolite, values in list(met_abundance.items()):
    met_abundance[metabolite] = np.mean(values)
In [28]:
# Sort metabolites based on their average abundance
# (ascending: sorted_met[0] is the least abundant metabolite).
sorted_met = sorted(met_abundance.items(), key=operator.itemgetter(1))
In [29]:
# Background = all metabolites with at least one non-zero abundance.
background_met = set(met_abundance.keys())
In [ ]:
#background_met = set(avg_basal_studies[0].getOmicsData().keys())
# NOTE(review): background_met is already a set at this point, so this is a no-op copy.
background_met = set(background_met)
In [ ]:
len(background_met)  # notebook display: background size
In [ ]:
# ORA of the simulated DE metabolites against the full background.
(path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(set(de_metabolites), hsa_pathways, background_met, pathway_2_compounds, True, False, 0, [])
In [ ]:
# Report pathways significant at p < 0.05.
# Build the id -> p-value map once instead of re-zipping it on every lookup
# (the original rebuilt dict(zip(...)) three times per pathway).
path_pval_map = dict(zip(path_ids, path_pvals))
for path, p_val in path_pval_map.items():
    if p_val < 0.05:
        print(path, p_val)
In [58]:
# ORA with abundance cutoffs
#nums_removed_metabolites = [0, 25, 50, 75]
# Single simulation run: remove the 0..90 least-abundant metabolites in
# steps of 10 and measure how the ORA hit/miss statistics degrade relative
# to the full-background ("correct") result.
PVAL_CUTOFF = 0.05
de_metabolites = simulateDEmet(dataset_paths, 2, pathway_2_compounds, background_met)
# Reference ORA on the full background.
(path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(set(de_metabolites), hsa_pathways, background_met,
                                                           pathway_2_compounds, True, False, 0, [])
correct_pathway_hits = set()
for path in dict(zip(path_ids, path_pvals)):
    if dict(zip(path_ids, path_pvals))[path] < PVAL_CUTOFF:
        correct_pathway_hits.add(path)
all_tested_paths = set(path_ids)  # Store for the calculation of negatives
print('# DE metabolites:', de_metabolites)
print('# P-val cutoff:', PVAL_CUTOFF)
print('# Significant pathways:', str(len(correct_pathway_hits)))
print('removed', 'tp', 'fp', 'tn', 'fn', 'nonsig', 'lowmet', 'tested', 'accu', 'recall', 'precision', sep='\t')
for rem in range(0, 100, 10):
    new_de_mets = set()
    new_background = set()
    removed_mets = []
    new_pathway_hits = set()
    new_pathway_nonsig = set()
    new_pathway_negs = set()
    # Select removed metabolites and remove them from DE and background
    # (sorted_met is ascending, so these are the `rem` least abundant).
    for item in sorted_met[:rem]:
        metabolite = item[0]
        removed_mets.append(metabolite)
    for metabolite in de_metabolites:
        if metabolite not in removed_mets:
            new_de_mets.add(metabolite)
    for metabolite in background_met:
        if metabolite not in removed_mets:
            new_background.add(metabolite)
    # Calculate average jaccard coefficient of the shrunken pathway sets.
    new_p2c = dict()
    for pathway in dataset_paths:
        detectable_metabolite = pathway_2_compounds[pathway] & new_background
        if len(detectable_metabolite) == 0:
            continue
        new_p2c[pathway] = detectable_metabolite
    print(mean_jaccard_coeff(new_p2c.values()))
    # ORA on the reduced DE set and background.
    (path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(new_de_mets, hsa_pathways, new_background,
                                                               pathway_2_compounds, True, False, 0, [])
    #if len(path_ids) == 0:
    #    pass
    # Report results
    for path in dict(zip(path_ids, path_pvals)):
        if dict(zip(path_ids, path_pvals))[path] < PVAL_CUTOFF:
            new_pathway_hits.add(path)
        else:
            new_pathway_nonsig.add(path)
    # Negatives: every originally-tested pathway not hit this round.
    new_pathway_negs = all_tested_paths.difference(new_pathway_hits)
    tp = len(correct_pathway_hits & new_pathway_hits)
    fp = len(new_pathway_hits.difference(correct_pathway_hits))
    fn = len(new_pathway_negs & correct_pathway_hits)
    tn = len(new_pathway_negs.difference(correct_pathway_hits))
    fn_nonsig = len(new_pathway_nonsig & correct_pathway_hits)
    fn_lowmet = fn - fn_nonsig  # misses because the pathway was no longer testable
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    #recall = tp / (tp + fn)
    try:
        recall = tp / (tp + fn)
    except ZeroDivisionError:
        recall = np.NaN
    try:
        precision = tp /(tp + fp)
    except ZeroDivisionError:
        precision = np.NaN
    #print(len(path_ids))
    print(rem, tp, fp, tn, fn, fn_nonsig, fn_lowmet, len(path_ids),
          "%.2f" % accuracy, "%.2f" % recall, "%.2f" % precision, sep='\t')
    #print('=' * 10 + str(rem) + '='* 10)
In [ ]:
# ORA with abundance cutoffs
#nums_removed_metabolites = [0, 25, 50, 75]
# Batch run: N_SIMULATIONS successful simulations; for each, progressively
# remove the least-abundant metabolites and append ORA performance rows to
# abundance.tsv.
PVAL_CUTOFF = 0.05
N_SIMULATIONS = 500
#fh = open('/home/zxu/Documents/github_local/ora-msc/data/' + 'abundance.tsv', 'a')
# `with` replaces the bare open/close so the file is closed even on error.
with open('./data/' + 'abundance.tsv', 'w') as fh:
    fh.write('\t'.join(['# removed', 'tp', 'fp', 'tn', 'fn', 'nonsig', 'lowmet', 'tested', 'de', 'accu', 'recall', 'precision']))
    fh.write('\n')
    fh.write('# P-val cutoff: ')
    fh.write(str(PVAL_CUTOFF) + '\n')
    k = 0
    # A while loop replaces `for k in range(0, 500)` with `k -= 1`: mutating
    # the loop variable of a for-loop has no effect, so failed simulations
    # were silently skipped instead of retried.  Now a simulation with no
    # significant pathway really is redone.
    while k < N_SIMULATIONS:
        de_metabolites = simulateDEmet(dataset_paths, 2, pathway_2_compounds, background_met)
        (path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(set(de_metabolites), hsa_pathways, background_met,
                                                                   pathway_2_compounds, True, False, 0, [])
        pval_by_path = dict(zip(path_ids, path_pvals))  # build once, reuse below
        correct_pathway_hits = {path for path, pval in pval_by_path.items() if pval < PVAL_CUTOFF}
        # If no significant pathways, run this simulation again.
        if len(correct_pathway_hits) == 0:
            continue
        k += 1
        all_tested_paths = set(path_ids)  # Store for the calculation of negatives
        fh.write('# DE metabolites: ')
        fh.write(','.join(list(de_metabolites)))
        fh.write('\n')
        fh.write('# Significant pathways: ')
        fh.write(str(len(correct_pathway_hits)) + '\n')
        for rem in range(0, 100, 10):
            # Remove the `rem` least-abundant metabolites from DE set and background.
            removed_mets = {item[0] for item in sorted_met[:rem]}
            new_de_mets = {m for m in de_metabolites if m not in removed_mets}
            new_background = {m for m in background_met if m not in removed_mets}
            # ORA on the reduced sets.
            (path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(new_de_mets, hsa_pathways, new_background,
                                                                       pathway_2_compounds, True, False, 0, [])
            new_pval_by_path = dict(zip(path_ids, path_pvals))
            new_pathway_hits = {p for p, pv in new_pval_by_path.items() if pv < PVAL_CUTOFF}
            new_pathway_nonsig = set(new_pval_by_path) - new_pathway_hits
            # Negatives: every originally-tested pathway not hit this round.
            new_pathway_negs = all_tested_paths.difference(new_pathway_hits)
            tp = len(correct_pathway_hits & new_pathway_hits)
            fp = len(new_pathway_hits.difference(correct_pathway_hits))
            fn = len(new_pathway_negs & correct_pathway_hits)
            tn = len(new_pathway_negs.difference(correct_pathway_hits))
            fn_nonsig = len(new_pathway_nonsig & correct_pathway_hits)
            fn_lowmet = fn - fn_nonsig  # misses because the pathway dropped out of testing
            accuracy = (tp + tn) / (tp + fp + fn + tn)
            try:
                recall = tp / (tp + fn)
            except ZeroDivisionError:
                # Was unguarded here (unlike the interactive cell above) and
                # crashed the whole batch when tp + fn == 0.
                recall = np.nan  # np.NaN was removed in NumPy 2.0
            try:
                precision = tp / (tp + fp)
            except ZeroDivisionError:
                precision = np.nan
            fh.write('\t'.join(list(map(str, [rem, tp, fp, tn, fn, fn_nonsig, fn_lowmet, len(path_ids), len(new_de_mets),
                                              "%.2f" % accuracy, "%.2f" % recall, "%.2f" % precision]))))
            fh.write('\n')
In [ ]:
# ORA with abundance cutoffs
#nums_removed_metabolites = [0, 25, 50, 75]
# Like the abundance batch above, but metabolites are removed at RANDOM
# (50 reshuffles per simulation) and the mean Jaccard coefficient of the
# shrunken pathway sets is written instead of precision.
PVAL_CUTOFF = 0.05
#fh = open('/home/zxu/Documents/github_local/ora-msc/data/' + 'shrink.tsv', 'a')
fh = open('./data/' + 'shrink_add.tsv', 'w')
fh.write('\t'.join(['# removed', 'tp', 'fp', 'tn', 'fn', 'nonsig', 'lowmet', 'tested', 'de', 'accu', 'recall', 'jaccard']))
fh.write('\n')
fh.write('# P-val cutoff: ')
fh.write(str(PVAL_CUTOFF) + '\n')
for k in range(0, 2):
    print(k)  # progress indicator
    de_metabolites = simulateDEmet(dataset_paths, 2, pathway_2_compounds, background_met)
    (path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(set(de_metabolites), hsa_pathways, background_met,
                                                               pathway_2_compounds, True, False, 0, [])
    correct_pathway_hits = set()
    for path in dict(zip(path_ids, path_pvals)):
        if dict(zip(path_ids, path_pvals))[path] < PVAL_CUTOFF:
            correct_pathway_hits.add(path)
    # If no significant pathways, run again
    # NOTE(review): `k -= 1` has no effect on a `for ... in range(...)` loop,
    # so a failed simulation is skipped, not retried — a while loop is needed
    # to actually redo it (same bug as the acknowledged one in the abundance cell).
    if len(correct_pathway_hits) == 0:
        k -= 1
        continue
    all_tested_paths = set(path_ids)  # Store for the calculation of negatives
    fh.write('# DE metabolites: ')
    fh.write(','.join(list(de_metabolites)))
    fh.write('\n')
    fh.write('# Significant pathways: ')
    fh.write(str(len(correct_pathway_hits)) + '\n')
    for i in range(0, 50):
        # Shuffle background met so they can be randomly removed
        metlist = list(background_met)
        random.shuffle(metlist)
        for rem in range(0, 100, 10):
            new_de_mets = set()
            new_background = set()
            removed_mets = []
            new_pathway_hits = set()
            new_pathway_nonsig = set()
            new_pathway_negs = set()
            # Select removed metabolites and remove them from DE and background
            for metabolite in metlist[:rem]:
                removed_mets.append(metabolite)
            for metabolite in de_metabolites:
                if metabolite not in removed_mets:
                    new_de_mets.add(metabolite)
            for metabolite in background_met:
                if metabolite not in removed_mets:
                    new_background.add(metabolite)
            # Calculate average jaccard coefficient
            new_p2c = dict()
            for pathway in dataset_paths:
                detectable_metabolite = pathway_2_compounds[pathway] & new_background
                if len(detectable_metabolite) == 0:
                    continue
                new_p2c[pathway] = detectable_metabolite
            jcd_coeff = (mean_jaccard_coeff(new_p2c.values()))
            # ORA
            (path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(new_de_mets, hsa_pathways, new_background,
                                                                       pathway_2_compounds, True, False, 0, [])
            # NOTE(review): dead code — this branch does nothing.
            if len(path_ids) == 0:
                pass
            # Report results
            for path in dict(zip(path_ids, path_pvals)):
                if dict(zip(path_ids, path_pvals))[path] < PVAL_CUTOFF:
                    new_pathway_hits.add(path)
                else:
                    new_pathway_nonsig.add(path)
            new_pathway_negs = all_tested_paths.difference(new_pathway_hits)
            tp = len(correct_pathway_hits & new_pathway_hits)
            fp = len(new_pathway_hits.difference(correct_pathway_hits))
            fn = len(new_pathway_negs & correct_pathway_hits)
            tn = len(new_pathway_negs.difference(correct_pathway_hits))
            fn_nonsig = len(new_pathway_nonsig & correct_pathway_hits)
            fn_lowmet = fn - fn_nonsig  # misses because the pathway dropped out of testing
            accuracy = (tp + tn) / (tp + fp + fn + tn)
            # NOTE(review): unguarded — raises ZeroDivisionError when tp + fn == 0
            # (the parallel abundance cell wraps this in try/except).
            recall = tp / (tp + fn)
            try:
                precision = tp /(tp + fp)
            except ZeroDivisionError:
                precision = np.NaN
            #print(len(path_ids))
            #'\t'.join(list(map(str, [rem, tp, fp, tn, fn])))
            fh.write('\t'.join(list(map(str, [rem, tp, fp, tn, fn, fn_nonsig, fn_lowmet, len(path_ids), len(new_de_mets),
                                              "%.2f" % accuracy, "%.2f" % recall, jcd_coeff]))))
            fh.write('\n')
            #print('=' * 10 + str(rem) + '='* 10)
fh.close()
In [ ]:
# All human KEGG compound ids — used below as the universe for null models.
hsa_cmpds = list(ora_msc.get_all_compounds('hsa'))
In [ ]:
# {'C00187', 'C17714', 'C16834', 'C00219', 'C00334'}
# The line above was a stray set literal with UNQUOTED ids (C17714 etc.),
# which raised NameError on execution; it is kept only as a comment —
# presumably a pasted example DE set.
# Repeat ORA against randomized (null) pathway models to gauge false hits.
for k in range(0, 10):
    null_model = ora_msc.make_null_model(pathway_2_compounds, hsa_cmpds, True)
    (path_pvals, path_ids, path_sizes) = ora_msc.oras_allpaths(set(de_metabolites), hsa_pathways, set(background_met),
                                                               null_model, True, False, 0, [])
    pval_by_path = dict(zip(path_ids, path_pvals))  # build once per iteration
    for path, pval in pval_by_path.items():
        if pval < 0.05:
            print(path, pval)
            print(null_model[path] & background_met)
    print(path_sizes)
    print('=' * 20)
In [ ]:
# Hand-check of the raw ORA p-value from the hypergeometric distribution.
from scipy.stats import hypergeom  # was used below without ever being imported (NameError)

PATHWAY_COMPOUND_SIZE = 2     # compounds annotated to the pathway
PATHWAY_DE_METABOLITES = 1    # DE metabolites observed in the pathway
BACKGROUND_SIZE = 108         # total detectable metabolites
DE_MET_SIZE = 5               # total DE metabolites drawn
for DE_MET_SIZE in range(5, 6):
    # P(X >= observed) computed as 1 - cdf(observed) + pmf(observed).
    hyperg_test = hypergeom(BACKGROUND_SIZE, PATHWAY_COMPOUND_SIZE, DE_MET_SIZE)
    ora_raw_pval = 1 - hyperg_test.cdf(PATHWAY_DE_METABOLITES) + hyperg_test.pmf(PATHWAY_DE_METABOLITES)
    print(ora_raw_pval)
In [67]:
def plot_dict(in_dict):
    """Line-plot a dict's values against its keys, keys sorted descending.

    The x and y value lists are also printed for inspection.
    """
    ordered_keys = sorted(in_dict, reverse=True)
    xvals = list(ordered_keys)
    yvals = [in_dict[key] for key in ordered_keys]
    print(xvals)
    print(yvals)
    plt.plot(xvals, yvals)
    plt.show()
In [68]:
#abun_results = '/home/zxu/Documents/github_local/ora-msc/data/' + 'abundance.tsv'
#shrink_results = '/home/zxu/Documents/github_local/ora-msc/data/' + 'shrink.tsv'
# Results file analysed below (written by an earlier shrink run).
shrink_results = './data/' + 'shrink_test.tsv'
In [ ]:
# Relate the DE-metabolite count of each simulation to the number of
# significant pathways it produced.
de_simcount = 0
de_mets_vs_sigpaths = dict()  # len(DE set) -> list of sig-pathway counts
with open(shrink_results, 'r') as fh:
    lines = fh.readlines()
    for line in lines[2:]:  # skip the two header lines
        if line.startswith('# DE'):
            de_mets = line.rstrip().split(': ')[1]
            de_mets = de_mets.split(',')
        elif line.startswith('# Sig'):
            # Relies on a '# DE' line always preceding each '# Sig' line;
            # otherwise de_mets is unbound (NameError) — file-format assumption.
            sigpaths = int(line.rstrip().split(': ')[1])
            de_simcount += 1
            try:
                de_mets_vs_sigpaths[len(de_mets)].append(sigpaths)
            except KeyError:
                de_mets_vs_sigpaths[len(de_mets)] = [sigpaths]
# Collapse each list to its mean.
for key in de_mets_vs_sigpaths:
    de_mets_vs_sigpaths[key] = np.mean(de_mets_vs_sigpaths[key])
In [ ]:
de_simcount  # notebook display: number of simulations parsed
In [ ]:
import matplotlib.pyplot as plt
# Mean number of significant pathways vs number of DE metabolites.
plot_dict(de_mets_vs_sigpaths)
In [69]:
# Average accuracy and recall per number of removed metabolites.
accu_vs_remove = dict()
recall_vs_remove = dict()
with open(shrink_results, 'r') as fh:
    lines = fh.readlines()
    for line in lines:
        if not line.startswith('#'):  # skip header/comment lines
            fields = line.rstrip().split('\t')
            # NOTE(review): `removed` stays a string, so downstream sorting
            # in plot_dict is lexicographic ('10' < '90' < ... ) — confirm intended.
            removed = fields[0]
            accu = float(fields[9])
            recall = float(fields[10])
            try:
                accu_vs_remove[removed].append(accu)
            except KeyError:
                accu_vs_remove[removed] = [accu]
            try:
                recall_vs_remove[removed].append(recall)
            except KeyError:
                recall_vs_remove[removed] = [recall]
# Collapse each list to its mean.
for key in accu_vs_remove:
    accu_vs_remove[key] = np.mean(accu_vs_remove[key])
for key in recall_vs_remove:
    recall_vs_remove[key] = np.mean(recall_vs_remove[key])
In [ ]:
accu_vs_remove  # notebook display: mean accuracy per removal count
In [71]:
import matplotlib.pyplot as plt
# Mean recall vs number of removed metabolites.
plot_dict(recall_vs_remove)
In [ ]:
# Compare consecutive result rows: does a rise in false positives coincide
# with a shrink of the DE set?
last_fp = 0
last_de = 100
increased_fp = []   # 1 = fp rose together with a DE decrease, 0 = fp rose alone
decreased_de = []   # 1 = DE fell together with an fp rise, 0 = DE fell alone
with open(shrink_results, 'r') as fh:
    lines = fh.readlines()
    for line in lines:
        if not line.startswith('#'):
            fields = line.rstrip().split('\t')
            fp = int(fields[2])
            de = int(fields[8])
            # NOTE(review): last_fp/last_de carry over across '#' simulation
            # boundaries, so rows of different simulations are compared —
            # confirm this is intended.
            if fp > last_fp and de < last_de:
                increased_fp.append(1)
                decreased_de.append(1)
            elif fp > last_fp and de == last_de:
                increased_fp.append(0)
            elif de < last_de:
                decreased_de.append(0)
            last_fp = fp
            last_de = de
In [ ]:
# Count of events where both signals moved together (the 1-entries).
print(np.sum(increased_fp))
print(np.sum(decreased_de))
In [ ]:
# Total number of fp-increase and DE-decrease events recorded.
print(len(increased_fp))
print(len(decreased_de))
In [ ]:
# Fetch the E. coli compound set — not used further in this chunk;
# presumably for a later cross-organism comparison.
ecoli_compounds = ora_msc.get_all_compounds('eco')