In [1]:
import os
import sys
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
sys.path.append(src_dir)
In [2]:
%matplotlib inline
import collections
import itertools
import string
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import pyteomics.auxiliary
import seaborn as sns
import tqdm
from matplotlib_venn import venn3, venn3_circles
from ann_solo import reader, spectrum, util
In [3]:
# plot styling
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3) # single-column figure
In [4]:
tqdm.tqdm = tqdm.tqdm_notebook
In [5]:
def extract_time_from_log(filename):
with open(filename, 'r') as f_in:
for line in f_in:
if line.startswith('user'):
# user time
usertime = line.split()[1]
minutes = int(usertime[:usertime.find('m')])
seconds = float(usertime[usertime.find('m') + 1: usertime.rfind('s')])
usertime = minutes * 60 + seconds
# sys time
line = next(f_in)
systime = line.split()[1]
minutes = int(systime[:systime.find('m')])
seconds = float(systime[systime.find('m') + 1: systime.rfind('s')])
systime = minutes * 60 + seconds
return usertime + systime
In [6]:
def read_spectrast_psms(filename):
psms = []
psms_df = pd.read_csv(filename, sep='\t', header=0)
for psm_tuple in psms_df.itertuples(False):
sequence, charge = psm_tuple.ID.split('/')
query_spectrum = spectrum.Spectrum(psm_tuple._0, psm_tuple.MzDiff,
int(charge))
psm = spectrum.SpectrumMatch(query_spectrum,
search_engine_score=psm_tuple.Dot)
psm.sequence = sequence
psm.calc_mass_to_charge = 0
psm.is_decoy = 'DECOY_' in psm_tuple.Proteins
psms.append(psm)
return psms
def read_msfragger_psms(filename):
psms = []
psms_df = pd.read_csv(
filename, sep='\t', header=None, names=[
'ScanID', 'Precursor neutral mass (Da)',
'Retention time (minutes)', 'Precursor charge', 'Hit rank',
'Peptide Sequence', 'Upstream Amino Acid', 'Downstream Amino Acid',
'Protein', 'Matched fragment ions',
'Total possible number of matched theoretical fragment ions',
'Neutral mass of peptide (including any variable modifications) (Da)',
'Mass difference', 'Number of tryptic termini',
'Number of missed cleavages', 'Variable modifications detected',
'Hyperscore', 'Next score', 'Intercept of expectation model',
'Slope of expectation model'])
for psm_tuple in psms_df.itertuples(False):
charge = psm_tuple._3
mz = psm_tuple._12 / charge
query_spectrum = spectrum.Spectrum(psm_tuple.ScanID, mz, charge, psm_tuple._2)
psm = spectrum.SpectrumMatch(query_spectrum,
search_engine_score=psm_tuple.Hyperscore)
psm.sequence = psm_tuple._5
psm.calc_mass_to_charge = 0
psm.is_decoy = 'decoy_' in psm_tuple.Protein
psms.append(psm)
return psms
In [7]:
def psms_to_df(psms):
sequences, ids, scores, charges, exp_masses, calc_masses, decoys = ([] for _ in range(7))
for psm in psms:
sequences.append(psm.sequence)
ids.append(psm.query_id)
scores.append(psm.search_engine_score)
charges.append(psm.charge)
exp_masses.append(psm.exp_mass_to_charge)
calc_masses.append(psm.calc_mass_to_charge)
decoys.append(psm.is_decoy)
return pd.DataFrame({'sequence': sequences, 'PSM_ID': ids,
'search_engine_score[1]': scores, 'charge': charges,
'exp_mass_to_charge': exp_masses,
'calc_mass_to_charge': calc_masses,
'opt_ms_run[1]_cv_MS:1002217_decoy_peptide': decoys})
In [8]:
max_fdr = 0.01
tol_mass = 0.1
tol_mode = 'Da'
min_group_size = 20
In [9]:
hek293_dir = '../data/processed/hek293'
In [10]:
num_ids = []
runtimes = []
psms = collections.defaultdict(list)
total = 24 * 2 * 6 # 24 raw files * 2 (IDs & log) * 6 (search engine combos)
with tqdm.tqdm(desc='Files processed', unit='files', total=total) as pbar:
for search_engine in ('ann-solo', 'spectrast', 'msfragger'):
for search_mode in ('oms', 'std'):
for filename in os.listdir(
os.path.join(hek293_dir, search_engine, search_mode)):
filename_full = os.path.join(hek293_dir, search_engine,
search_mode, filename)
_, ext = os.path.splitext(filename_full)
if ext == '.log':
runtimes.append((
search_engine, search_mode,
os.path.splitext(filename)[0],
extract_time_from_log(filename_full)))
pbar.update(1)
elif ext in ('.mztab', '.txt', '.tsv'):
if ext == '.mztab': # ANN-SoLo
file_psms = reader.read_mztab_psms(filename_full)
elif ext == '.txt': # SpectraST
file_psms = psms_to_df(list(util.filter_group_fdr(
read_spectrast_psms(filename_full),
max_fdr, tol_mass, tol_mode, min_group_size)))
elif ext == '.tsv': # MSFragger
file_psms = psms_to_df(list(util.filter_group_fdr(
read_msfragger_psms(filename_full),
max_fdr, tol_mass, tol_mode, min_group_size)))
psms[(search_engine, search_mode)].append(file_psms)
num_ids.append((search_engine, search_mode,
os.path.splitext(filename)[0],
len(file_psms)))
pbar.update(1)
num_ids_df = pd.DataFrame.from_records(
num_ids, columns=['search_engine', 'search_mode', 'filename', 'psms'])
time_df = pd.DataFrame.from_records(
runtimes, columns=['search_engine', 'search_mode', 'filename', 'time'])
stats = (pd.merge(num_ids_df, time_df,
on=['search_engine', 'search_mode', 'filename'])
.sort_values(['search_engine', 'search_mode', 'filename'])
.reset_index(drop=True))
summary = (stats.groupby(['search_engine', 'search_mode'])
.agg({'psms': 'sum', 'time': 'mean'}))
summary['time'] = summary['time'] / 60
psms_df, peptides = [], collections.defaultdict(dict)
for (search_engine, search_mode), psm_list in psms.items():
sequences = set(pd.concat(psm_list)['sequence']
.str.replace(r'n?\[\d+\]', '').unique())
psms_df.append((search_engine, search_mode, len(sequences)))
peptides[search_mode][search_engine] = sequences
psms_df = pd.DataFrame.from_records(
psms_df, index=['search_engine', 'search_mode'],
columns=['search_engine', 'search_mode', 'peptides'])
summary = summary.join(psms_df)
In [11]:
summary
Out[11]:
In [12]:
# export LaTeX table
with open('../../reports/rev/tab-hek293_stats.tex', 'w') as f_out:
f_out.write('\\begin{tabularx}{0.5\\textwidth}{Xrrr}\n')
f_out.write('\\toprule\n')
f_out.write('Search engine & Time (\\si{\\minute}) & \\#~SSMs & \\#~Peptides \\\\\n')
f_out.write('\\midrule\n')
f_out.write('\\multicolumn{4}{l}{\\emph{Standard search}} \\\\\n')
for search_engine in ('MSFragger', 'SpectraST', 'ANN-SoLo'):
num_psms, time, num_pep = summary.loc[search_engine.lower(), 'std']
f_out.write(f'\\mbox{{{search_engine}}} & \\num{{{round(time, 1)}}} & '
f'\\num{{{int(num_psms)}}} & \\num{{{int(num_pep)}}} \\\\\n')
f_out.write('\\addlinespace\n')
f_out.write('\\multicolumn{4}{l}{\\emph{Open search}} \\\\\n')
for search_engine in ('MSFragger', 'SpectraST', 'ANN-SoLo'):
num_psms, time, num_pep = summary.loc[search_engine.lower(), 'oms']
f_out.write(f'\\mbox{{{search_engine}}} & \\num{{{round(time, 1)}}} & '
f'\\num{{{int(num_psms)}}} & \\num{{{int(num_pep)}}} \\\\\n')
f_out.write('\\bottomrule\n')
f_out.write('\\end{tabularx}\n')
In [13]:
for search_mode, search_engines in peptides.items():
for search_engine, sequences in search_engines.items():
peptides[search_mode][search_engine] = set([pep.replace('I', 'L') for pep in sequences])
In [14]:
height = 7
width = height * 2
fig, axes = plt.subplots(1, 2, figsize=(width, height))
label_map = {'ann-solo': 'ANN-SoLo', 'msfragger': 'MSFragger',
'spectrast': 'SpectraST'}
for i, (search_mode, ax) in enumerate(zip(('std', 'oms'), axes)):
v = venn3(peptides[search_mode].values(),
[label_map[key] for key in peptides[search_mode].keys()],
set_colors=[next(ax._get_lines.prop_cycler)['color'],
next(ax._get_lines.prop_cycler)['color'],
next(ax._get_lines.prop_cycler)['color']],
alpha=1, ax=ax)
# make sure the numbers are legible
for text in v.subset_labels:
text.set_color('white')
text.set_path_effects([path_effects.Stroke(linewidth=3, foreground='black'),
path_effects.Normal()])
c = venn3_circles(peptides[search_mode].values(), linewidth=1.0, ax=ax)
ax.annotate(string.ascii_uppercase[i], xy=(0.5 * i, 1), xytext=(5, -20),
xycoords='figure fraction', textcoords='offset pixels',
fontsize='xx-large', weight='bold')
plt.tight_layout()
plt.savefig('hek293_peptides_venn.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
In [15]:
def get_mass_groups(psms, tol_mass, tol_mode):
# start with the highest ranked PSM
mass_groups = []
psms_remaining = psms.sort_values('search_engine_score[1]', ascending=False)
while len(psms_remaining) > 0:
# find all remaining PSMs within the precursor mass window
mass_diff = psms_remaining['mass_diff'].iloc[0]
if tol_mass is None or tol_mode not in ('Da', 'ppm'):
psms_selected = psms_remaining
elif tol_mode == 'Da':
psms_selected = \
psms_remaining[(psms_remaining['mass_diff'] - mass_diff).abs()
<= tol_mass]
elif tol_mode == 'ppm':
psms_selected = \
psms_remaining[(psms_remaining['mass_diff'] - mass_diff).abs()
/ psms_remaining['exp_mass_to_charge'] * 10 ** 6
<= tol_mass]
mass_groups.append(psms_selected)
# exclude the selected PSMs from further selections
psms_remaining.drop(psms_selected.index, inplace=True)
mass_group_stats = []
for mass_group in mass_groups:
mass_group_stats.append((mass_group['mass_diff'].median(),
mass_group['mass_diff'].mean(),
len(mass_group)))
mass_group_stats = pd.DataFrame.from_records(
mass_group_stats, columns=['mass_diff_median', 'mass_diff_mean', 'num_psms'])
return mass_group_stats
In [16]:
for psms_key, psms_list in psms.items():
psms[psms_key] = pd.concat(psms_list, ignore_index=True)
In [17]:
search_engines = ('ANN-SoLo', 'SpectraST', 'MSFragger')
for search_engine in search_engines:
psms_oms = psms[(search_engine.lower(), 'oms')]
psms_oms['mass_diff'] = (
(psms_oms['exp_mass_to_charge'] - psms_oms['calc_mass_to_charge'])
* psms_oms['charge'])
In [18]:
mass_groups = {}
for search_engine in search_engines:
psms_oms = psms[(search_engine.lower(), 'oms')]
psms_oms['mass_diff'] = (
(psms_oms['exp_mass_to_charge'] - psms_oms['calc_mass_to_charge'])
* psms_oms['charge'])
mass_groups[search_engine.lower()] = get_mass_groups(psms_oms, tol_mass, tol_mode)
In [19]:
width = 7
height = width / 1.618 / 2 * len(search_engines)
fig, axes = plt.subplots(len(search_engines), 1, sharex=True, figsize=(width, height))
for ax, search_engine in zip(axes, search_engines):
mass_group = mass_groups[search_engine.lower()]
mask = mass_group['mass_diff_median'].abs() > tol_mass # exclude unmodified PSMs
ax.bar(mass_group[mask]['mass_diff_median'], mass_group[mask]['num_psms'],
width=0.4, log=False, color='black')
ax.set_xlim((-50, 200))
# format y-axis numbers
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,g}'))
ax.set_ylabel(search_engine)
sns.despine(ax=ax)
# set tick labels at nice positions
axes[-1].xaxis.set_major_locator(mticker.MultipleLocator(50))
axes[-1].set_xlabel('Precursor mass difference (Da)')
ax = fig.add_subplot(111, frameon=False)
ax.grid(False)
ax.tick_params(labelcolor='none', top='off', bottom='off', left='off', right='off')
ax.set_ylabel(f'Number of SSMs (FDR={max_fdr})', labelpad=40)
plt.savefig('mass_diff.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()