HEK293 identification statistics


In [1]:
import os
import sys

src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
%matplotlib inline
import collections
import itertools
import string

import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import pyteomics.auxiliary
import seaborn as sns
import tqdm
from matplotlib_venn import venn3, venn3_circles

from ann_solo import reader, spectrum, util

In [3]:
# plot styling
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)    # single-column figure

In [4]:
tqdm.tqdm = tqdm.tqdm_notebook

In [5]:
def extract_time_from_log(filename):
    with open(filename, 'r') as f_in:
        for line in f_in:
            if line.startswith('user'):
                # user time
                usertime = line.split()[1]
                minutes = int(usertime[:usertime.find('m')])
                seconds = float(usertime[usertime.find('m') + 1: usertime.rfind('s')])
                usertime = minutes * 60 + seconds
                # sys time
                line = next(f_in)
                systime = line.split()[1]
                minutes = int(systime[:systime.find('m')])
                seconds = float(systime[systime.find('m') + 1: systime.rfind('s')])
                systime = minutes * 60 + seconds
                
                return usertime + systime

In [6]:
def read_spectrast_psms(filename):
    psms = []
    psms_df = pd.read_csv(filename, sep='\t', header=0)
    for psm_tuple in psms_df.itertuples(False):
        sequence, charge = psm_tuple.ID.split('/')
        query_spectrum = spectrum.Spectrum(psm_tuple._0, psm_tuple.MzDiff,
                                           int(charge))
        psm = spectrum.SpectrumMatch(query_spectrum,
                                     search_engine_score=psm_tuple.Dot)
        psm.sequence = sequence
        psm.calc_mass_to_charge = 0
        psm.is_decoy = 'DECOY_' in psm_tuple.Proteins
        psms.append(psm)
    return psms

def read_msfragger_psms(filename):
    psms = []
    psms_df = pd.read_csv(
        filename, sep='\t', header=None, names=[
            'ScanID', 'Precursor neutral mass (Da)',
            'Retention time (minutes)', 'Precursor charge', 'Hit rank',
            'Peptide Sequence', 'Upstream Amino Acid', 'Downstream Amino Acid',
            'Protein', 'Matched fragment ions',
            'Total possible number of matched theoretical fragment ions',
            'Neutral mass of peptide (including any variable modifications) (Da)',
            'Mass difference', 'Number of tryptic termini',
            'Number of missed cleavages', 'Variable modifications detected',
            'Hyperscore', 'Next score', 'Intercept of expectation model',
            'Slope of expectation model'])
    for psm_tuple in psms_df.itertuples(False):
        charge = psm_tuple._3
        mz = psm_tuple._12 / charge
        query_spectrum = spectrum.Spectrum(psm_tuple.ScanID, mz, charge, psm_tuple._2)
        psm = spectrum.SpectrumMatch(query_spectrum,
                                     search_engine_score=psm_tuple.Hyperscore)
        psm.sequence = psm_tuple._5
        psm.calc_mass_to_charge = 0
        psm.is_decoy = 'decoy_' in psm_tuple.Protein
        psms.append(psm)
    return psms

In [7]:
def psms_to_df(psms):
    sequences, ids, scores, charges, exp_masses, calc_masses, decoys = ([] for _ in range(7))
    for psm in psms:
        sequences.append(psm.sequence)
        ids.append(psm.query_id)
        scores.append(psm.search_engine_score)
        charges.append(psm.charge)
        exp_masses.append(psm.exp_mass_to_charge)
        calc_masses.append(psm.calc_mass_to_charge)
        decoys.append(psm.is_decoy)
    
    return pd.DataFrame({'sequence': sequences, 'PSM_ID': ids,
                         'search_engine_score[1]': scores, 'charge': charges,
                         'exp_mass_to_charge': exp_masses,
                         'calc_mass_to_charge': calc_masses,
                         'opt_ms_run[1]_cv_MS:1002217_decoy_peptide': decoys})

In [8]:
max_fdr = 0.01
tol_mass = 0.1
tol_mode = 'Da'
min_group_size = 20

In [9]:
hek293_dir = '../data/processed/hek293'

In [10]:
num_ids = []
runtimes = []
psms = collections.defaultdict(list)
total = 24 * 2 * 6    # 24 raw files * 2 (IDs & log) * 6 (search engine combos)
with tqdm.tqdm(desc='Files processed', unit='files', total=total) as pbar:
    for search_engine in ('ann-solo', 'spectrast', 'msfragger'):
        for search_mode in ('oms', 'std'):
            for filename in os.listdir(
                    os.path.join(hek293_dir, search_engine, search_mode)):
                filename_full = os.path.join(hek293_dir, search_engine,
                                             search_mode, filename)
                _, ext = os.path.splitext(filename_full)
                if ext == '.log':
                    runtimes.append((
                        search_engine, search_mode,
                        os.path.splitext(filename)[0],
                        extract_time_from_log(filename_full)))
                    pbar.update(1)
                elif ext in ('.mztab', '.txt', '.tsv'):
                    if ext == '.mztab':  # ANN-SoLo
                        file_psms = reader.read_mztab_psms(filename_full)
                    elif ext == '.txt':  # SpectraST
                        file_psms = psms_to_df(list(util.filter_group_fdr(
                            read_spectrast_psms(filename_full),
                            max_fdr, tol_mass, tol_mode, min_group_size)))
                    elif ext == '.tsv':  # MSFragger
                        file_psms = psms_to_df(list(util.filter_group_fdr(
                            read_msfragger_psms(filename_full),
                            max_fdr, tol_mass, tol_mode, min_group_size)))
                    psms[(search_engine, search_mode)].append(file_psms)
                    num_ids.append((search_engine, search_mode,
                                    os.path.splitext(filename)[0],
                                    len(file_psms)))
                    pbar.update(1)

num_ids_df = pd.DataFrame.from_records(
    num_ids, columns=['search_engine', 'search_mode', 'filename', 'psms'])
time_df = pd.DataFrame.from_records(
    runtimes, columns=['search_engine', 'search_mode', 'filename', 'time'])
stats = (pd.merge(num_ids_df, time_df,
                 on=['search_engine', 'search_mode', 'filename'])
         .sort_values(['search_engine', 'search_mode', 'filename'])
         .reset_index(drop=True))

summary = (stats.groupby(['search_engine', 'search_mode'])
           .agg({'psms': 'sum', 'time': 'mean'}))
summary['time'] = summary['time'] / 60

psms_df, peptides = [], collections.defaultdict(dict)
for (search_engine, search_mode), psm_list in psms.items():
    sequences = set(pd.concat(psm_list)['sequence']
                   .str.replace(r'n?\[\d+\]', '').unique())
    psms_df.append((search_engine, search_mode, len(sequences)))
    peptides[search_mode][search_engine] = sequences
psms_df = pd.DataFrame.from_records(
    psms_df, index=['search_engine', 'search_mode'],
    columns=['search_engine', 'search_mode', 'peptides'])
summary = summary.join(psms_df)


/Users/Wout/.conda/envs/ann-solo/lib/python3.6/site-packages/pyteomics/auxiliary/target_decoy.py:79: RuntimeWarning: divide by zero encountered in true_divide
  q = tfalse / (ind - cumsum) / ratio


In [11]:
summary


Out[11]:
psms time peptides
search_engine search_mode
ann-solo oms 647469 108.531303 153605
std 352938 24.015849 105870
msfragger oms 526027 34.745542 126364
std 344998 0.697961 104672
spectrast oms 473729 1276.663253 112375
std 369079 5.205294 102077

In [12]:
# export LaTeX table
with open('../../reports/rev/tab-hek293_stats.tex', 'w') as f_out:
    f_out.write('\\begin{tabularx}{0.5\\textwidth}{Xrrr}\n')
    f_out.write('\\toprule\n')
    f_out.write('Search engine & Time (\\si{\\minute}) & \\#~SSMs & \\#~Peptides \\\\\n')
    f_out.write('\\midrule\n')
    f_out.write('\\multicolumn{4}{l}{\\emph{Standard search}} \\\\\n')
    for search_engine in ('MSFragger', 'SpectraST', 'ANN-SoLo'):
        num_psms, time, num_pep = summary.loc[search_engine.lower(), 'std']
        f_out.write(f'\\mbox{{{search_engine}}} & \\num{{{round(time, 1)}}} & '
                    f'\\num{{{int(num_psms)}}} & \\num{{{int(num_pep)}}} \\\\\n')
    f_out.write('\\addlinespace\n')
    f_out.write('\\multicolumn{4}{l}{\\emph{Open search}} \\\\\n')
    for search_engine in ('MSFragger', 'SpectraST', 'ANN-SoLo'):
        num_psms, time, num_pep = summary.loc[search_engine.lower(), 'oms']
        f_out.write(f'\\mbox{{{search_engine}}} & \\num{{{round(time, 1)}}} & '
                    f'\\num{{{int(num_psms)}}} & \\num{{{int(num_pep)}}} \\\\\n')
    f_out.write('\\bottomrule\n')
    f_out.write('\\end{tabularx}\n')

In [13]:
for search_mode, search_engines in peptides.items():
    for search_engine, sequences in search_engines.items():
        peptides[search_mode][search_engine] = set([pep.replace('I', 'L') for pep in sequences])

In [14]:
height = 7
width = height * 2
fig, axes = plt.subplots(1, 2, figsize=(width, height))

label_map = {'ann-solo': 'ANN-SoLo', 'msfragger': 'MSFragger',
             'spectrast': 'SpectraST'}
for i, (search_mode, ax) in enumerate(zip(('std', 'oms'), axes)):
    v = venn3(peptides[search_mode].values(),
              [label_map[key] for key in peptides[search_mode].keys()],
              set_colors=[next(ax._get_lines.prop_cycler)['color'],
                          next(ax._get_lines.prop_cycler)['color'],
                          next(ax._get_lines.prop_cycler)['color']],
              alpha=1, ax=ax)
    # make sure the numbers are legible
    for text in v.subset_labels:
        text.set_color('white')
        text.set_path_effects([path_effects.Stroke(linewidth=3, foreground='black'),
                               path_effects.Normal()])
    c = venn3_circles(peptides[search_mode].values(), linewidth=1.0, ax=ax)
    
    ax.annotate(string.ascii_uppercase[i], xy=(0.5 * i, 1), xytext=(5, -20),
                xycoords='figure fraction', textcoords='offset pixels',
                fontsize='xx-large', weight='bold')
    
plt.tight_layout()

plt.savefig('hek293_peptides_venn.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()



In [15]:
def get_mass_groups(psms, tol_mass, tol_mode):
    # start with the highest ranked PSM
    mass_groups = []
    psms_remaining = psms.sort_values('search_engine_score[1]', ascending=False)
    while len(psms_remaining) > 0:
        # find all remaining PSMs within the precursor mass window
        mass_diff = psms_remaining['mass_diff'].iloc[0]
        if tol_mass is None or tol_mode not in ('Da', 'ppm'):
            psms_selected = psms_remaining
        elif tol_mode == 'Da':
            psms_selected = \
                psms_remaining[(psms_remaining['mass_diff'] - mass_diff).abs()
                               <= tol_mass]
        elif tol_mode == 'ppm':
            psms_selected = \
                psms_remaining[(psms_remaining['mass_diff'] - mass_diff).abs()
                               / psms_remaining['exp_mass_to_charge'] * 10 ** 6
                               <= tol_mass]
        mass_groups.append(psms_selected)
        # exclude the selected PSMs from further selections
        psms_remaining.drop(psms_selected.index, inplace=True)

    mass_group_stats = []
    for mass_group in mass_groups:
        mass_group_stats.append((mass_group['mass_diff'].median(),
                                 mass_group['mass_diff'].mean(),
                                 len(mass_group)))
    mass_group_stats = pd.DataFrame.from_records(
        mass_group_stats, columns=['mass_diff_median', 'mass_diff_mean', 'num_psms'])
    return mass_group_stats

In [16]:
for psms_key, psms_list in psms.items():
    psms[psms_key] = pd.concat(psms_list, ignore_index=True)

In [17]:
search_engines = ('ANN-SoLo', 'SpectraST', 'MSFragger')
for search_engine in search_engines:
    psms_oms = psms[(search_engine.lower(), 'oms')]
    psms_oms['mass_diff'] = (
        (psms_oms['exp_mass_to_charge'] - psms_oms['calc_mass_to_charge'])
        * psms_oms['charge'])

In [18]:
mass_groups = {}
for search_engine in search_engines:
    psms_oms = psms[(search_engine.lower(), 'oms')]
    psms_oms['mass_diff'] = (
        (psms_oms['exp_mass_to_charge'] - psms_oms['calc_mass_to_charge'])
        * psms_oms['charge'])
    mass_groups[search_engine.lower()] = get_mass_groups(psms_oms, tol_mass, tol_mode)

In [19]:
width = 7
height = width / 1.618 / 2 * len(search_engines)
fig, axes = plt.subplots(len(search_engines), 1, sharex=True, figsize=(width, height))

for ax, search_engine in zip(axes, search_engines):
    mass_group = mass_groups[search_engine.lower()]
    mask = mass_group['mass_diff_median'].abs() > tol_mass   # exclude unmodified PSMs
    ax.bar(mass_group[mask]['mass_diff_median'], mass_group[mask]['num_psms'],
           width=0.4, log=False, color='black')

    ax.set_xlim((-50, 200))

    # format y-axis numbers
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,g}'))
    
    ax.set_ylabel(search_engine)

    sns.despine(ax=ax)
    
# set tick labels at nice positions
axes[-1].xaxis.set_major_locator(mticker.MultipleLocator(50))
axes[-1].set_xlabel('Precursor mass difference (Da)')

ax = fig.add_subplot(111, frameon=False)
ax.grid(False)
ax.tick_params(labelcolor='none', top='off', bottom='off', left='off', right='off')
ax.set_ylabel(f'Number of SSMs (FDR={max_fdr})', labelpad=40)

plt.savefig('mass_diff.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()