Kim2014 identification statistics


In [1]:
import os
import sys

src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
%matplotlib inline
import functools

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from ann_solo import reader, spectrum, utils

In [3]:
# plot styling
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.)    # single-column figure

In [4]:
tqdm.tqdm = tqdm.tqdm_notebook

In [5]:
def extract_time_from_log(filename):
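    """Parse the wall-clock runtime (in seconds) from a timing log file.

    Assumes the log contains a line of the form ``real 1m23.456s``, as
    written by e.g. the bash ``time`` builtin.
    """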
    with open(filename, 'r') as f_in:
        for line in f_in:
            if line.startswith('real'):
                # Wall clock time.
                realtime = line.split()[1]
                minutes = int(realtime[:realtime.find('m')])
                seconds = float(realtime[realtime.find('m') + 1:
                                         realtime.rfind('s')])
                realtime = minutes * 60 + seconds
                
                return realtime

In [6]:
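# Analysis settings: FDR threshold, precursor mass difference tolerance (and
# its unit), and the minimum SSM group size.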
max_fdr = 0.01
tol_mass = 0.1
tol_mode = 'Da'
min_group_size = 20

In [7]:
id_dir = '../../data/processed/kim2014/gpu'

In [8]:
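# Collect the SSMs from the mzTab result files and the runtimes and per-file
# identification counts from the corresponding log files.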
ssms = []
num_ssms, num_peptides, runtimes = [], [], []
for filename in tqdm.tqdm(os.listdir(id_dir), desc='Files processed',
                          unit='files'):
    filename_full = os.path.join(id_dir, filename)
    filename_base, ext = os.path.splitext(filename)
    if ext == '.log':
        runtimes.append((filename_base,
                         extract_time_from_log(filename_full)))
    elif ext == '.mztab':
        file_ssms = reader.read_mztab_ssms(filename_full)
        ssms.append(file_ssms)
        file_peptides = (file_ssms['sequence']
                         .str.replace(r'n?\[\d+\]', '', regex=True)
                         .unique())
        num_ssms.append((filename_base, len(file_ssms)))
        num_peptides.append((filename_base, len(file_peptides)))
        
ssms = pd.concat(ssms, ignore_index=True)

num_ssms_df = pd.DataFrame.from_records(num_ssms,
                                        columns=['filename', 'ssms'])
num_peptides_df = pd.DataFrame.from_records(num_peptides,
                                            columns=['filename', 'peptides'])
time_df = pd.DataFrame.from_records(runtimes, columns=['filename', 'time'])

stats = functools.reduce(
    lambda left, right: pd.merge(left, right, on='filename'),
    [num_ssms_df, num_peptides_df, time_df])
stats = stats.sort_values('filename').reset_index(drop=True)




In [9]:
print(f'Number of identified spectra: {stats["ssms"].sum():,.0f}\n'
      f'Total search time: {stats["time"].sum() / 3600:.0f} h '
      f'({stats["time"].sum() / 86400:.1f} days)\n'
      f'Search time per file: {stats["time"].mean() / 60:,.1f} ± '
      f'{stats["time"].std() / 60:,.1f} min')


Number of identified spectra: 14,032,494
Total search time: 281 h (11.7 days)
Search time per file: 8.0 ± 3.1 min

In [10]:
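# Tally the SSM counts reported in the log files: spectra identified after the
# standard (unmodified) search versus after the open search; their difference
# gives the number of modified SSMs.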
num_ssm_unmod = num_ssm_total = 0
for filename in os.listdir(id_dir):
    filename_full = os.path.join(id_dir, filename)
    filename_base, ext = os.path.splitext(filename)
    if ext == '.log':
        with open(filename_full, 'r') as f_in:
            for line in f_in:
                if line.endswith('spectra identified after the standard search\n'):
                    start = line.find(' : ') + 3
                    end = line.find(' ', start)
                    num_ssm_unmod += int(line[start:end])
                if line.endswith('spectra identified after the open search\n'):
                    start = line.find(' : ') + 3
                    end = line.find(' ', start)
                    num_ssm_total += int(line[start:end])
num_ssm_mod = num_ssm_total - num_ssm_unmod

In [11]:
print(f'Total number of identified spectra: {num_ssm_total:,.0f}\n'
      f'Number of unmodified identified spectra: {num_ssm_unmod:,.0f}\n'
      f'Number of modified identified spectra: {num_ssm_mod:,.0f}\n')


Total number of identified spectra: 14,032,494
Number of unmodified identified spectra: 9,760,497
Number of modified identified spectra: 4,271,997


In [12]:
def get_mass_groups(ssms, tol_mass, tol_mode):
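    """Group the SSMs by precursor mass difference.

    Starting from the highest-scoring SSM, all remaining SSMs whose
    precursor mass difference falls within the given tolerance are
    repeatedly assigned to the same group. Returns per-group statistics
    (median and mean mass difference, number of SSMs).
    """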
    ssms_remaining = ssms.sort_values('search_engine_score[1]',
                                      ascending=False)
    ssms_remaining['mass_diff'] = ((ssms_remaining['exp_mass_to_charge'] -
                                    ssms_remaining['calc_mass_to_charge']) *
                                   ssms_remaining['charge'])

    # Start with the highest ranked SSM.
    mass_groups = []
    while len(ssms_remaining) > 0:
        # Find all remaining SSMs within the mass difference window.
        mass_diff = ssms_remaining['mass_diff'].iloc[0]
        if (tol_mass is None or tol_mode not in ('Da', 'ppm') or
                min_group_size is None):
            mask = np.full(len(ssms_remaining), True, dtype=bool)
        elif tol_mode == 'Da':
            mask = (np.fabs(ssms_remaining['mass_diff'] - mass_diff) <=
                    tol_mass)
        elif tol_mode == 'ppm':
            mask = (np.fabs(ssms_remaining['mass_diff'] - mass_diff) /
                    ssms_remaining['exp_mass_to_charge'] * 10 ** 6
                    <= tol_mass)
        mass_groups.append(ssms_remaining[mask])
        # Exclude the selected SSMs from further selections.
        ssms_remaining = ssms_remaining[~mask]

    mass_group_stats = []
    for mass_group in mass_groups:
        mass_group_stats.append((mass_group['mass_diff'].median(),
                                 mass_group['mass_diff'].mean(),
                                 len(mass_group)))
    mass_group_stats = pd.DataFrame.from_records(
        mass_group_stats, columns=['mass_diff_median', 'mass_diff_mean',
                                   'num_ssms'])
    return mass_group_stats

In [13]:
mass_groups = get_mass_groups(ssms, tol_mass, tol_mode)

In [14]:
mass_groups.sort_values('num_ssms', ascending=False).head(10)


Out[14]:
    mass_diff_median  mass_diff_mean  num_ssms
0           0.000646        0.001236   9882777
8          57.022315       57.022839    308387
21         27.995986       27.998766    246428
16          0.993853        0.987444    219006
11         15.995468       15.996442    211927
5          -0.986346       -0.999017    163269
7          14.015814       14.013037    133020
76        -17.025133      -17.019578    129687
54        -18.009568      -18.006636    111075
4           1.988133        1.989208     99286
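
To sanity-check the manual annotations used in the plot below, the largest mass-difference groups can be matched against well-known modification delta masses. The following snippet is a minimal sketch rather than part of the original analysis; the known_modifications dictionary and its monoisotopic delta masses are assumed standard (Unimod) values.

# Minimal sketch (assumption: not an original notebook cell). Match the median
# mass differences of the largest SSM groups to a few well-known modification
# delta masses (standard monoisotopic values).
known_modifications = {
    'carbamidomethyl': 57.02146,
    'formylation': 27.99491,
    'oxidation': 15.99491,
    'first isotopic peak': 1.00336,
    'amidation': -0.98402,
}
for mass_group in (mass_groups.sort_values('num_ssms', ascending=False)
                   .head(10).itertuples()):
    for name, delta_mass in known_modifications.items():
        if abs(mass_group.mass_diff_median - delta_mass) <= tol_mass:
            print(f'{mass_group.mass_diff_median:+.4f} Da '
                  f'({mass_group.num_ssms:,} SSMs): {name}')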

In [15]:
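# Plot the number of SSMs as a function of the precursor mass difference
# (excluding unmodified SSMs) and annotate the most frequent modifications.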
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width * 1.5, height / 1.5))

# Exclude unmodified SSMs.
mask = mass_groups['mass_diff_median'].abs() > tol_mass
ax.bar(mass_groups[mask]['mass_diff_median'], mass_groups[mask]['num_ssms'],
       width=0.4, color='black')

# Annotate the most frequent modifications.
modifications = [('carbamidomethyl', 5, 400000), ('formylation', 5, 400000),
                 ('first isotopic peak', -10, 400000),
                 ('oxidation', 0, 300000), ('amidation', -25, 300000)]
for (annot, x, y), mass_group in zip(
        modifications, mass_groups.sort_values(
            'num_ssms', ascending=False).head(10)[1:].itertuples()):
    ax.annotate(annot,
                (mass_group.mass_diff_median, mass_group.num_ssms + 5000),
                (mass_group.mass_diff_median + x, y),
                arrowprops={'arrowstyle': '<-', 'linewidth': 1}, ha='center')

ax.set_xlim((-50, 100))
ax.set_ylim(0, 500000)

# Format y-axis numbers.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,g}'))

sns.despine(ax=ax)

ax.set_xlabel('Precursor mass difference (Da)')
ax.set_ylabel(f'Number of SSMs\n(FDR={max_fdr})')

plt.savefig('mass_diff.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()