In [1]:
import os
import sys
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
sys.path.append(src_dir)
In [2]:
%matplotlib inline
import functools
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from ann_solo import reader, spectrum, utils
In [3]:
# plot styling
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.) # single-column figure
In [4]:
tqdm.tqdm = tqdm.tqdm_notebook
In [5]:
def extract_time_from_log(filename):
with open(filename, 'r') as f_in:
for line in f_in:
if line.startswith('real'):
# Wall clock time.
realtime = line.split()[1]
minutes = int(realtime[:realtime.find('m')])
seconds = float(realtime[realtime.find('m') + 1:
realtime.rfind('s')])
realtime = minutes * 60 + seconds
return realtime
In [6]:
max_fdr = 0.01
tol_mass = 0.1
tol_mode = 'Da'
min_group_size = 20
In [7]:
id_dir = '../../data/processed/kim2014/gpu'
In [8]:
ssms = []
num_ssms, num_peptides, runtimes = [], [], []
for filename in tqdm.tqdm(os.listdir(id_dir), desc='Files processed',
unit='files'):
filename_full = os.path.join(id_dir, filename)
filename_base, ext = os.path.splitext(filename)
if ext == '.log':
runtimes.append((os.path.splitext(filename)[0],
extract_time_from_log(filename_full)))
elif ext == '.mztab':
file_ssms = reader.read_mztab_ssms(filename_full)
ssms.append(file_ssms)
file_peptides = (file_ssms['sequence'].str.replace(r'n?\[\d+\]', '')
.unique())
num_ssms.append((filename_base, len(file_ssms)))
num_peptides.append((filename_base, len(file_peptides)))
ssms = pd.concat(ssms, ignore_index=True)
num_ssms_df = pd.DataFrame.from_records(num_ssms,
columns=['filename', 'ssms'])
num_peptides_df = pd.DataFrame.from_records(num_peptides,
columns=['filename', 'peptides'])
time_df = pd.DataFrame.from_records(runtimes, columns=['filename', 'time'])
stats = functools.reduce(
lambda left, right: pd.merge(left, right, on='filename'),
[num_ssms_df, num_peptides_df, time_df])
stats = stats.sort_values('filename').reset_index(drop=True)
In [9]:
print(f'Number of identified spectra: {stats["ssms"].sum():,.0f}\n'
f'Total search time: {stats["time"].sum() / 3600:.0f} h '
f'({stats["time"].sum() / 86400:.1f} days)\n'
f'Search time per file: {stats["time"].mean() / 60:,.1f} '
f'± {stats["time"].std() / 60:,.1f} min')
In [10]:
num_ssm_unmod = num_ssm_total = 0
for filename in os.listdir(id_dir):
filename_full = os.path.join(id_dir, filename)
filename_base, ext = os.path.splitext(filename)
if ext == '.log':
with open(filename_full, 'r') as f_in:
for line in f_in:
if line.endswith('spectra identified after the standard search\n'):
start = line.find(' : ') + 3
end = line.find(' ', start)
num_ssm_unmod += int(line[start:end])
if line.endswith('spectra identified after the open search\n'):
start = line.find(' : ') + 3
end = line.find(' ', start)
num_ssm_total += int(line[start:end])
num_ssm_mod = num_ssm_total - num_ssm_unmod
In [11]:
print(f'Total number of identified spectra: {num_ssm_total:,.0f}\n'
f'Number of unmodified identified spectra: {num_ssm_unmod:,.0f}\n'
f'Number of modified identified spectra: {num_ssm_mod:,.0f}\n')
In [12]:
def get_mass_groups(ssms, tol_mass, tol_mode):
ssms_remaining = ssms.sort_values('search_engine_score[1]',
ascending=False)
ssms_remaining['mass_diff'] = ((ssms_remaining['exp_mass_to_charge'] -
ssms_remaining['calc_mass_to_charge']) *
ssms_remaining['charge'])
# Start with the highest ranked SSM.
mass_groups = []
while ssms_remaining.size > 0:
# Find all remaining PSMs within the mass difference window.
mass_diff = ssms_remaining['mass_diff'].iloc[0]
if (tol_mass is None or tol_mode not in ('Da', 'ppm') or
min_group_size is None):
mask = np.full(len(ssms_remaining), True, dtype=bool)
elif tol_mode == 'Da':
mask = (np.fabs(ssms_remaining['mass_diff'] - mass_diff) <=
tol_mass)
elif tol_mode == 'ppm':
mask = (np.fabs(ssms_remaining['mass_diff'] - mass_diffs) /
ssms_remaining['exp_mass_to_charge'] * 10 ** 6
<= tol_mass)
mass_groups.append(ssms_remaining[mask])
# Exclude the selected SSMs from further selections.
ssms_remaining = ssms_remaining[~mask]
mass_group_stats = []
for mass_group in mass_groups:
mass_group_stats.append((mass_group['mass_diff'].median(),
mass_group['mass_diff'].mean(),
len(mass_group)))
mass_group_stats = pd.DataFrame.from_records(
mass_group_stats, columns=['mass_diff_median', 'mass_diff_mean',
'num_ssms'])
return mass_group_stats
In [13]:
mass_groups = get_mass_groups(ssms, tol_mass, tol_mode)
In [14]:
mass_groups.sort_values('num_ssms', ascending=False).head(10)
Out[14]:
In [15]:
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width * 1.5, height / 1.5))
# Exclude unmodified SSMs.
mask = mass_groups['mass_diff_median'].abs() > tol_mass
ax.bar(mass_groups[mask]['mass_diff_median'], mass_groups[mask]['num_ssms'],
width=0.4, color='black')
# Annotate the most frequent modifications.
modifications = [('carbamidomethyl', 5, 400000), ('formylation', 5, 400000),
('first isotopic peak', -10, 400000),
('oxidation', 0, 300000), ('amidation', -25, 300000)]
for (annot, x, y), mass_group in zip(
modifications, mass_groups.sort_values(
'num_ssms', ascending=False).head(10)[1:].itertuples()):
ax.annotate(annot,
(mass_group.mass_diff_median, mass_group.num_ssms + 5000),
(mass_group.mass_diff_median + x, y),
arrowprops={'arrowstyle': '<-', 'linewidth': 1}, ha='center')
ax.set_xlim((-50, 100))
ax.set_ylim(0, 500000)
# Format y-axis numbers.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,g}'))
sns.despine(ax=ax)
ax.set_xlabel('Precursor mass difference (Da)')
ax.set_ylabel(f'Number of SSMs\n(FDR={max_fdr})')
plt.savefig('mass_diff.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()