In [1]:
import os
import sys
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
sys.path.append(src_dir)
In [2]:
%matplotlib inline
import itertools
import logging
import multiprocessing
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from ann_solo import reader, spectral_library, writer
from ann_solo.config import config
In [3]:
tqdm.tqdm = tqdm.tqdm_notebook
# plot styling
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context(font_scale=1.3) # single-column figure
# initialize logging
logging.basicConfig(format='%(asctime)s [%(levelname)s/%(processName)s] '
'%(module)s.%(funcName)s : %(message)s',
level=logging.INFO)
In [4]:
config_filename = '../../bin/ann-solo/iprg2012.ini'
splib_filename = '../../data/interim/iprg2012/human_yeast_targetdecoy.splib'
mgf_filename = '../../data/external/iprg2012/iPRG2012.mgf'
out_dir = '../../data/processed/iprg2012/spectrum_representation'
We consider the following options for the representation of MS2 spectra:
These representation options are evaluated based on the identification performance in terms of the number of SSMs at a 1% FDR.
Search settings:
In [5]:
def _do_search(settings):
config.parse(settings)
spec_lib = spectral_library.SpectralLibrary(
config.spectral_library_filename)
identifications = spec_lib.search(config.query_filename)
writer.write_mztab(identifications, config.out_filename,
spec_lib._library_reader)
spec_lib.shutdown()
# Go through all combinations of number of peaks and scaling methods.
for num_peaks_query, num_peaks_library, scaling in itertools.product(
np.arange(25, 151, 25), np.arange(25, 151, 25), ('sqrt', 'rank')):
out_filename = os.path.join(out_dir,
f'num_peaks_query_{num_peaks_query}-'\
f'num_peaks_library_{num_peaks_library}-'\
f'scaling_{scaling}.mztab')
if not os.path.isfile(out_filename):
settings = [f'--config {config_filename}', '--mode bf',
f'--max_peaks_used {num_peaks_query}',
f'--max_peaks_used_library {num_peaks_library}',
f'--scaling {scaling}',
splib_filename, mgf_filename, out_filename]
proc = multiprocessing.Process(target=_do_search,
args=(' '.join(settings),))
proc.start()
proc.join()
In [6]:
pattern_numpeaks_scaling = re.compile('^num_peaks_query_(\d+)-'
'num_peaks_library_(\d+)-'
'scaling_(rank|sqrt).mztab$')
ssms = {'num_peaks_query': [], 'num_peaks_library': [], 'scaling': [],
'ssms': []}
for filename in os.listdir(out_dir):
match = pattern_numpeaks_scaling.match(filename)
if match is not None:
ssms['num_peaks_query'].append(int(match.group(1)))
ssms['num_peaks_library'].append(int(match.group(2)))
ssms['scaling'].append(match.group(3))
ssms['ssms'].append(
len(reader.read_mztab_ssms(os.path.join(out_dir, filename))))
ssms = pd.DataFrame(ssms)
In [7]:
def _plot_heatmap(*args, **kwargs):
sns.heatmap(kwargs.pop('data').pivot(
'num_peaks_query', 'num_peaks_library', 'ssms'), **kwargs)
fg = sns.FacetGrid(ssms, col='scaling', sharex=True, sharey=True,
legend_out=True, height=7, aspect=1)
cbar_ax = fg.fig.add_axes([1.02, 0.15, 0.025, 0.7])
fg.map_dataframe(_plot_heatmap,
vmin=ssms['ssms'].min(), vmax=ssms['ssms'].max(),
cmap='viridis', annot=True, fmt='d', linewidths=1.0,
cbar_ax=cbar_ax, square=True)
fg.facet_axis(0, 0).invert_yaxis()
fg.set_axis_labels('Number of library spectrum peaks',
'Number of query spectrum peaks')
cbar_ax.set_ylabel('Number of SSMs (1% FDR)', size='large')
for ax in fg.axes.ravel():
ax.tick_params(axis='both', which='major', labelsize='medium')
ax.set_xlabel(ax.get_xlabel(), fontsize='large')
ax.set_ylabel(ax.get_ylabel(), fontsize='large')
fg.set_titles(size='large')
cbar_ax.tick_params(labelsize='medium')
plt.savefig('preprocess.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
In [8]:
logging.shutdown()