In [1]:
import os
import sys
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
sys.path.append(src_dir)
In [2]:
%matplotlib inline
import math
import Levenshtein
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import squarify
from matplotlib_venn import venn2, venn2_circles
from ann_solo import reader
In [3]:
# plot styling
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.) # two-column figure
In [4]:
psms_consensus = pd.read_csv(
'../../data/external/iprg2012/iprg2012ConsensusSpectrumIDcomparison.tsv',
sep='\t', header=0, skipfooter=4, engine='python').rename(
columns={'bestSequence': 'sequence_consensus',
'Precursor_z': 'charge_consensus'})
# get the same PSM identifiers
psms_consensus = psms_consensus.set_index(psms_consensus['Index1_de'] - 1)
psms_annsolo = reader.read_mztab_ssms(
'../../data/processed/iprg2012/brute_force/bf_oms_shifted.mztab')
In [5]:
psms_annsolo['mass_diff'] = (
(psms_annsolo['exp_mass_to_charge'] - psms_annsolo['calc_mass_to_charge'])
* psms_annsolo['charge'])
psms = psms_annsolo[['sequence', 'charge', 'search_engine_score[1]', 'mass_diff']].join(
psms_consensus[['sequence_consensus', 'charge_consensus']], how='outer')
# don't disambiguate between I and L
for label in ['sequence', 'sequence_consensus']:
psms[label] = psms[label].str.replace('I', 'L')
# remove SpectraST modification masses
psms['sequence'] = psms['sequence'].str.replace(r'n?\[\d+\]', '')
In [6]:
def edit_distance(seq1, seq2, normed=False):
if not pd.isnull(seq1) and not pd.isnull(seq2):
dist = Levenshtein.distance(seq1, seq2)
if normed:
dist /= max(len(seq1), len(seq2))
return dist
else:
return math.inf
psms['edit_dist'] = psms.apply(
lambda psm: edit_distance(psm['sequence'], psm['sequence_consensus']),
axis=1)
psms['edit_dist_norm'] = psms.apply(
lambda psm: edit_distance(psm['sequence'], psm['sequence_consensus'], True),
axis=1)
In [7]:
# get unique keys for spectrum ID - peptide assignment combinations
set_consensus = set(psms.loc[
psms['sequence_consensus'].notnull(), 'sequence_consensus'].items())
set_annsolo = set(psms.loc[psms['sequence'].notnull(), 'sequence'].items())
In [8]:
width = 7
height = width / 1.618 # golden ratio
fig, ax = plt.subplots(figsize=(width, height))
v = venn2([set_annsolo, set_consensus],
set_labels=['ANN-SoLo', 'iPRG2012 consensus'],
set_colors=[next(ax._get_lines.prop_cycler)['color'],
next(ax._get_lines.prop_cycler)['color']],
alpha=1., ax=ax)
c = venn2_circles([set_annsolo, set_consensus], linewidth=1.0, ax=ax)
# plt.savefig('iprg2012_consensus_venn.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
In [9]:
psms_match = psms[psms['sequence_consensus'].notnull() &
psms['sequence'].notnull() &
(psms['sequence_consensus'] == psms['sequence'])]
psms_different = psms[psms['sequence_consensus'].notnull() &
psms['sequence'].notnull() &
(psms['sequence_consensus'] != psms['sequence'])]
psms_unique_consensus = (psms[psms['sequence_consensus'].notnull()]
.drop(psms_match.index, errors='ignore')
.drop(psms_different.index, errors='ignore'))
psms_unique_annsolo = (psms[psms['sequence'].notnull()]
.drop(psms_match.index, errors='ignore')
.drop(psms_different.index, errors='ignore'))
In [10]:
print(f'# identical PSMs: {len(psms_match)}\n'
f'# conflicting PSMs: {len(psms_different)}\n'
f'# unique PSMs ANN-SoLo: {len(psms_unique_annsolo)}\n'
f'# unique PSMs iPRG2012 consensus: {len(psms_unique_consensus)}')
In [11]:
width = 7
height = width / 1.618 # golden ratio
fig, ax = plt.subplots(figsize=(width, height))
# ax.hist(psms_different['edit_dist_norm'], bins=np.arange(0, 1.05, 0.05))
ax.hist(psms_different['edit_dist'], bins=np.arange(0, 25, 1))
ax.set_xlabel('Edit distance')
ax.set_ylabel('Number of conflicting SSMs')
sns.despine()
plt.savefig('iprg2012_consensus_distance.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
In [12]:
threshold_different = 3
num_high_sim = len(psms_different[psms_different['edit_dist'] <= threshold_different])
num_low_sim = len(psms_different[psms_different['edit_dist'] > threshold_different])
print(f'# conflicting PSMs with high sequence similarity: {num_high_sim}\n'
f'# conflicting PSMs with low sequence similarity: {num_low_sim}\n'
f' (sequence similarity threshold = {threshold_different} amino acids)')
In [13]:
width = 7
height = width / 1.618 # golden ratio
fig, ax = plt.subplots(figsize=(width, height))
squares = {'identical': len(psms_match),
'conflicting\nsimilar': num_high_sim,
'conflicting\ndifferent': num_low_sim,
'unique ANN-SoLo': len(psms_unique_annsolo),
'unique iPRG2012 consensus': len(psms_unique_consensus)}
squares = {f'{key}\n({value} SSMs)': value
for (key, value) in squares.items()}
colors = sns.color_palette('Set1', len(squares))
squarify.plot(sizes=squares.values(), color=colors, label=squares.keys(),
ax=ax, alpha=0.8, linewidth=1, edgecolor='black')
ax.set_xticks([])
ax.set_yticks([])
plt.savefig('iprg2012_consensus_treemap.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()
In [14]:
peptides_library = set()
filename_pepidx = '../../data/interim/iprg2012/human_yeast_targetdecoy.pepidx'
with open(filename_pepidx) as f_in:
for line in f_in:
if not line.startswith('#'):
peptide, info, _ = line.split()
charge = info.split('|')[0]
peptides_library.add((peptide, float(charge)))
In [15]:
psms_notfound = psms_different # pd.concat([psms_different, psms_unique_consensus])
peptides_notfound = set(zip(psms_notfound['sequence_consensus'],
psms_notfound['charge_consensus']))
peptides_notfound_library = peptides_notfound & peptides_library
ions_consensus = psms['sequence_consensus'] + psms['charge_consensus'].map(str)
psms_notfound_library = (psms[ions_consensus.isin(
[f'{seq}{charge}' for seq, charge in peptides_notfound_library])]
.merge(psms_notfound))
peptides_notfound_notlibrary = peptides_notfound - peptides_library
psms_notfound_notlibrary = (psms[ions_consensus.isin(
[f'{seq}{charge}' for seq, charge in peptides_notfound_notlibrary])]
.merge(psms_notfound))
print(f'# conflicting iPRG consensus identification ions not found IN library: '
f'{len(peptides_notfound_library)} '
f'({len(psms_notfound_library)} spectra)\n'
f'# conflicting iPRG consensus identification ions not found NOT IN library: '
f'{len(peptides_notfound_notlibrary)} '
f'({len(psms_notfound_notlibrary)} spectra)\n')