This explores different ways to analyze the quality of PSM quantifications
In [1]:
from itertools import chain
bad_data = [
('ELcSAAITMSDNTAANLLLTTIGGPk', 8846),
('FVESVDVAVNLGIDAR',7466 ),
('ELcSAAITMSDNTAANLLLTTIGGPK', 9209),
('FVESVDVAVNLGIDAR', 9213),
('FVESVDVAVNLGIDAR', 9426),
('AVTLYLGAVAATVR', 6660),
('AVTLYLGAVAATVR', 8958),
('IVVIYTTGSQATMDER', 4505),
('VGYIELDLNSGk', 5624),
('LLTGELLTLASR', 6942),
('FVESVDVAVNLGIDAr', 9184),
('ELcSAAITMSDNTAANLLLTTIGGPk', 9458),
('VGYIELDLNSGk', 5238),
('IVVIYTTGSQATMDERNR', 4024),
('AVTLYLGAVAATVR', 9652),
('ELcSAAITMSDNTAANLLLTTIGGPk', 8883),
('IVVIYTTGSQATMDERNR', 4005),
('FVESVDVAVNLGIDAR', 9950),
('AQHSALDDIPR', 2510),
('FVESVDVAVNLGIDAR', 9980),
('VGYIELDLNSGk', 9546),
('IVVIYTTGSQATMDER', 9933),
('HFESTPDTPEIIATIHGEGYR', 4488),
('YYLGNADEIAAK', 3703),
('FVESVDVAVNLGIDAR', 6879),
('RDDSILLAQHTR', 1849),
('EQGYALDSEENEQGVR', 2536),
('VLLcGAVLSR', 4541),
('LGYPITDDLDIYTr', 5790),
('VGYIELDLNSGk', 8965),
('FVESVDVAVNLGIDAR', 7796),
]
good_data = [
('VHIINLEK', 2373),
('HITDRDVR', 863),
('GATVLPHGTGr', 1244),
('GATVLPHGTGR', 1238),
('EQGLHFYAAGHHATER', 1570),
('VPLHTLr', 1371),
('IHVAVAQEVPGTGVDTPEDLER', 4157),
('cIFDNISLTVPR', 6174),
('HLTDGmTVR', 974),
('AGVHFGHQTR', 1002),
('AHHYPSELSGGQQQR', 1142),
('HYGALQGLNk', 1738),
('HITGLHYNPITNTFk', 3590),
('IGLLEHANR', 2008),
('ALEINSQSLDNNAAFIR', 5217),
('RIYGVLER', 2188),
('FQDVGSFDYGR', 3734),
('AVQNAMR', 995),
('IGVGGTITYPR', 3358),
('GmGESNPVTGNTcDNVk', 1558),
('MVEEDPAHPr', 1177),
('AIENQAYVAGcNr', 1914),
('FIAQQLGVSR', 3332),
('MPEDLLTr', 3424),
('mVEEDPAHPr', 1016),
('GFSVNFER', 3790),
('TPVGNTAAIcIYPR', 4031),
('IDAILVDR', 3375),
('LVAVGNTFVYPIAGYSk', 5966),
]
peptides = ' '.join(i[0] for i in chain(bad_data, good_data))
scans = ' '.join(str(i[1]) for i in chain(bad_data, good_data))
out = 'ml_train'
In [24]:
# %%bash -s "$peptides" "$scans" "$out"
# pyQuant --search-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/PD/Chris_Ecoli_1-2-4-(01).msf" \
# --scan-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/Chris_Ecoli_1-2-4.mzML" \
# --peptide $1 --scan $2 \
# -o $3 \
# -p 9
In [33]:
# %%bash -s "$peptides" "$scans" "$out"
# pyQuant --search-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/PD/Chris_Ecoli_1-2-4-(01).msf" \
# --scan-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/Chris_Ecoli_1-2-4.mzML" \
# -o $3 \
# -p 9
In [ ]:
In [74]:
%matplotlib inline
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import numpy as np
from scipy.special import logit
import pandas as pd
pd.options.display.max_columns = None
from patsy import dmatrix
dat = pd.read_table(out)
dat = dat[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k')+dat['Peptide'].str.count('r') == 1]
dat['Class'] = None
dat.loc[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('r') == 1, 'Class'] = 'R'
dat.loc[dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k') == 1, 'Class'] = 'K'
dat.set_index(['Peptide', 'MS2 Spectrum ID'], inplace=True)
dat.drop(['Modifications', 'Raw File', 'Accession', 'MS1 Spectrum ID', 'Charge', 'Medium Calibrated Precursor', 'Medium Precursor', 'Heavy/Medium', 'Heavy Calibrated Precursor', 'Heavy Precursor', 'Light Calibrated Precursor', 'Light Precursor', 'Retention Time', 'Heavy/Light Confidence', 'Medium/Heavy', 'Medium/Heavy Confidence', 'Medium/Light Confidence', 'Light/Medium Confidence', 'Heavy/Medium Confidence', 'Light/Heavy Confidence'], inplace=True, axis=1)
# Arg H/L -> -1.86
# Arg M/L = -1
# Lys H/L -> 1.89
# Lys M/L = 0.72
nds = []
for numerator, denominator in zip(['Heavy', 'Medium'], ['Light', 'Light']):
ratio = '{}/{}'.format(numerator, denominator)
cols=['Isotopes Found', 'Intensity', 'RT Width', 'Mean Offset', 'Residual', 'R^2', 'SNR']
nd = pd.DataFrame([], columns=[
'Label1 Isotopes Found',
'Label1 Intensity',
'Label1 RT Width',
'Label1 Mean Offset',
'Label1 Residual',
'Label1 R^2',
'Label1 SNR',
'Label2 Isotopes Found',
'Label2 Intensity',
'Label2 RT Width',
'Label2 Mean Offset',
'Label2 Residual',
'Label2 R^2',
'Label2 SNR',
'Deviation',
'Class',
])
median, std = np.log2(dat[dat['Class']=='R'][ratio]).median(), np.log2(dat[dat['Class']=='R'][ratio]).std()
expected = median
nd['Deviation'] = np.log2(dat[dat['Class']=='R'][ratio])-expected
nd['Class'] = np.abs(np.log2(dat[dat['Class']=='R'][ratio])-median).apply(lambda x: 1 if x < std else 0)
for label, new_label in zip([numerator, denominator], ['Label1', 'Label2']):
for col in cols:
nd['{} {}'.format(new_label, col)] = dat['{} {}'.format(label, col)]
nd['Label1 Intensity'] = np.log2(nd['Label1 Intensity'])
nd['Label2 Intensity'] = np.log2(nd['Label2 Intensity'])
nd['Label1 R^2'] = logit(nd['Label1 R^2'])
nd['Label2 R^2'] = logit(nd['Label2 R^2'])
nds.append(nd)
for numerator, denominator in zip(['Heavy', 'Medium'], ['Light', 'Light']):
ratio = '{}/{}'.format(numerator, denominator)
cols=['Isotopes Found', 'Intensity', 'RT Width', 'Mean Offset', 'Residual', 'R^2', 'SNR']
nd = pd.DataFrame([], columns=[
'Label1 Isotopes Found',
'Label1 Intensity',
'Label1 RT Width',
'Label1 Mean Offset',
'Label1 Residual',
'Label1 R^2',
'Label1 SNR',
'Label2 Isotopes Found',
'Label2 Intensity',
'Label2 RT Width',
'Label2 Mean Offset',
'Label2 Residual',
'Label2 R^2',
'Label2 SNR',
'Deviation',
'Class'
])
median, std = np.log2(dat[dat['Class']=='K'][ratio]).median(), np.log2(dat[dat['Class']=='K'][ratio]).std()
expected = median
nd['Deviation'] = np.log2(dat[dat['Class']=='K'][ratio])-expected
nd['Class'] = np.abs(np.log2(dat[dat['Class']=='K'][ratio])-median).apply(lambda x: 1 if x < std else 0)
for label, new_label in zip([numerator, denominator], ['Label1', 'Label2']):
for col in cols:
nd['{} {}'.format(new_label, col)] = dat['{} {}'.format(label, col)]
nd['Label1 Intensity'] = np.log2(nd['Label1 Intensity'])
nd['Label2 Intensity'] = np.log2(nd['Label2 Intensity'])
nd['Label1 R^2'] = logit(nd['Label1 R^2'])
nd['Label2 R^2'] = logit(nd['Label2 R^2'])
nds.append(nd)
pd.concat(nds)
Out[74]:
In [223]:
Out[223]:
In [75]:
df = pd.concat(nds)
df = df.replace([np.inf,-np.inf], np.nan).dropna()
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
X = preprocessing.scale(df.drop('Deviation', axis=1).drop('Class', axis=1).values)
y = df.loc[:, ['Deviation', 'Class']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
y_test_reg = y_test[:, 0]
y_test_class = y_test[:, 1]
y_train_reg = y_train[:, 0]
y_train_class = y_train[:, 1]
In [82]:
from sklearn.svm import SVC as Classifier
clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))
In [341]:
from sklearn.qda import QDA as Classifier
clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))
In [349]:
from sklearn.gaussian_process import GaussianProcessClassifier as Classifier
clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))
In [83]:
from sklearn.neural_network import MLPClassifier as Classifier
clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))
In [84]:
import pickle
pickle.dump(clf, open('/home/chris/Devel/pyquant/pyquant/static/new_classifier2.pickle', 'wb'))
In [375]:
from sklearn.ensemble import AdaBoostRegressor as Regressor
clf = Regressor()
clf = clf.fit(X_train, y_train_reg)
from sklearn import metrics
print metrics.median_absolute_error(y_test_reg, clf.predict(X_test))
from matplotlib import pyplot as plt
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')
Out[375]:
In [24]:
from sklearn.neural_network import MLPRegressor
reg = MLPRegressor()
clf = GridSearchCV(reg, {})
clf.fit(X_train, y_train_reg)
print metrics.median_absolute_error(y_test_reg, clf.predict(X_test))
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')
Out[24]:
In [377]:
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor()
parameters = {
'loss': ['ls', 'lad'],
'learning_rate': [0.01, 0.1, 0.5],
'n_estimators': [50, 100, 200],
}
clf = GridSearchCV(reg, parameters)
clf.fit(X_train, y_train_reg)
from sklearn.metrics import r2_score
r2_score(y_test_reg, clf.predict(X_test))
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')
Out[377]:
In [379]:
from sklearn.tree import DecisionTreeRegressor as Regressor
clf = Regressor()
clf.fit(X_train, y_train_reg)
print r2_score(y_test_reg, clf.predict(X_test))
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')
Out[379]:
In [50]:
np.log2(dat[dat['Class']=='R']['Heavy/Light']).plot(kind='hist')
Out[50]:
In [41]:
dat.columns.tolist()
Out[41]:
In [5]:
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import numpy as np
import pandas as pd
from patsy import dmatrix
dat = pd.read_table(out)
dat.set_index(['Peptide', 'MS2 Spectrum ID'], inplace=True)
dat.drop(['Modifications', 'Raw File', 'Accession', 'MS1 Spectrum ID', 'Charge', 'Retention Time', 'Heavy/Light', 'Heavy/Light Confidence', 'Medium/Light', 'Medium/Heavy', 'Medium/Heavy Confidence', 'Medium/Light', 'Medium/Light Confidence', 'Light/Medium', 'Light/Medium Confidence', 'Heavy/Medium', 'Heavy/Medium Confidence', 'Light/Heavy Confidence', 'Light/Heavy'], inplace=True, axis=1)
for i in ['Heavy', 'Medium', 'Light']:
for j in ['Precursor', 'Calibrated Precursor']:
dat.drop(i + ' ' +j, inplace=True, axis=1)
to_drop = []
for j in dat.columns:
if j.startswith('Heavy'):
to_drop.append(j)
dat.drop(to_drop, inplace=True, axis=1)
dat['Class'] = None
for i in bad_data:
dat.loc[i, 'Class'] = 0
for i in good_data:
dat.loc[i, 'Class'] = 1
dat.dropna(inplace=True)
labels = dat['Class']
# # preprocess
dat['Medium Intensity'] = np.log2(dat['Medium Intensity'])
dat['Light Intensity'] = np.log2(dat['Light Intensity'])
# extra info
for i in ['RT Width', 'Isotopes Found']:
dat['Medium/Light {}'.format(i)] = dat['Medium {}'.format(i)]/dat['Light {}'.format(i)]
# dat = dat.loc[:, ['Medium R^2', 'Light R^2', 'Class']]
dat.reset_index(drop=True, inplace=True)
training_indices, testing_indices = train_test_split(dat.index, stratify = labels.values, train_size=0.5, test_size=0.5)
tpot = TPOT(verbosity=2, generations=10)
tpot.fit(dat.drop('Class',axis=1).loc[training_indices].values, dat.loc[training_indices,'Class'].values.astype(int))
tpot.score(dat.drop('Class',axis=1).loc[testing_indices].values, dat.loc[testing_indices, 'Class'].values.astype(int))
Out[5]:
In [17]:
# %matplotlib inline
# from sklearn.svm import SVC
# predictor = SVC()
# predictor.fit(dat.drop('Class',axis=1).loc[training_indices].values, dat.loc[training_indices,'Class'].values.astype(int))
# predictor.score(dat.drop('Class',axis=1).loc[training_indices].values, dat.loc[training_indices,'Class'].values.astype(int))
# # plt.scatter(dat.iloc[:, 0], dat.iloc[:, 1], c=dat.iloc[:, 2])
Out[17]:
In [4]:
tpot.export('pipe.py')
In [85]:
dat = pd.read_table('/home/chris/Devel/pyquant/ml_test_cl2_stats')
dat = dat[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k')+dat['Peptide'].str.count('r') == 1]
dat['Class'] = None
dat.loc[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('r') == 1, 'Class'] = 'R'
dat.loc[dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k') == 1, 'Class'] = 'K'
In [98]:
np.log2(dat.loc[dat['Class']=='R','Heavy/Light']).plot(kind='density', c='r')
np.log2(dat.loc[(dat['Class']=='R') & (dat['Heavy/Light Confidence']>5),'Heavy/Light']).plot(kind='density', c='g')
np.log2(dat.loc[(dat['Class']=='R') & (dat['Heavy/Light Confidence']>8),'Heavy/Light']).plot(kind='density', c='k')
Out[98]:
In [97]:
isotope = 'K'
ratio = 'Heavy/Light'
df_1 = np.log2(dat.loc[dat['Class']==isotope,ratio])
df_2 = np.log2(dat.loc[(dat['Class']==isotope) & (dat['{} Confidence'.format(ratio)]>5),ratio])
df_3 = np.log2(dat.loc[(dat['Class']==isotope) & (dat['{} Confidence'.format(ratio)]>9),ratio])
df = pd.concat([df_1, df_2, df_3], axis=1)
df.columns=['All', '5', '8']
df.plot(kind='box')
Out[97]:
In [94]:
dat.loc[dat['Class']=='K', '{} Confidence'.format('Heavy/Light')].plot(kind='density')
Out[94]: