This notebook explores different ways to assess the quality of PSM (peptide-spectrum match) quantifications reported by pyQuant.


In [1]:
from itertools import chain

bad_data = [
    ('ELcSAAITMSDNTAANLLLTTIGGPk', 8846),
    ('FVESVDVAVNLGIDAR',7466 ),
    ('ELcSAAITMSDNTAANLLLTTIGGPK', 9209),
    ('FVESVDVAVNLGIDAR', 9213),
    ('FVESVDVAVNLGIDAR', 9426),
    ('AVTLYLGAVAATVR', 6660),
    ('AVTLYLGAVAATVR', 8958),
    ('IVVIYTTGSQATMDER', 4505),
    ('VGYIELDLNSGk', 5624),
    ('LLTGELLTLASR', 6942),
    ('FVESVDVAVNLGIDAr', 9184),
    ('ELcSAAITMSDNTAANLLLTTIGGPk', 9458),
    ('VGYIELDLNSGk', 5238),
    ('IVVIYTTGSQATMDERNR', 4024),
    ('AVTLYLGAVAATVR', 9652),
    ('ELcSAAITMSDNTAANLLLTTIGGPk', 8883),
    ('IVVIYTTGSQATMDERNR', 4005),
    ('FVESVDVAVNLGIDAR', 9950),
    ('AQHSALDDIPR', 2510),
    ('FVESVDVAVNLGIDAR', 9980),
    ('VGYIELDLNSGk', 9546),
    ('IVVIYTTGSQATMDER', 9933),
    ('HFESTPDTPEIIATIHGEGYR', 4488),
    ('YYLGNADEIAAK', 3703),
    ('FVESVDVAVNLGIDAR', 6879),
    ('RDDSILLAQHTR', 1849),
    ('EQGYALDSEENEQGVR', 2536),
    ('VLLcGAVLSR', 4541),
    ('LGYPITDDLDIYTr', 5790),
    ('VGYIELDLNSGk', 8965),
    ('FVESVDVAVNLGIDAR', 7796),
]

good_data = [
    ('VHIINLEK', 2373),
    ('HITDRDVR', 863),
    ('GATVLPHGTGr', 1244),
    ('GATVLPHGTGR', 1238),
    ('EQGLHFYAAGHHATER', 1570),
    ('VPLHTLr', 1371),
    ('IHVAVAQEVPGTGVDTPEDLER', 4157),
    ('cIFDNISLTVPR', 6174),
    ('HLTDGmTVR', 974),
    ('AGVHFGHQTR', 1002),
    ('AHHYPSELSGGQQQR', 1142),
    ('HYGALQGLNk', 1738),
    ('HITGLHYNPITNTFk', 3590),
    ('IGLLEHANR', 2008),
    ('ALEINSQSLDNNAAFIR', 5217),
    ('RIYGVLER', 2188),
    ('FQDVGSFDYGR', 3734),
    ('AVQNAMR', 995),
    ('IGVGGTITYPR', 3358),
    ('GmGESNPVTGNTcDNVk', 1558),
    ('MVEEDPAHPr', 1177),
    ('AIENQAYVAGcNr', 1914),
    ('FIAQQLGVSR', 3332),
    ('MPEDLLTr', 3424),
    ('mVEEDPAHPr', 1016),
    ('GFSVNFER', 3790),
    ('TPVGNTAAIcIYPR', 4031),
    ('IDAILVDR', 3375),
    ('LVAVGNTFVYPIAGYSk', 5966),
]

peptides = ' '.join(i[0] for i in chain(bad_data, good_data))
scans = ' '.join(str(i[1]) for i in chain(bad_data, good_data))
out = 'ml_train'
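
A quick look at what gets handed to pyQuant on the command line (the peptide and scan lists are passed as space-separated positional arguments to the bash cell below); just a small sanity-check sketch:

In [ ]:
# Peek at the first few CLI arguments built above.
print peptides.split(' ')[:3]
print scans.split(' ')[:3]
print out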

In [24]:
# %%bash -s "$peptides" "$scans" "$out"
# pyQuant --search-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/PD/Chris_Ecoli_1-2-4-(01).msf" \
#     --scan-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/Chris_Ecoli_1-2-4.mzML" \
#     --peptide $1 --scan $2 \
#     -o $3 \
#     -p 9


INFO:pyQuant:Reader done
msparser not found, Mascot DAT files unable to be parsed
Loading Scans:
.
Scans loaded.
Beginning quantification.
Processing /home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/Chris_Ecoli_1-2-4.mzML.
........................................................................................................../home/chris/Devel/pyquant/pyquant/worker.py:49: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  self.msn_rt_map.sort()
Chris_Ecoli_1-2-4 processed and placed into queue.
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
16.67% Completed/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/lib/nanfunctions.py:675: RuntimeWarning: Mean of empty slice
  warnings.warn("Mean of empty slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
100.00% CompletedUnable to calculate statistics for Heavy/Light.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Heavy/Medium.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Light/Heavy.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Light/Medium.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Medium/Heavy.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Medium/Light.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
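
The repeated "Model n_features is 19 and input n_features is 11" errors come from the confidence classifier bundled with pyQuant: the pickled model was trained on 19 predictors, while this run only produces 11, so predict_proba rejects the input. A minimal sketch for confirming that before retraining a replacement (the pickle path here is a guess; n_features_ is the attribute referenced in the traceback above):

In [ ]:
import pickle

# Hypothetical location of the classifier shipped with pyQuant; adjust the path as needed.
with open('/home/chris/Devel/pyquant/pyquant/static/classifier.pickle', 'rb') as fh:
    bundled = pickle.load(fh)
# Each fitted tree in the forest records how many features it was trained on.
print bundled.estimators_[0].n_features_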

In [33]:
# %%bash -s "$peptides" "$scans" "$out"
# pyQuant --search-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/PD/Chris_Ecoli_1-2-4-(01).msf" \
#     --scan-file "/home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/Chris_Ecoli_1-2-4.mzML" \
#     -o $3 \
#     -p 9


ERROR ON IGSDAYNQGLSER: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/worker.py", line 702, in quantify_peaks
    peak_index = peaks.find_nearest_index(merged_x, valid_peaks[0]['mean'])
IndexError: list index out of range

INFO:pyQuant:Reader done
msparser not found, Mascot DAT files unable to be parsed
In file included from /home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1777:0,
                 from /home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:18,
                 from /home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/arrayobject.h:4,
                 from /home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:266:
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:15:2: warning: #warning "Using deprecated NumPy API, disable it by " "#defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
 #warning "Using deprecated NumPy API, disable it by " \
  ^
In file included from /home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:27:0,
                 from /home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/arrayobject.h:4,
                 from /home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:266:
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/include/numpy/__multiarray_api.h:1448:1: warning: ‘_import_array’ defined but not used [-Wunused-function]
 _import_array(void)
 ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c: In function ‘__pyx_pf_7pyquant_6cpeaks_12gauss_jac_old.isra.52’:
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:6003:37: warning: ‘__pyx_v_mu’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_1 = PyFloat_FromDouble(__pyx_v_mu); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error)
                                     ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:5702:9: note: ‘__pyx_v_mu’ was declared here
   float __pyx_v_mu;
         ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:6056:37: warning: ‘__pyx_v_amp’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_4 = PyFloat_FromDouble(__pyx_v_amp); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L1_error)
                                     ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:5701:9: note: ‘__pyx_v_amp’ was declared here
   float __pyx_v_amp;
         ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c: In function ‘__pyx_pf_7pyquant_6cpeaks_14gauss_jac.isra.50’:
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:6789:39: warning: ‘__pyx_v_mu’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_1 = PyFloat_FromDouble((-__pyx_v_mu)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 144, __pyx_L1_error)
                                       ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:6440:9: note: ‘__pyx_v_mu’ was declared here
   float __pyx_v_mu;
         ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:6897:38: warning: ‘__pyx_v_amp’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_15 = PyFloat_FromDouble(__pyx_v_amp); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 146, __pyx_L1_error)
                                      ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:6439:9: note: ‘__pyx_v_amp’ was declared here
   float __pyx_v_amp;
         ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c: In function ‘__pyx_pf_7pyquant_6cpeaks_10bigauss_jac.isra.54’:
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:5062:37: warning: ‘__pyx_v_amp’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_4 = PyFloat_FromDouble(__pyx_v_amp); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 104, __pyx_L1_error)
                                     ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:4510:9: note: ‘__pyx_v_amp’ was declared here
   float __pyx_v_amp;
         ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:4931:49: warning: ‘__pyx_v_sigma1’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_3 = PyFloat_FromDouble((2.0 * powf(__pyx_v_sigma1, 2.0))); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 101, __pyx_L1_error)
                                                 ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:4512:9: note: ‘__pyx_v_sigma1’ was declared here
   float __pyx_v_sigma1;
         ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:4920:39: warning: ‘__pyx_v_mu’ may be used uninitialized in this function [-Wmaybe-uninitialized]
       __pyx_t_3 = PyFloat_FromDouble((-__pyx_v_mu)); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 101, __pyx_L1_error)
                                       ^
/home/chris/.pyxbld/temp.linux-x86_64-2.7/pyrex/pyquant/cpeaks.c:4511:9: note: ‘__pyx_v_mu’ was declared here
   float __pyx_v_mu;
         ^
Loading Scans:
..
Scans loaded.
Beginning quantification.
Processing /home/chris/gdrive/Dropbox/Manuscripts/SILAC Fix/EColi/Chris_Ecoli_1-2-4.mzML.
........................................................................................................../home/chris/Devel/pyquant/pyquant/worker.py:49: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  self.msn_rt_map.sort()
Chris_Ecoli_1-2-4 processed and placed into queue.
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/lib/nanfunctions.py:675: RuntimeWarning: Mean of empty slice
  warnings.warn("Mean of empty slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
0.78% Completed/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
1.55% Completed/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
3.10% Completed/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
99.30% CompletedUnable to calculate statistics for Heavy/Light.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Heavy/Medium.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Light/Heavy.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Light/Medium.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Medium/Heavy.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 
Unable to calculate statistics for Medium/Light.
 Traceback: Traceback (most recent call last):
  File "/home/chris/Devel/pyquant/pyquant/command_line.py", line 1148, in run_pyquant
    conf_ass = classifier.predict_proba(fit_predictors)[:,1]*10
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 537, in predict_proba
    X = self._validate_X_predict(X)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 319, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "/home/chris/.virtualenvs/pyquant/local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 376, in _validate_X_predict
    % (self.n_features_, n_features))
ValueError: Number of features of the model must  match the input. Model n_features is 19 and  input n_features is 11 

In [ ]:


In [74]:
%matplotlib inline
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import numpy as np
from scipy.special import logit
import pandas as pd
pd.options.display.max_columns = None
from patsy import dmatrix

dat = pd.read_table(out)
dat = dat[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k')+dat['Peptide'].str.count('r') == 1]
dat['Class'] = None
dat.loc[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('r') == 1, 'Class'] = 'R'
dat.loc[dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k') == 1, 'Class'] = 'K'
dat.set_index(['Peptide', 'MS2 Spectrum ID'], inplace=True)
dat.drop(['Modifications', 'Raw File', 'Accession', 'MS1 Spectrum ID', 'Charge', 'Medium Calibrated Precursor', 'Medium Precursor', 'Heavy/Medium', 'Heavy Calibrated Precursor', 'Heavy Precursor', 'Light Calibrated Precursor', 'Light Precursor', 'Retention Time', 'Heavy/Light Confidence', 'Medium/Heavy', 'Medium/Heavy Confidence', 'Medium/Light Confidence', 'Light/Medium Confidence', 'Heavy/Medium Confidence', 'Light/Heavy Confidence'], inplace=True, axis=1)
# Arg H/L -> -1.86
# Arg M/L = -1
# Lys H/L -> 1.89
# Lys M/L = 0.72
nds = []
cols = ['Isotopes Found', 'Intensity', 'RT Width', 'Mean Offset', 'Residual', 'R^2', 'SNR']
nd_columns = (['Label1 {}'.format(c) for c in cols] +
              ['Label2 {}'.format(c) for c in cols] +
              ['Deviation', 'Class'])
# Build one training frame per residue class (R, K) and per ratio against Light.
for residue in ['R', 'K']:
    for numerator, denominator in zip(['Heavy', 'Medium'], ['Light', 'Light']):
        ratio = '{}/{}'.format(numerator, denominator)
        nd = pd.DataFrame([], columns=nd_columns)
        log_ratio = np.log2(dat[dat['Class'] == residue][ratio])
        median, std = log_ratio.median(), log_ratio.std()
        expected = median
        nd['Deviation'] = log_ratio - expected
        # Label a quantification 1 ("good") if it falls within one SD of the class median, else 0.
        nd['Class'] = np.abs(log_ratio - median).apply(lambda x: 1 if x < std else 0)
        for label, new_label in zip([numerator, denominator], ['Label1', 'Label2']):
            for col in cols:
                nd['{} {}'.format(new_label, col)] = dat['{} {}'.format(label, col)]
        nd['Label1 Intensity'] = np.log2(nd['Label1 Intensity'])
        nd['Label2 Intensity'] = np.log2(nd['Label2 Intensity'])
        nd['Label1 R^2'] = logit(nd['Label1 R^2'])
        nd['Label2 R^2'] = logit(nd['Label2 R^2'])
        nds.append(nd)
pd.concat(nds)


Out[74]:
Label1 Isotopes Found Label1 Intensity Label1 RT Width Label1 Mean Offset Label1 Residual Label1 R^2 Label1 SNR Label2 Isotopes Found Label2 Intensity Label2 RT Width Label2 Mean Offset Label2 Residual Label2 R^2 Label2 SNR Deviation Class
Peptide MS2 Spectrum ID
GcImGSAHQr 779 24.0 14.985734 0.053936 9.931775e-01 0.085489 3.528190 21.114721 32.0 16.703665 0.056111 1.434795e+00 0.416770 2.361091 9.658139 0.094592 1
GcImGSAHQR 783 17.0 12.924555 0.042203 2.686645e+00 0.492796 3.038507 8.359946 26.0 14.711507 0.055908 4.517510e-01 1.063376 2.148553 6.236617 0.000523 1
777 28.0 14.996194 0.043803 2.569664e+00 0.084761 3.823541 20.474859 34.0 16.708224 0.056213 1.406006e+00 0.412675 2.329501 9.082110 0.094830 1
TQDATHGNSLSHR 811 39.0 13.991486 0.048815 2.128489e+00 0.972111 1.881489 6.062496 50.0 15.962065 0.061264 4.103946e-01 0.238537 3.268813 2.282480 -0.101995 1
IEQAPGQHGAR 887 7.0 16.167331 0.050244 7.230988e-02 0.012362 13.026475 1.012433 11.0 17.872230 0.045521 6.600722e-02 0.034781 11.525348 1.143215 0.290681 1
AGVTGAENr 904 8.0 17.969158 0.067676 2.493468e-01 0.025712 5.443686 1.053801 10.0 19.819117 0.055905 1.362417e-01 0.015580 8.858486 1.034260 -0.065885 1
AGVTGAENR 903 8.0 17.969158 0.067676 2.493468e-01 0.025712 5.443686 1.053801 10.0 19.819117 0.055905 1.362417e-01 0.015580 8.858486 1.034260 -0.065885 1
GTAmNPVDHPHGGGEGR 917 8.0 16.550535 0.050721 1.833391e-01 0.019163 7.692895 1.021964 12.0 18.294151 0.051766 9.084471e-02 0.014866 9.459570 1.234008 0.085492 1
ALVSHPR 933 4.0 15.729831 0.090572 1.991482e-01 0.059474 3.887081 NaN 8.0 17.857216 0.070174 9.030660e-01 0.067431 4.512238 1.013494 0.111110 1
mTGDNPDAPR 944 2.0 13.843877 0.054663 1.666761e-01 0.034627 8.161228 NaN 7.0 15.876904 0.063909 6.023946e-01 0.034698 4.959971 1.001678 0.007644 1
VHPNGIR 898 6.0 15.131996 0.056340 4.975481e-01 0.247207 3.758966 NaN 11.0 17.245997 0.058525 1.444357e-01 0.052027 5.027183 1.406236 -0.135307 1
SVANAEQmDR 959 9.0 16.980617 0.069780 3.405412e-01 0.119328 4.002708 1.026194 12.0 18.786625 0.075726 4.197732e-01 0.104451 3.851186 0.931681 0.071929 1
SVANAEQmDr 962 9.0 16.980617 0.069780 3.405412e-01 0.119328 4.002708 1.026194 12.0 18.786625 0.075726 4.197732e-01 0.104451 3.851186 0.931681 0.071929 1
AAASHLVR 961 14.0 17.324065 0.058804 1.981803e+00 0.189402 8.559042 15.804382 27.0 19.420032 0.075272 1.019746e+00 1.396346 2.584104 19.286076 0.091354 1
AAASHLVr 964 15.0 17.324065 0.058804 1.981799e+00 0.203778 8.558997 10.880751 28.0 19.424549 0.075731 9.812200e-01 1.535988 2.584007 15.079136 0.083129 1
HLTDGmTVr 975 28.0 23.746521 0.093142 4.456008e-01 0.021245 5.482953 15.104723 35.0 25.416597 0.084778 1.511559e+00 0.072245 5.677385 16.507001 0.080686 1
HLTDGmTVR 974 28.0 23.746521 0.093142 4.456008e-01 0.021245 5.482953 15.104723 35.0 25.416597 0.084778 1.511559e+00 0.072245 5.677385 16.507001 0.080686 1
AVQNAMR 995 9.0 17.682340 0.061553 1.262045e+00 0.062885 4.914437 1.126810 18.0 19.446729 0.067331 9.486762e-01 0.099772 5.123132 3.522732 0.142120 1
AGVHFGHQTR 1002 12.0 21.008434 0.083990 5.339253e-01 0.055843 4.154514 1.224810 20.0 22.994444 0.075378 2.275157e-01 0.024981 5.482815 1.207646 -0.189688 1
mVEEDPAHPr 1016 13.0 18.121198 0.092191 7.536466e-01 0.071224 4.389010 6.852894 20.0 20.084497 0.093785 3.310712e-01 0.046965 4.964579 10.238337 -0.049676 1
mVEEDPAHPR 1020 12.0 18.117252 0.092013 7.461029e-01 0.072749 4.389664 1.142899 20.0 20.084497 0.093785 3.310712e-01 0.046965 4.964579 10.238337 -0.227445 1
AGVHFGHQTR 992 23.0 23.282827 0.076255 1.544298e-01 0.013500 7.192932 15.622094 30.0 25.061076 0.080505 3.480352e-01 0.063421 4.765415 14.052629 0.021684 1
AGVHFGHQTr 994 23.0 23.282827 0.076255 1.544298e-01 0.013500 7.192932 15.622094 31.0 25.061076 0.080505 3.483843e-01 0.064882 4.765081 14.052629 0.021684 1
mVEEDPAHPr 1025 6.0 15.698337 0.069287 4.783882e-01 0.046070 5.193861 2.638442 11.0 18.316519 0.079541 1.710395e-01 0.137727 5.748694 1.114434 -0.720857 1
AGVHFGHQTr 1004 12.0 21.008434 0.083990 5.339253e-01 0.055843 4.154514 1.224810 20.0 22.994444 0.075378 2.275157e-01 0.024981 5.482815 1.207646 -0.189688 1
GTAMNPVDHPHGGGEGR 1087 6.0 16.798970 0.052111 2.076855e-01 0.042727 4.882561 NaN 12.0 18.816326 0.051097 5.237172e-01 0.099033 5.016891 1.092491 -0.049417 1
TDLHGTAVR 1078 12.0 18.357713 0.060047 6.581272e-01 0.053875 4.749313 1.380350 8.0 19.553958 0.051518 1.114874e-01 0.032910 7.299707 1.038474 1.064196 0
AHHYPSELSGGQQQR 1137 19.0 20.740120 0.069004 7.869672e-01 0.034991 5.765859 1.344427 28.0 22.611332 0.080211 3.786946e-01 0.056959 4.790247 1.299816 0.020353 1
1142 13.0 18.501803 0.071385 6.950947e-01 0.036237 5.550402 1.138348 22.0 20.373860 0.069400 3.310561e-01 0.041349 5.465852 1.339840 0.052759 1
1145 5.0 18.857965 0.087171 7.221403e-01 0.016279 4.229913 1.098967 23.0 21.817347 0.073249 3.987897e-01 0.079853 4.701050 1.190595 0.222085 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
LAQmQIPADDYFIWITGEGk 7850 9.0 18.757221 0.077344 1.544988e+00 0.442809 2.354658 1.138290 5.0 17.276999 0.116156 8.156097e-01 1.656970 NaN 1.651758 -0.327431 1
EGAFVPFVTLGDPGIEQSLK 7849 43.0 23.650085 0.193217 1.206463e+00 0.729791 1.709437 6.358218 41.0 23.037434 0.207459 2.615308e-01 0.228140 3.322470 6.108227 -0.002491 1
ELcSAAITmSDNTAANLLLTTIGGPk 7866 23.0 20.358045 0.143019 9.635800e-01 1.476684 1.071712 10.838309 17.0 18.233260 0.076696 4.096748e+00 1.956180 1.403417 221.445566 1.077706 0
TQGAAAFEGAVIAYEPVWAIGTGk 7884 49.0 23.298585 0.219064 1.977094e-01 0.204182 3.837238 4.173747 39.0 22.537990 0.180326 1.371329e+00 0.624656 2.240398 6.524064 0.063856 1
ELcSAAITmSDNTAANLLLTTIGGPk 7874 49.0 24.919943 0.206087 2.303474e-01 0.853150 1.620717 4.859298 47.0 24.124365 0.212079 1.652292e-01 0.197110 3.682382 4.434152 0.058754 1
7886 21.0 19.999417 0.114682 3.075215e-01 1.977056 1.436670 7.416190 34.0 19.573735 0.056227 1.608774e+00 6.064123 2.089884 6.699860 0.739104 1
VLALAENYQPLYAALGLHPGMLEk 7915 18.0 19.815205 0.095440 5.583934e-01 0.903778 1.903067 120.276338 12.0 19.124448 0.071011 1.931950e+00 0.545626 2.040685 15.321044 0.767542 1
FGASSLLASLLk 8084 34.0 23.119274 0.148651 2.196329e+00 0.141995 3.944355 5.234995 35.0 22.498940 0.188970 4.770330e-01 0.213794 3.080469 4.816335 -0.080325 1
TqGAAAFEGAVIAYEPVWAIGTGk 8094 23.0 16.250988 0.041308 4.430558e-01 0.092574 11.158354 -0.743560 27.0 18.138290 0.040326 2.672755e+01 1.164123 12.381305 0.775548 -1.095661 0
DGVGLLPTVLDVVENPk 8559 38.0 21.899157 0.169400 1.452420e-01 0.516881 3.197701 7.688767 27.0 21.228191 0.184760 3.650728e-01 0.534657 2.116104 6.450245 -0.088636 1
ELcSAAITMSDNTAANLLLTTIGGPk 8695 29.0 22.895231 0.273095 3.034958e-01 0.280452 2.329319 4.004378 52.0 23.293799 0.232344 4.787828e-01 1.796918 0.960049 4.702896 -0.195882 1
8686 32.0 23.059216 0.201174 5.944131e+00 2.126078 0.805061 4.347489 39.0 22.595297 0.129570 2.685297e+00 3.346062 1.681354 2.784739 0.025548 1
VGYIELDLNSGk 8965 36.0 18.805564 0.332078 2.099859e+00 7.771240 NaN 6.158339 29.0 17.510583 0.145803 2.394400e+00 4.397509 -0.054590 2.641730 0.394377 1
SLDDAQIALAVINTTYASQIGLTPAk 9089 26.0 20.788147 0.102492 2.694714e-01 0.586520 3.116046 15.176564 31.0 20.371373 0.114255 6.046525e-01 1.046882 2.549166 6.614207 -0.124315 1
ELcSAAITMSDNTAANLLLTTIGGPk 8846 216.0 23.835211 0.850754 3.325143e-01 1.893201 NaN -0.022767 216.0 23.428056 0.756642 2.127841e+00 2.621094 -3.001260 0.851808 -0.127975 1
8883 264.0 24.355262 0.235376 1.270539e-01 2.377913 1.738366 4.411485 268.0 24.149613 0.279444 2.810016e-01 1.998608 1.920195 3.695578 -0.300475 1
9458 71.0 19.814885 0.067602 1.056360e+00 4.460319 1.553299 8.495942 55.0 19.899775 0.081068 7.621572e-01 4.665191 2.026509 16.984387 -1.687464 0
LANEGIFTQQELYDELLTLADEAk 9522 4.0 18.594230 0.047298 9.488388e-02 0.128509 3.940116 NaN 5.0 18.353222 0.046340 7.130651e-01 0.196605 3.942563 NaN 0.062293 1
AIHTLWNVLDELDQAWLPVEk 9560 11.0 22.513893 0.058378 2.969589e-01 0.072579 4.455892 1.022964 11.0 21.834118 0.062201 4.901800e-01 0.119166 3.774949 1.024549 -0.038950 1
9586 6.0 21.728629 0.058892 3.079504e-01 0.046923 4.161881 0.963363 8.0 21.320748 0.054593 9.095046e-01 0.092096 4.274263 1.208662 -0.070930 1
ELcSAAITMSDNTAANLLLTTIGGPK 9148 104.0 18.329059 0.041806 6.509279e+00 2.474097 3.799134 2.287613 182.0 21.163595 0.263017 9.294961e-01 3.052862 0.857173 1.730295 -3.928888 0
TAPDGEHGVNLVHLEDVIGAITLLLQAPk 9656 4.0 17.701533 0.040175 6.339672e-04 0.016543 inf NaN 6.0 17.895217 0.040175 6.339672e-04 0.024815 inf NaN -0.150357 1
NADGLGMLVAqAAHAFLLWHGVLPDVEPVIk 9688 3.0 15.521233 0.052455 1.120198e-01 0.068335 9.743359 NaN 1.0 13.981914 0.052455 1.120198e-01 0.022778 9.743359 NaN -0.548328 1
FLQFMVSPAFQNAIPTGnWMYPVANVTLPAGFEK 9696 0.0 -inf NaN NaN NaN NaN NaN 4.0 16.293770 0.067028 1.165689e-01 0.042940 4.309249 NaN NaN 0
ELcSAAITMSDNTAANLLLTTIGGPK 9209 216.0 20.944411 0.236307 9.643571e+00 3.447382 0.541612 2.741174 142.0 20.709059 0.299294 1.026466e+01 5.649428 -2.158764 8.410005 -0.675744 1
VLAPINDFINTLNAFFSAGGk 9765 6.0 19.314618 0.053537 1.655816e-01 0.013354 6.934276 NaN 6.0 18.662179 0.053537 2.025805e-01 0.013438 6.934400 NaN -0.090769 1
9767 5.0 18.794496 0.058808 2.308393e-01 0.013254 6.645728 1.003997 7.0 18.154701 0.054971 1.785362e-01 0.006698 7.625821 NaN 0.235751 1
VGYIELDLNSGk 9546 93.0 19.127481 0.184934 1.256568e+01 12.800832 -1.205556 3.856789 83.0 17.154424 0.098624 3.966709e+00 9.656604 -3.448946 8.693567 1.260810 0
FVQAYQSDEVYEAANk 10015 1.0 12.242579 0.030985 1.733472e-07 0.036631 3.687593 NaN 1.0 12.242579 0.030985 1.733472e-07 0.036631 3.687593 NaN -0.721485 1
ELcSAAITMSDNTAAnLLLTTIGGPk 9818 65.0 17.013110 0.113016 5.642632e+00 0.907013 0.174813 0.286011 39.0 16.018433 0.050737 1.074356e+01 0.859082 10.182012 0.436856 1.423368 0

2138 rows × 16 columns
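
Before fitting anything, it is worth checking how the within-one-SD labeling splits the data and how many rows survive removal of non-finite values; a quick sketch using the frames built above:

In [ ]:
training = pd.concat(nds)
# 1 = within one SD of the class median, 0 = outlier.
print training['Class'].value_counts()
# Rows left after dropping infinite intensities / logit(R^2) values.
print training.replace([np.inf, -np.inf], np.nan).dropna().shape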


In [223]:



Out[223]:
Peptide                                      MS2 Spectrum ID
GcImGSAHQr                                   779                1
GcImGSAHQR                                   783                1
                                             777                1
TQDATHGNSLSHR                                811                1
IEQAPGQHGAR                                  887                1
AGVTGAENr                                    904                1
AGVTGAENR                                    903                1
GTAmNPVDHPHGGGEGR                            917                1
ALVSHPR                                      933                1
mTGDNPDAPR                                   944                1
VHPNGIR                                      898                1
SVANAEQmDR                                   959                1
SVANAEQmDr                                   962                1
AAASHLVR                                     961                1
AAASHLVr                                     964                1
HLTDGmTVr                                    975                1
HLTDGmTVR                                    974                1
AVQNAMR                                      995                1
AGVHFGHQTR                                   1002               1
mVEEDPAHPr                                   1016               1
mVEEDPAHPR                                   1020               1
AGVHFGHQTR                                   992                1
AGVHFGHQTr                                   994                1
mVEEDPAHPr                                   1025               1
AGVHFGHQTr                                   1004               1
GTAMNPVDHPHGGGEGR                            1087               1
TDLHGTAVR                                    1078               0
AHHYPSELSGGQQQR                              1137               1
                                             1142               1
                                             1145               1
                                                               ..
TIPSVLTALFcAR                                8606               0
AMLTLIVFSFTVSVYSSATVTPGSLnLAPIAIADMDQSqLSnr  8994               0
GVLLPLLSLDcAVTITNR                           8966               1
ILELAGFLDSYIPEPER                            8999               0
FVESVDVAVNLGIDAr                             9057               0
IEGGEWLVETVQmLTER                            9375               1
                                             9363               1
IEGGEWLVETVQmLTEr                            9381               1
                                             9377               1
GDMLSMEDVLEILR                               9469               1
IEGGEWLVETVQmLTEr                            9388               1
GDMLSMEDVLEILr                               9471               1
HLEFFNTQPFVAAPILGVTLALEEQR                   9514               1
SVPGYSNIISMIGmLAER                           9526               1
SVPGYSNIISMIGMLAER                           9593               1
IEGGEWLVETVQMLTER                            9611               1
IEGGEWLVETVQMLTEr                            9612               1
IEGGEWLVETVQMLTER                            9613               1
                                             9625               0
FVESVDVAVnLGIDAR                             9677               0
ATFVVDPQGIIQAIEVTAEGIGR                      9724               1
                                             9729               1
AVTLYLGAVAATVR                               9652               0
                                             8958               0
IVVIYTTGSQATMDER                             9933               1
FVESVDVAVNLGIDAR                             9213               1
                                             9426               0
FVESVDVAVNLGIDAr                             9184               0
FVESVDVAVNLGIDAR                             9950               0
                                             9980               0
Name: Heavy/Light, dtype: int64

In [75]:
df = pd.concat(nds)
df = df.replace([np.inf,-np.inf], np.nan).dropna()

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

X = preprocessing.scale(df.drop('Deviation', axis=1).drop('Class', axis=1).values)
y = df.loc[:, ['Deviation', 'Class']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
y_test_reg = y_test[:, 0]
y_test_class = y_test[:, 1]
y_train_reg = y_train[:, 0]
y_train_class = y_train[:, 1]
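
Since the accuracies below are only meaningful relative to the class balance, a quick sanity check on the split (a sketch using the arrays just built):

In [ ]:
import collections

print X_train.shape, X_test.shape
counts = collections.Counter(y_train_class)
print counts
# Baseline accuracy from always predicting the majority class.
print max(counts.values()) / float(len(y_train_class))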

In [82]:
from sklearn.svm import SVC as Classifier

clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))


0.884615384615

In [341]:
from sklearn.qda import QDA as Classifier

clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))


0.926799007444

In [349]:
from sklearn.gaussian_process import GaussianProcessClassifier as Classifier

clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))


0.918114143921

In [83]:
from sklearn.neural_network import MLPClassifier as Classifier

clf = Classifier()
clf = clf.fit(X_train, y_train_class)
from sklearn.metrics import accuracy_score
print accuracy_score(y_test_class, clf.predict(X_test))


0.890818858561
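
The comparisons above all reuse the same single 60/40 split, so the differences between classifiers may be within the noise; a hedged sketch of the same comparison with 5-fold cross-validation on the full feature matrix:

In [ ]:
# Sketch: cross-validated accuracy for two of the candidates above.
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

for name, estimator in [('SVC', SVC()), ('MLP', MLPClassifier())]:
    scores = cross_val_score(estimator, X, y[:, 1].astype(int), cv=5)
    print name, scores.mean(), scores.std()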

In [84]:
import pickle
pickle.dump(clf, open('/home/chris/Devel/pyquant/pyquant/static/new_classifier2.pickle', 'wb'))
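
A quick round-trip check on the saved model (this only verifies the pickle loads and reproduces the accuracy above; for pyQuant to use it, the predictors built at quantification time still have to match the feature set it was trained on, which is exactly what broke the bundled classifier earlier):

In [ ]:
with open('/home/chris/Devel/pyquant/pyquant/static/new_classifier2.pickle', 'rb') as fh:
    reloaded = pickle.load(fh)
# Should match the MLPClassifier accuracy printed above.
print accuracy_score(y_test_class, reloaded.predict(X_test))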

In [375]:
from sklearn.ensemble import AdaBoostRegressor as Regressor

clf = Regressor()
clf = clf.fit(X_train, y_train_reg)
from sklearn import metrics
print metrics.median_absolute_error(y_test_reg, clf.predict(X_test))
from matplotlib import pyplot as plt
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')


0.217232977277
Out[375]:
[<matplotlib.lines.Line2D at 0x7f2df55edf50>]

In [24]:
from sklearn.neural_network import MLPRegressor
reg = MLPRegressor()
clf = GridSearchCV(reg, {})
clf.fit(X_train, y_train_reg)
print metrics.median_absolute_error(y_test_reg, clf.predict(X_test))
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')


0.206629642862
Out[24]:
[<matplotlib.lines.Line2D at 0x7f6fe8eda150>]

In [377]:
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor()
parameters = {
    'loss': ['ls', 'lad'], 
    'learning_rate': [0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 200],
}
clf = GridSearchCV(reg, parameters)
clf.fit(X_train, y_train_reg)
from sklearn.metrics import r2_score
r2_score(y_test_reg, clf.predict(X_test))
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')


Out[377]:
[<matplotlib.lines.Line2D at 0x7f2df570ad10>]
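
The grid search result itself is not shown above; a short sketch for pulling out the winning loss/learning rate/estimator combination and the held-out R^2 (standard GridSearchCV attributes):

In [ ]:
print clf.best_params_
print clf.best_score_
print r2_score(y_test_reg, clf.predict(X_test))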

In [379]:
from sklearn.tree import DecisionTreeRegressor as Regressor
clf = Regressor()
clf.fit(X_train, y_train_reg)
print r2_score(y_test_reg, clf.predict(X_test))
plt.scatter(y_test_reg, clf.predict(X_test))
plt.plot([-6, 6], [-6, 6], 'r-')


-0.302797515251
Out[379]:
[<matplotlib.lines.Line2D at 0x7f2df53bee10>]

In [50]:
np.log2(dat[dat['Class']=='R']['Heavy/Light']).plot(kind='hist')


Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feb98e65b90>

In [41]:
dat.columns.tolist()


Out[41]:
['Heavy Isotopes Found',
 'Heavy Intensity',
 'Heavy RT Width',
 'Heavy Mean Offset',
 'Heavy Residual',
 'Heavy R^2',
 'Heavy SNR',
 'Heavy/Light',
 'Light Isotopes Found',
 'Light Intensity',
 'Light RT Width',
 'Light Mean Offset',
 'Light Residual',
 'Light R^2',
 'Light SNR',
 'Medium Isotopes Found',
 'Medium Intensity',
 'Medium RT Width',
 'Medium Mean Offset',
 'Medium Residual',
 'Medium R^2',
 'Medium SNR',
 'Medium/Light',
 'Class']

In [5]:
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import numpy as np
import pandas as pd
from patsy import dmatrix

dat = pd.read_table(out)
dat.set_index(['Peptide', 'MS2 Spectrum ID'], inplace=True)
dat.drop(['Modifications', 'Raw File', 'Accession', 'MS1 Spectrum ID', 'Charge', 'Retention Time', 'Heavy/Light', 'Heavy/Light Confidence', 'Medium/Light', 'Medium/Light Confidence', 'Medium/Heavy', 'Medium/Heavy Confidence', 'Light/Medium', 'Light/Medium Confidence', 'Heavy/Medium', 'Heavy/Medium Confidence', 'Light/Heavy', 'Light/Heavy Confidence'], inplace=True, axis=1)
for i in ['Heavy', 'Medium', 'Light']:
    for j in ['Precursor', 'Calibrated Precursor']:
        dat.drop(i + ' ' + j, inplace=True, axis=1)

# Only the Medium and Light channels are used for this comparison, so drop every remaining Heavy column.
to_drop = [j for j in dat.columns if j.startswith('Heavy')]
dat.drop(to_drop, inplace=True, axis=1)

dat['Class'] = None
for i in bad_data:
    dat.loc[i, 'Class'] = 0
for i in good_data:
    dat.loc[i, 'Class'] = 1

dat.dropna(inplace=True)
labels = dat['Class']

# # preprocess
dat['Medium Intensity'] = np.log2(dat['Medium Intensity'])
dat['Light Intensity'] = np.log2(dat['Light Intensity'])

# extra info
for i in ['RT Width', 'Isotopes Found']:
    dat['Medium/Light {}'.format(i)] = dat['Medium {}'.format(i)]/dat['Light {}'.format(i)]

# dat = dat.loc[:, ['Medium R^2', 'Light R^2', 'Class']]
dat.reset_index(drop=True, inplace=True)
training_indices, testing_indices = train_test_split(dat.index, stratify = labels.values, train_size=0.5, test_size=0.5)

tpot = TPOT(verbosity=2, generations=10)
tpot.fit(dat.drop('Class',axis=1).loc[training_indices].values, dat.loc[training_indices,'Class'].values.astype(int))
tpot.score(dat.drop('Class',axis=1).loc[testing_indices].values, dat.loc[testing_indices, 'Class'].values.astype(int))


GP Progress:   9%|▉         | 100/1100 [00:00<04:56,  3.37pipeline/s]
Generation 1 - Current best internal CV score: 1.00000
GP Progress:  18%|█▊        | 196/1100 [00:00<06:04,  2.48pipeline/s]
Generation 2 - Current best internal CV score: 1.00000
GP Progress:  26%|██▋       | 290/1100 [00:00<07:17,  1.85pipeline/s]
Generation 3 - Current best internal CV score: 1.00000
GP Progress:  36%|███▌      | 394/1100 [00:00<05:00,  2.35pipeline/s]
Generation 4 - Current best internal CV score: 1.00000
GP Progress:  44%|████▍     | 488/1100 [00:00<03:11,  3.20pipeline/s]
Generation 5 - Current best internal CV score: 1.00000
GP Progress:  54%|█████▍    | 594/1100 [00:00<03:19,  2.54pipeline/s]
Generation 6 - Current best internal CV score: 1.00000
GP Progress:  63%|██████▎   | 692/1100 [00:00<03:19,  2.04pipeline/s]
Generation 7 - Current best internal CV score: 1.00000
GP Progress:  72%|███████▏  | 789/1100 [00:00<01:52,  2.76pipeline/s]
Generation 8 - Current best internal CV score: 1.00000
GP Progress:  81%|████████▏ | 894/1100 [00:00<01:26,  2.39pipeline/s]
Generation 9 - Current best internal CV score: 1.00000
GP Progress:  90%|█████████ | 990/1100 [00:00<00:50,  2.19pipeline/s]
Generation 10 - Current best internal CV score: 1.00000
                                                                      
Best pipeline: _linear_svc(input_df, 0.97999999999999998, 60, False)

Out[5]:
1.0
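
With only ~30 good and ~30 bad PSMs, a perfect score on a 50/50 split deserves a second look; a sketch cross-validating a plain linear SVM on the same features (this does not reproduce the exact hyperparameters TPOT selected):

In [ ]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

features = dat.drop('Class', axis=1).values
target = dat['Class'].values.astype(int)
print cross_val_score(LinearSVC(), features, target, cv=5)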

In [17]:
# %matplotlib inline
# from sklearn.svm import SVC

# predictor = SVC()
# predictor.fit(dat.drop('Class',axis=1).loc[training_indices].values, dat.loc[training_indices,'Class'].values.astype(int))
# predictor.score(dat.drop('Class',axis=1).loc[training_indices].values, dat.loc[training_indices,'Class'].values.astype(int))
# # plt.scatter(dat.iloc[:, 0], dat.iloc[:, 1], c=dat.iloc[:, 2])


Out[17]:
0.90740740740740744

In [4]:
tpot.export('pipe.py')

In [85]:
dat = pd.read_table('/home/chris/Devel/pyquant/ml_test_cl2_stats')
dat = dat[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k')+dat['Peptide'].str.count('r') == 1]
dat['Class'] = None
dat.loc[dat['Peptide'].str.count('R')+dat['Peptide'].str.count('r') == 1, 'Class'] = 'R'
dat.loc[dat['Peptide'].str.count('K')+dat['Peptide'].str.count('k') == 1, 'Class'] = 'K'

In [98]:
np.log2(dat.loc[dat['Class']=='R','Heavy/Light']).plot(kind='density', c='r')
np.log2(dat.loc[(dat['Class']=='R') & (dat['Heavy/Light Confidence']>5),'Heavy/Light']).plot(kind='density', c='g')
np.log2(dat.loc[(dat['Class']=='R') & (dat['Heavy/Light Confidence']>8),'Heavy/Light']).plot(kind='density', c='k')


Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6fe2a4a210>

In [97]:
isotope = 'K'
ratio = 'Heavy/Light'
df_1 = np.log2(dat.loc[dat['Class']==isotope,ratio])
df_2 = np.log2(dat.loc[(dat['Class']==isotope) & (dat['{} Confidence'.format(ratio)]>5),ratio])
df_3 = np.log2(dat.loc[(dat['Class']==isotope) & (dat['{} Confidence'.format(ratio)]>9),ratio])
df = pd.concat([df_1, df_2, df_3], axis=1)
df.columns = ['All', '5', '9']
df.plot(kind='box')


Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6fe2bb5fd0>
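
The box plot compares the spread at the three confidence cutoffs visually; the same comparison as numbers (a sketch reusing the filtered series above):

In [ ]:
for label, series in zip(['All', '>5', '>9'], [df_1, df_2, df_3]):
    finite = series.replace([np.inf, -np.inf], np.nan).dropna()
    print label, len(finite), finite.median(), finite.std()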

In [94]:
dat.loc[dat['Class']=='K', '{} Confidence'.format('Heavy/Light')].plot(kind='density')


Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6fe2d99ad0>