In [1]:
import pandas as pd
import numpy as np


/Users/mauriciobarrientos/anaconda/lib/python2.7/site-packages/pytz/__init__.py:29: UserWarning: Module argparse was already imported from /Users/mauriciobarrientos/anaconda/python.app/Contents/lib/python2.7/argparse.pyc, but /Users/mauriciobarrientos/anaconda/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream

Load set of common peptides


In [2]:
# Read the common-peptide list (one entry per line) into a set for fast
# membership tests.  splitlines() handles "\n", "\r\n" and "\r" line endings,
# and blank lines are skipped so a trailing newline at the end of the file
# does not add an empty-string entry to the set.
with open("common_peps.txt", "r") as common_peps_file:
    common_peps = set(
        line.strip()
        for line in common_peps_file.read().splitlines()
        if line.strip()
    )

In [3]:
# Sanity check: number of distinct entries held in common_peps.
len(common_peps)


Out[3]:
296

Load plate files


In [4]:
# Parse every tab-separated plate file into its own DataFrame, keeping the
# order in which the files are listed.
plate_names = ["plate1.txt", "plate2.txt", "plate3.txt"]
plates = [pd.read_csv(filename, sep="\t") for filename in plate_names]

In [5]:
# Report (rows, columns) for each raw plate as a quick check on the load.
# print is called as a function so the cell runs on Python 3 as well as
# Python 2 (the output is identical on both for a single argument).
for p in plates:
    print(p.shape)


(194, 48)
(277, 47)
(248, 47)

In [6]:
# Show the column labels available in the first plate.
plates[0].columns


Out[6]:
Index([u'Unnamed: 0', u'Sequence', u'Phospho (STY)', u'seq+ph', u'K Count', u'R Count', u'Modifications', u'Mass', u'Mass Fractional Part', u'Protein Groups', u'Proteins', u'Unique (Groups)', u'Unique (Proteins)', u'Oxidation (M)', u'Missed cleavages', u'Retention time', u'Calibrated Retention Time', u'Charges', u'PEP', u'MS/MS Count', u'MS/MS Scan Number', u'Raw file', u'Score', u'Delta score', u'Ratio H/L', u'Ratio H/L normalized', u'log2(Ratio H/L norm)', u'Ratio H/L variability [%]', u'Ratio H/L count', u'Ratio H/L iso-count', u'Ratio H/L type', u'Intensity', u'Intensity L', u'Intensity H', u'Reverse', u'Contaminant', u'id', u'Protein group IDs', u'Peptide ID', u'Evidence IDs', u'MS/MS IDs', u'Best MS/MS', u'Oxidation (M) site IDs', u'Phospho (STY) site IDs', u'Sequence+phospho', u'Missing channel', u'Unnamed: 46', u'Gene Name'], dtype='object')

Keep only the peptides that appear in the common peptide list


In [7]:
# Columns carried forward from each plate into the join.
keep_cols = ["Sequence+phospho", "Gene Name", 'log2(Ratio H/L norm)', 'Intensity L', 'Intensity H']

# For every plate keep only the rows whose peptide is in common_peps, and only
# the columns listed in keep_cols.
filtered_plates = []
for p in plates:
    # Vectorised membership test: True where the peptide is in the common list.
    filter_col = p["Sequence+phospho"].isin(common_peps)
    # .loc selects rows and columns in one step; it replaces the deprecated
    # .ix indexer, which no longer exists in pandas >= 1.0.
    filtered_plates.append(p.loc[filter_col, keep_cols])

In [8]:
# Report (rows, columns) for each filtered plate to see how many peptides
# survived the filter.  print is called as a function so the cell runs on
# Python 3 as well as Python 2.
for p in filtered_plates:
    print(p.shape)


(194, 5)
(277, 5)
(248, 5)

Outer-join the filtered plates on Sequence+phospho


In [9]:
# Outer-join the three filtered plates on the peptide key, so a peptide that
# is missing from one plate still gets a row (NaN in that plate's columns).
# NOTE(review): the non-key columns have the same names in every plate, so the
# first merge disambiguates them with pandas' default "_x"/"_y" suffixes while
# the third plate's columns keep their plain names -- confirm that this naming
# is what readers of the result expect.
merge_opts = dict(on="Sequence+phospho", how="outer")
join1 = filtered_plates[0].merge(filtered_plates[1], **merge_opts)
join2 = join1.merge(filtered_plates[2], **merge_opts)

In [10]:
# Save the merged table as comma-separated text.  The DataFrame index is
# written as the first column because `index` is left at its default.
join2.to_csv(path_or_buf="joined.csv", sep=",")