notebook.community

Edit and run



In [1]:

    
import pandas as pd
import re



In [2]:

    
data = pd.read_table("ElenaP_20140213_MCF7_MG132_TMT10plex_1mgIPG25-37_fr12-72_peps.txt")



In [3]:

    
data.describe()









    Out[3]:






  
    
      
      # PSMs
      # Proteins
      # Protein Groups
      MH+ [Da]
      Area
      126/(127_C+128_C+129_C+130_C)
      126/(127_C+128_C+129_C+130_C) Count
      126/(127_C+128_C+129_C+130_C) Variability [%]
      127_C/(127_C+128_C+129_C+130_C)
      127_C/(127_C+128_C+129_C+130_C) Count
      127_C/(127_C+128_C+129_C+130_C) Variability [%]
      127_N/(127_C+128_C+129_C+130_C)
      127_N/(127_C+128_C+129_C+130_C) Count
      127_N/(127_C+128_C+129_C+130_C) Variability [%]
      128_C/(127_C+128_C+129_C+130_C)
      128_C/(127_C+128_C+129_C+130_C) Count
      128_C/(127_C+128_C+129_C+130_C) Variability [%]
      128_N/(127_C+128_C+129_C+130_C)
      128_N/(127_C+128_C+129_C+130_C) Count
      128_N/(127_C+128_C+129_C+130_C) Variability [%]
      
    
  
  
    
      count
       3758.000000
       3758.000000
       3758.000000
       3758.000000
       3.758000e+03
       3607.000000
       3607.000000
          1934.000000
       3607.000000
       3607.000000
          1934.000000
       3607.000000
       3607.000000
         1934.000000
       3607.000000
       3607.000000
          1934.000000
       3607.000000
       3607.000000
         1934.000000
      ...
    
    
      mean
          8.968866
          2.125333
          1.045769
       2250.839659
       1.205455e+08
          1.253093
          4.245633
          1879.573940
          0.979999
          4.245633
          4234.968563
          1.324021
          4.245633
          688.655584
          1.019143
          4.245633
          1615.082937
          1.413423
          4.245633
          133.309152
      ...
    
    
      std
         32.760629
          2.016037
          0.323067
        821.546172
       6.061890e+08
          3.420274
         14.467056
         54641.390337
          0.219848
         14.467056
        185202.322705
          5.738091
         14.467056
        19673.160265
          0.249831
         14.467056
         39119.590875
          6.517296
         14.467056
         4104.614984
      ...
    
    
      min
          1.000000
          1.000000
          0.000000
        748.403730
       0.000000e+00
          0.001000
          1.000000
             0.000000
          0.004000
          1.000000
             0.000000
          0.001000
          1.000000
            0.000000
          0.000000
          1.000000
             0.000000
          0.001000
          1.000000
            0.000000
      ...
    
    
      25%
          2.000000
          1.000000
          1.000000
       1607.862050
       5.121500e+06
          0.630000
          1.000000
             7.525000
          0.876500
          1.000000
             4.100000
          0.700000
          1.000000
            6.925000
          0.893000
          1.000000
             4.500000
          0.659000
          1.000000
            7.100000
      ...
    
    
      50%
          4.000000
          2.000000
          1.000000
       2138.035260
       1.762000e+07
          0.969000
          2.000000
            16.700000
          0.983000
          2.000000
             8.300000
          0.976000
          2.000000
           15.000000
          1.017000
          2.000000
             9.500000
          0.926000
          2.000000
           15.650000
      ...
    
    
      75%
          8.000000
          2.000000
          1.000000
       2720.846165
       5.717500e+07
          1.543500
          4.000000
            31.200000
          1.091000
          4.000000
            14.800000
          1.504500
          4.000000
           29.100000
          1.152000
          4.000000
            16.900000
          1.629500
          4.000000
           30.000000
      ...
    
    
      max
       1297.000000
         32.000000
          7.000000
       5778.788230
       1.631000e+10
        196.158000
        604.000000
       2283224.800000
          3.013000
        604.000000
       8144706.400000
        338.138000
        604.000000
       829247.700000
          4.507000
        604.000000
       1194366.200000
        378.199000
        604.000000
       179743.600000
      ...
    
  

8 rows × 42 columns



In [4]:

    
print data.columns









    



Index([u'Sequence', u'# PSMs', u'# Proteins', u'# Protein Groups', u'Protein Group Accessions', u'Modifications', u'MH+ [Da]', u'phosphoRS Site Probabilities', u'Area', u'126/(127_C+128_C+129_C+130_C)', u'126/(127_C+128_C+129_C+130_C) Count', u'126/(127_C+128_C+129_C+130_C) Variability [%]', u'127_C/(127_C+128_C+129_C+130_C)', u'127_C/(127_C+128_C+129_C+130_C) Count', u'127_C/(127_C+128_C+129_C+130_C) Variability [%]', u'127_N/(127_C+128_C+129_C+130_C)', u'127_N/(127_C+128_C+129_C+130_C) Count', u'127_N/(127_C+128_C+129_C+130_C) Variability [%]', u'128_C/(127_C+128_C+129_C+130_C)', u'128_C/(127_C+128_C+129_C+130_C) Count', u'128_C/(127_C+128_C+129_C+130_C) Variability [%]', u'128_N/(127_C+128_C+129_C+130_C)', u'128_N/(127_C+128_C+129_C+130_C) Count', u'128_N/(127_C+128_C+129_C+130_C) Variability [%]', u'129_C/(127_C+128_C+129_C+130_C)', u'129_C/(127_C+128_C+129_C+130_C) Count', u'129_C/(127_C+128_C+129_C+130_C) Variability [%]', u'129_N/(127_C+128_C+129_C+130_C)', u'129_N/(127_C+128_C+129_C+130_C) Count', u'129_N/(127_C+128_C+129_C+130_C) Variability [%]', u'130_C/(127_C+128_C+129_C+130_C)', u'130_C/(127_C+128_C+129_C+130_C) Count', u'130_C/(127_C+128_C+129_C+130_C) Variability [%]', u'130_N/(127_C+128_C+129_C+130_C)', u'130_N/(127_C+128_C+129_C+130_C) Count', u'130_N/(127_C+128_C+129_C+130_C) Variability [%]', u'131/(127_C+128_C+129_C+130_C)', u'131/(127_C+128_C+129_C+130_C) Count', u'131/(127_C+128_C+129_C+130_C) Variability [%]', u'q-Value', u'PEP', u'A3', u'XCorr A3', u'Probability A3', u'A7', u'XCorr A7', u'Probability A7', u'# Missed Cleavages'], dtype='object')



In [5]:

    
#Identify ratio columns for anova
ratio_cols = []
for col in data.columns.tolist():
    if re.search(r"^[0-9_CN]+/\([0-9_CN]+?(\+[0-9_CN]+?)*\)$",col):
        ratio_cols.append(col)
print ratio_cols
print len(ratio_cols)









    



['126/(127_C+128_C+129_C+130_C)', '127_C/(127_C+128_C+129_C+130_C)', '127_N/(127_C+128_C+129_C+130_C)', '128_C/(127_C+128_C+129_C+130_C)', '128_N/(127_C+128_C+129_C+130_C)', '129_C/(127_C+128_C+129_C+130_C)', '129_N/(127_C+128_C+129_C+130_C)', '130_C/(127_C+128_C+129_C+130_C)', '130_N/(127_C+128_C+129_C+130_C)', '131/(127_C+128_C+129_C+130_C)']
10



In [6]:

    
#Remove rows which have NA data in the ratio columns
na_array = np.zeros( len(data), dtype=bool)
for col in ratio_cols:
    na_array |=  pd.isnull(data[col]) 

data_clean = data[ ~na_array]
print "Removed rows with NA values: "+ str(np.sum(na_array))









    



Removed rows with NA values: 151



In [7]:

    
data_clean.describe()









    Out[7]:






  
    
      
      # PSMs
      # Proteins
      # Protein Groups
      MH+ [Da]
      Area
      126/(127_C+128_C+129_C+130_C)
      126/(127_C+128_C+129_C+130_C) Count
      126/(127_C+128_C+129_C+130_C) Variability [%]
      127_C/(127_C+128_C+129_C+130_C)
      127_C/(127_C+128_C+129_C+130_C) Count
      127_C/(127_C+128_C+129_C+130_C) Variability [%]
      127_N/(127_C+128_C+129_C+130_C)
      127_N/(127_C+128_C+129_C+130_C) Count
      127_N/(127_C+128_C+129_C+130_C) Variability [%]
      128_C/(127_C+128_C+129_C+130_C)
      128_C/(127_C+128_C+129_C+130_C) Count
      128_C/(127_C+128_C+129_C+130_C) Variability [%]
      128_N/(127_C+128_C+129_C+130_C)
      128_N/(127_C+128_C+129_C+130_C) Count
      128_N/(127_C+128_C+129_C+130_C) Variability [%]
      
    
  
  
    
      count
       3607.000000
       3607.000000
       3607
       3607.000000
       3.607000e+03
       3607.000000
       3607.000000
          1934.000000
       3607.000000
       3607.000000
          1934.000000
       3607.000000
       3607.000000
         1934.000000
       3607.000000
       3607.000000
          1934.000000
       3607.000000
       3607.000000
         1934.000000
      ...
    
    
      mean
          9.037982
          1.966177
          1
       2274.844613
       1.216479e+08
          1.253093
          4.245633
          1879.573940
          0.979999
          4.245633
          4234.968563
          1.324021
          4.245633
          688.655584
          1.019143
          4.245633
          1615.082937
          1.413423
          4.245633
          133.309152
      ...
    
    
      std
         33.364330
          1.630943
          0
        819.403222
       6.140113e+08
          3.420274
         14.467056
         54641.390337
          0.219848
         14.467056
        185202.322705
          5.738091
         14.467056
        19673.160265
          0.249831
         14.467056
         39119.590875
          6.517296
         14.467056
         4104.614984
      ...
    
    
      min
          1.000000
          1.000000
          1
        748.403730
       0.000000e+00
          0.001000
          1.000000
             0.000000
          0.004000
          1.000000
             0.000000
          0.001000
          1.000000
            0.000000
          0.000000
          1.000000
             0.000000
          0.001000
          1.000000
            0.000000
      ...
    
    
      25%
          2.000000
          1.000000
          1
       1635.879625
       5.073000e+06
          0.630000
          1.000000
             7.525000
          0.876500
          1.000000
             4.100000
          0.700000
          1.000000
            6.925000
          0.893000
          1.000000
             4.500000
          0.659000
          1.000000
            7.100000
      ...
    
    
      50%
          4.000000
          1.000000
          1
       2157.235450
       1.734000e+07
          0.969000
          2.000000
            16.700000
          0.983000
          2.000000
             8.300000
          0.976000
          2.000000
           15.000000
          1.017000
          2.000000
             9.500000
          0.926000
          2.000000
           15.650000
      ...
    
    
      75%
          8.000000
          2.000000
          1
       2749.161230
       5.637500e+07
          1.543500
          4.000000
            31.200000
          1.091000
          4.000000
            14.800000
          1.504500
          4.000000
           29.100000
          1.152000
          4.000000
            16.900000
          1.629500
          4.000000
           30.000000
      ...
    
    
      max
       1297.000000
         32.000000
          1
       5778.788230
       1.631000e+10
        196.158000
        604.000000
       2283224.800000
          3.013000
        604.000000
       8144706.400000
        338.138000
        604.000000
       829247.700000
          4.507000
        604.000000
       1194366.200000
        378.199000
        604.000000
       179743.600000
      ...
    
  

8 rows × 42 columns



In [8]:

    
#Count number of phopho modifications per peptide, and create two groups, with phospho and without phospho
data_clean["phospho_count"] = data_clean["Modifications"].map( lambda x: x.count("Phospho"))
data_phospho = data_clean.ix[ data_clean["phospho_count"] > 0 ]
data_nophospho = data_clean.ix[ data_clean["phospho_count"] == 0 ]



In [9]:

    
print len(data_phospho)
print len(data_nophospho)



In [10]:

    
data_nophospho.describe()









    Out[10]:






  
    
      
      # PSMs
      # Proteins
      # Protein Groups
      MH+ [Da]
      Area
      126/(127_C+128_C+129_C+130_C)
      126/(127_C+128_C+129_C+130_C) Count
      126/(127_C+128_C+129_C+130_C) Variability [%]
      127_C/(127_C+128_C+129_C+130_C)
      127_C/(127_C+128_C+129_C+130_C) Count
      127_C/(127_C+128_C+129_C+130_C) Variability [%]
      127_N/(127_C+128_C+129_C+130_C)
      127_N/(127_C+128_C+129_C+130_C) Count
      127_N/(127_C+128_C+129_C+130_C) Variability [%]
      128_C/(127_C+128_C+129_C+130_C)
      128_C/(127_C+128_C+129_C+130_C) Count
      128_C/(127_C+128_C+129_C+130_C) Variability [%]
      128_N/(127_C+128_C+129_C+130_C)
      128_N/(127_C+128_C+129_C+130_C) Count
      128_N/(127_C+128_C+129_C+130_C) Variability [%]
      
    
  
  
    
      count
       2562.000000
       2562.000000
       2562
       2562.000000
       2.562000e+03
       2562.000000
       2562.000000
         1301.000000
       2562.000000
       2562.000000
          1301.000000
       2562.000000
       2562.000000
         1301.000000
       2562.000000
       2562.000000
          1301.000000
       2562.000000
       2562.000000
        1301.000000
      ...
    
    
      mean
          8.060500
          1.904372
          1
       2145.556998
       1.376070e+08
          1.162705
          3.986729
          937.841814
          0.974311
          3.986729
          6286.786472
          1.257511
          3.986729
          912.802690
          1.022829
          3.986729
          1802.606457
          1.297855
          3.986729
          45.433666
      ...
    
    
      std
         28.491198
          1.682486
          0
        812.704882
       6.394598e+08
          3.968624
         11.490817
        20668.120665
          0.222514
         11.490817
        225806.198274
          6.742117
         11.490817
        23734.531274
          0.259089
         11.490817
         42859.686823
          7.536560
         11.490817
         466.808539
      ...
    
    
      min
          1.000000
          1.000000
          1
        748.403730
       0.000000e+00
          0.001000
          1.000000
            0.000000
          0.015000
          1.000000
             0.000000
          0.001000
          1.000000
            0.000000
          0.000000
          1.000000
             0.000000
          0.001000
          1.000000
           0.000000
      ...
    
    
      25%
          2.000000
          1.000000
          1
       1522.839555
       4.895750e+06
          0.596000
          1.000000
            6.800000
          0.863250
          1.000000
             3.600000
          0.681000
          1.000000
            6.100000
          0.882000
          1.000000
             4.000000
          0.632000
          1.000000
           6.300000
      ...
    
    
      50%
          3.000000
          1.000000
          1
       1991.594115
       1.767500e+07
          0.862500
          2.000000
           15.200000
          0.978000
          2.000000
             7.400000
          0.917500
          2.000000
           13.400000
          1.017000
          2.000000
             8.800000
          0.822000
          2.000000
          14.400000
      ...
    
    
      75%
          6.000000
          2.000000
          1
       2568.155660
       6.139750e+07
          1.399750
          3.000000
           29.900000
          1.094750
          3.000000
            14.000000
          1.332000
          3.000000
           26.900000
          1.156000
          3.000000
            16.000000
          1.422000
          3.000000
          28.800000
      ...
    
    
      max
        977.000000
         32.000000
          1
       5778.788230
       1.318000e+10
        196.158000
        268.000000
       558711.600000
          3.013000
        268.000000
       8144706.400000
        338.138000
        268.000000
       829247.700000
          4.507000
        268.000000
       1194366.200000
        378.199000
        268.000000
       15154.400000
      ...
    
  

8 rows × 43 columns



In [11]:

    
from scipy import stats



In [12]:

    
def extract_groups(row,ctrl_group,*groups):
    group_values = []
    ratio_suffix = "/("+"+".join(sorted(ctrl_group))+")"
    #Process control group
    temp_group = []
    for label in ctrl_group:
        temp_group.append( row[label+ratio_suffix] )
    group_values.append(tuple(temp_group))
    
    #Process rest of groups
    for group in groups:
        temp_group = []
        for label in group:
            temp_group.append( row[label+ratio_suffix] )
        group_values.append(tuple(temp_group))
    return group_values



In [13]:

    
#Groups
g_ctrl = ("128_C","129_C","127_C","130_C")
g_1h = ("126","127_N")
g_2h = ("128_N","129_N")
g_4h = ("130_N","131")

#Control group must be the first for extract_groups function to work
groups = ( g_ctrl, g_1h, g_2h, g_4h )



In [14]:

    
P_VALUE =1 
data_phospho["anova_pval"]  = data_phospho.apply(lambda row: stats.f_oneway( *extract_groups(row,*groups) )[P_VALUE],axis=1)
data_nophospho["anova_pval"]= data_nophospho.apply(lambda row: stats.f_oneway( *extract_groups(row,*groups))[P_VALUE],axis=1)



In [15]:

    
P_VALUE =1 
data_phospho["kruskal_pval"] = data_phospho.apply(lambda row: stats.mstats.kruskalwallis( *extract_groups(row,*groups))[P_VALUE],axis=1)
data_nophospho["kruskal_pval"] = data_nophospho.apply(lambda row: stats.mstats.kruskalwallis( *extract_groups(row,*groups))[P_VALUE],axis=1)



In [28]:

    
#Phospho
alpha = 0.05
n_samples = len(data_phospho)
bonf_corrected_alpha = alpha/n_samples
with open("phospho_anova_accessions_alpha0.05_bonferroni.tsv","w") as phospho_anova_fh:
    data_phospho.ix[ data_phospho["anova_pval"] < bonf_corrected_alpha ].to_csv(phospho_anova_fh,sep="\t",index=None)



In [ ]:

	# PSMs	# Proteins	# Protein Groups	MH+ [Da]	Area	126/(127_C+128_C+129_C+130_C)	126/(127_C+128_C+129_C+130_C) Count	126/(127_C+128_C+129_C+130_C) Variability [%]	127_C/(127_C+128_C+129_C+130_C)	127_C/(127_C+128_C+129_C+130_C) Count	127_C/(127_C+128_C+129_C+130_C) Variability [%]	127_N/(127_C+128_C+129_C+130_C)	127_N/(127_C+128_C+129_C+130_C) Count	127_N/(127_C+128_C+129_C+130_C) Variability [%]	128_C/(127_C+128_C+129_C+130_C)	128_C/(127_C+128_C+129_C+130_C) Count	128_C/(127_C+128_C+129_C+130_C) Variability [%]	128_N/(127_C+128_C+129_C+130_C)	128_N/(127_C+128_C+129_C+130_C) Count	128_N/(127_C+128_C+129_C+130_C) Variability [%]
count	3758.000000	3758.000000	3758.000000	3758.000000	3.758000e+03	3607.000000	3607.000000	1934.000000	3607.000000	3607.000000	1934.000000	3607.000000	3607.000000	1934.000000	3607.000000	3607.000000	1934.000000	3607.000000	3607.000000	1934.000000	...
mean	8.968866	2.125333	1.045769	2250.839659	1.205455e+08	1.253093	4.245633	1879.573940	0.979999	4.245633	4234.968563	1.324021	4.245633	688.655584	1.019143	4.245633	1615.082937	1.413423	4.245633	133.309152	...
std	32.760629	2.016037	0.323067	821.546172	6.061890e+08	3.420274	14.467056	54641.390337	0.219848	14.467056	185202.322705	5.738091	14.467056	19673.160265	0.249831	14.467056	39119.590875	6.517296	14.467056	4104.614984	...
min	1.000000	1.000000	0.000000	748.403730	0.000000e+00	0.001000	1.000000	0.000000	0.004000	1.000000	0.000000	0.001000	1.000000	0.000000	0.000000	1.000000	0.000000	0.001000	1.000000	0.000000	...
25%	2.000000	1.000000	1.000000	1607.862050	5.121500e+06	0.630000	1.000000	7.525000	0.876500	1.000000	4.100000	0.700000	1.000000	6.925000	0.893000	1.000000	4.500000	0.659000	1.000000	7.100000	...
50%	4.000000	2.000000	1.000000	2138.035260	1.762000e+07	0.969000	2.000000	16.700000	0.983000	2.000000	8.300000	0.976000	2.000000	15.000000	1.017000	2.000000	9.500000	0.926000	2.000000	15.650000	...
75%	8.000000	2.000000	1.000000	2720.846165	5.717500e+07	1.543500	4.000000	31.200000	1.091000	4.000000	14.800000	1.504500	4.000000	29.100000	1.152000	4.000000	16.900000	1.629500	4.000000	30.000000	...
max	1297.000000	32.000000	7.000000	5778.788230	1.631000e+10	196.158000	604.000000	2283224.800000	3.013000	604.000000	8144706.400000	338.138000	604.000000	829247.700000	4.507000	604.000000	1194366.200000	378.199000	604.000000	179743.600000	...

	# PSMs	# Proteins	# Protein Groups	MH+ [Da]	Area	126/(127_C+128_C+129_C+130_C)	126/(127_C+128_C+129_C+130_C) Count	126/(127_C+128_C+129_C+130_C) Variability [%]	127_C/(127_C+128_C+129_C+130_C)	127_C/(127_C+128_C+129_C+130_C) Count	127_C/(127_C+128_C+129_C+130_C) Variability [%]	127_N/(127_C+128_C+129_C+130_C)	127_N/(127_C+128_C+129_C+130_C) Count	127_N/(127_C+128_C+129_C+130_C) Variability [%]	128_C/(127_C+128_C+129_C+130_C)	128_C/(127_C+128_C+129_C+130_C) Count	128_C/(127_C+128_C+129_C+130_C) Variability [%]	128_N/(127_C+128_C+129_C+130_C)	128_N/(127_C+128_C+129_C+130_C) Count	128_N/(127_C+128_C+129_C+130_C) Variability [%]
count	2562.000000	2562.000000	2562	2562.000000	2.562000e+03	2562.000000	2562.000000	1301.000000	2562.000000	2562.000000	1301.000000	2562.000000	2562.000000	1301.000000	2562.000000	2562.000000	1301.000000	2562.000000	2562.000000	1301.000000	...
mean	8.060500	1.904372	1	2145.556998	1.376070e+08	1.162705	3.986729	937.841814	0.974311	3.986729	6286.786472	1.257511	3.986729	912.802690	1.022829	3.986729	1802.606457	1.297855	3.986729	45.433666	...
std	28.491198	1.682486	0	812.704882	6.394598e+08	3.968624	11.490817	20668.120665	0.222514	11.490817	225806.198274	6.742117	11.490817	23734.531274	0.259089	11.490817	42859.686823	7.536560	11.490817	466.808539	...
min	1.000000	1.000000	1	748.403730	0.000000e+00	0.001000	1.000000	0.000000	0.015000	1.000000	0.000000	0.001000	1.000000	0.000000	0.000000	1.000000	0.000000	0.001000	1.000000	0.000000	...
25%	2.000000	1.000000	1	1522.839555	4.895750e+06	0.596000	1.000000	6.800000	0.863250	1.000000	3.600000	0.681000	1.000000	6.100000	0.882000	1.000000	4.000000	0.632000	1.000000	6.300000	...
50%	3.000000	1.000000	1	1991.594115	1.767500e+07	0.862500	2.000000	15.200000	0.978000	2.000000	7.400000	0.917500	2.000000	13.400000	1.017000	2.000000	8.800000	0.822000	2.000000	14.400000	...
75%	6.000000	2.000000	1	2568.155660	6.139750e+07	1.399750	3.000000	29.900000	1.094750	3.000000	14.000000	1.332000	3.000000	26.900000	1.156000	3.000000	16.000000	1.422000	3.000000	28.800000	...
max	977.000000	32.000000	1	5778.788230	1.318000e+10	196.158000	268.000000	558711.600000	3.013000	268.000000	8144706.400000	338.138000	268.000000	829247.700000	4.507000	268.000000	1194366.200000	378.199000	268.000000	15154.400000	...