In [1]:
import pandas as pd
import re

In [2]:
data = pd.read_table("ElenaP_20140213_MCF7_MG132_TMT10plex_1mgIPG25-37_fr12-72_peps.txt")

In [3]:
data.describe()


Out[3]:
# PSMs # Proteins # Protein Groups MH+ [Da] Area 126/(127_C+128_C+129_C+130_C) 126/(127_C+128_C+129_C+130_C) Count 126/(127_C+128_C+129_C+130_C) Variability [%] 127_C/(127_C+128_C+129_C+130_C) 127_C/(127_C+128_C+129_C+130_C) Count 127_C/(127_C+128_C+129_C+130_C) Variability [%] 127_N/(127_C+128_C+129_C+130_C) 127_N/(127_C+128_C+129_C+130_C) Count 127_N/(127_C+128_C+129_C+130_C) Variability [%] 128_C/(127_C+128_C+129_C+130_C) 128_C/(127_C+128_C+129_C+130_C) Count 128_C/(127_C+128_C+129_C+130_C) Variability [%] 128_N/(127_C+128_C+129_C+130_C) 128_N/(127_C+128_C+129_C+130_C) Count 128_N/(127_C+128_C+129_C+130_C) Variability [%]
count 3758.000000 3758.000000 3758.000000 3758.000000 3.758000e+03 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 ...
mean 8.968866 2.125333 1.045769 2250.839659 1.205455e+08 1.253093 4.245633 1879.573940 0.979999 4.245633 4234.968563 1.324021 4.245633 688.655584 1.019143 4.245633 1615.082937 1.413423 4.245633 133.309152 ...
std 32.760629 2.016037 0.323067 821.546172 6.061890e+08 3.420274 14.467056 54641.390337 0.219848 14.467056 185202.322705 5.738091 14.467056 19673.160265 0.249831 14.467056 39119.590875 6.517296 14.467056 4104.614984 ...
min 1.000000 1.000000 0.000000 748.403730 0.000000e+00 0.001000 1.000000 0.000000 0.004000 1.000000 0.000000 0.001000 1.000000 0.000000 0.000000 1.000000 0.000000 0.001000 1.000000 0.000000 ...
25% 2.000000 1.000000 1.000000 1607.862050 5.121500e+06 0.630000 1.000000 7.525000 0.876500 1.000000 4.100000 0.700000 1.000000 6.925000 0.893000 1.000000 4.500000 0.659000 1.000000 7.100000 ...
50% 4.000000 2.000000 1.000000 2138.035260 1.762000e+07 0.969000 2.000000 16.700000 0.983000 2.000000 8.300000 0.976000 2.000000 15.000000 1.017000 2.000000 9.500000 0.926000 2.000000 15.650000 ...
75% 8.000000 2.000000 1.000000 2720.846165 5.717500e+07 1.543500 4.000000 31.200000 1.091000 4.000000 14.800000 1.504500 4.000000 29.100000 1.152000 4.000000 16.900000 1.629500 4.000000 30.000000 ...
max 1297.000000 32.000000 7.000000 5778.788230 1.631000e+10 196.158000 604.000000 2283224.800000 3.013000 604.000000 8144706.400000 338.138000 604.000000 829247.700000 4.507000 604.000000 1194366.200000 378.199000 604.000000 179743.600000 ...

8 rows × 42 columns


In [4]:
print data.columns


Index([u'Sequence', u'# PSMs', u'# Proteins', u'# Protein Groups', u'Protein Group Accessions', u'Modifications', u'MH+ [Da]', u'phosphoRS Site Probabilities', u'Area', u'126/(127_C+128_C+129_C+130_C)', u'126/(127_C+128_C+129_C+130_C) Count', u'126/(127_C+128_C+129_C+130_C) Variability [%]', u'127_C/(127_C+128_C+129_C+130_C)', u'127_C/(127_C+128_C+129_C+130_C) Count', u'127_C/(127_C+128_C+129_C+130_C) Variability [%]', u'127_N/(127_C+128_C+129_C+130_C)', u'127_N/(127_C+128_C+129_C+130_C) Count', u'127_N/(127_C+128_C+129_C+130_C) Variability [%]', u'128_C/(127_C+128_C+129_C+130_C)', u'128_C/(127_C+128_C+129_C+130_C) Count', u'128_C/(127_C+128_C+129_C+130_C) Variability [%]', u'128_N/(127_C+128_C+129_C+130_C)', u'128_N/(127_C+128_C+129_C+130_C) Count', u'128_N/(127_C+128_C+129_C+130_C) Variability [%]', u'129_C/(127_C+128_C+129_C+130_C)', u'129_C/(127_C+128_C+129_C+130_C) Count', u'129_C/(127_C+128_C+129_C+130_C) Variability [%]', u'129_N/(127_C+128_C+129_C+130_C)', u'129_N/(127_C+128_C+129_C+130_C) Count', u'129_N/(127_C+128_C+129_C+130_C) Variability [%]', u'130_C/(127_C+128_C+129_C+130_C)', u'130_C/(127_C+128_C+129_C+130_C) Count', u'130_C/(127_C+128_C+129_C+130_C) Variability [%]', u'130_N/(127_C+128_C+129_C+130_C)', u'130_N/(127_C+128_C+129_C+130_C) Count', u'130_N/(127_C+128_C+129_C+130_C) Variability [%]', u'131/(127_C+128_C+129_C+130_C)', u'131/(127_C+128_C+129_C+130_C) Count', u'131/(127_C+128_C+129_C+130_C) Variability [%]', u'q-Value', u'PEP', u'A3', u'XCorr A3', u'Probability A3', u'A7', u'XCorr A7', u'Probability A7', u'# Missed Cleavages'], dtype='object')

In [5]:
#Identify ratio columns for anova
ratio_cols = []
for col in data.columns.tolist():
    if re.search(r"^[0-9_CN]+/\([0-9_CN]+?(\+[0-9_CN]+?)*\)$",col):
        ratio_cols.append(col)
print ratio_cols
print len(ratio_cols)


['126/(127_C+128_C+129_C+130_C)', '127_C/(127_C+128_C+129_C+130_C)', '127_N/(127_C+128_C+129_C+130_C)', '128_C/(127_C+128_C+129_C+130_C)', '128_N/(127_C+128_C+129_C+130_C)', '129_C/(127_C+128_C+129_C+130_C)', '129_N/(127_C+128_C+129_C+130_C)', '130_C/(127_C+128_C+129_C+130_C)', '130_N/(127_C+128_C+129_C+130_C)', '131/(127_C+128_C+129_C+130_C)']
10

In [6]:
#Remove rows which have NA data in the ratio columns
na_array = np.zeros( len(data), dtype=bool)
for col in ratio_cols:
    na_array |=  pd.isnull(data[col]) 

data_clean = data[ ~na_array]
print "Removed rows with NA values: "+ str(np.sum(na_array))


Removed rows with NA values: 151

In [7]:
data_clean.describe()


Out[7]:
# PSMs # Proteins # Protein Groups MH+ [Da] Area 126/(127_C+128_C+129_C+130_C) 126/(127_C+128_C+129_C+130_C) Count 126/(127_C+128_C+129_C+130_C) Variability [%] 127_C/(127_C+128_C+129_C+130_C) 127_C/(127_C+128_C+129_C+130_C) Count 127_C/(127_C+128_C+129_C+130_C) Variability [%] 127_N/(127_C+128_C+129_C+130_C) 127_N/(127_C+128_C+129_C+130_C) Count 127_N/(127_C+128_C+129_C+130_C) Variability [%] 128_C/(127_C+128_C+129_C+130_C) 128_C/(127_C+128_C+129_C+130_C) Count 128_C/(127_C+128_C+129_C+130_C) Variability [%] 128_N/(127_C+128_C+129_C+130_C) 128_N/(127_C+128_C+129_C+130_C) Count 128_N/(127_C+128_C+129_C+130_C) Variability [%]
count 3607.000000 3607.000000 3607 3607.000000 3.607000e+03 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 3607.000000 3607.000000 1934.000000 ...
mean 9.037982 1.966177 1 2274.844613 1.216479e+08 1.253093 4.245633 1879.573940 0.979999 4.245633 4234.968563 1.324021 4.245633 688.655584 1.019143 4.245633 1615.082937 1.413423 4.245633 133.309152 ...
std 33.364330 1.630943 0 819.403222 6.140113e+08 3.420274 14.467056 54641.390337 0.219848 14.467056 185202.322705 5.738091 14.467056 19673.160265 0.249831 14.467056 39119.590875 6.517296 14.467056 4104.614984 ...
min 1.000000 1.000000 1 748.403730 0.000000e+00 0.001000 1.000000 0.000000 0.004000 1.000000 0.000000 0.001000 1.000000 0.000000 0.000000 1.000000 0.000000 0.001000 1.000000 0.000000 ...
25% 2.000000 1.000000 1 1635.879625 5.073000e+06 0.630000 1.000000 7.525000 0.876500 1.000000 4.100000 0.700000 1.000000 6.925000 0.893000 1.000000 4.500000 0.659000 1.000000 7.100000 ...
50% 4.000000 1.000000 1 2157.235450 1.734000e+07 0.969000 2.000000 16.700000 0.983000 2.000000 8.300000 0.976000 2.000000 15.000000 1.017000 2.000000 9.500000 0.926000 2.000000 15.650000 ...
75% 8.000000 2.000000 1 2749.161230 5.637500e+07 1.543500 4.000000 31.200000 1.091000 4.000000 14.800000 1.504500 4.000000 29.100000 1.152000 4.000000 16.900000 1.629500 4.000000 30.000000 ...
max 1297.000000 32.000000 1 5778.788230 1.631000e+10 196.158000 604.000000 2283224.800000 3.013000 604.000000 8144706.400000 338.138000 604.000000 829247.700000 4.507000 604.000000 1194366.200000 378.199000 604.000000 179743.600000 ...

8 rows × 42 columns


In [8]:
#Count number of phopho modifications per peptide, and create two groups, with phospho and without phospho
data_clean["phospho_count"] = data_clean["Modifications"].map( lambda x: x.count("Phospho"))
data_phospho = data_clean.ix[ data_clean["phospho_count"] > 0 ]
data_nophospho = data_clean.ix[ data_clean["phospho_count"] == 0 ]

In [9]:
print len(data_phospho)
print len(data_nophospho)


1045
2562

In [10]:
data_nophospho.describe()


Out[10]:
# PSMs # Proteins # Protein Groups MH+ [Da] Area 126/(127_C+128_C+129_C+130_C) 126/(127_C+128_C+129_C+130_C) Count 126/(127_C+128_C+129_C+130_C) Variability [%] 127_C/(127_C+128_C+129_C+130_C) 127_C/(127_C+128_C+129_C+130_C) Count 127_C/(127_C+128_C+129_C+130_C) Variability [%] 127_N/(127_C+128_C+129_C+130_C) 127_N/(127_C+128_C+129_C+130_C) Count 127_N/(127_C+128_C+129_C+130_C) Variability [%] 128_C/(127_C+128_C+129_C+130_C) 128_C/(127_C+128_C+129_C+130_C) Count 128_C/(127_C+128_C+129_C+130_C) Variability [%] 128_N/(127_C+128_C+129_C+130_C) 128_N/(127_C+128_C+129_C+130_C) Count 128_N/(127_C+128_C+129_C+130_C) Variability [%]
count 2562.000000 2562.000000 2562 2562.000000 2.562000e+03 2562.000000 2562.000000 1301.000000 2562.000000 2562.000000 1301.000000 2562.000000 2562.000000 1301.000000 2562.000000 2562.000000 1301.000000 2562.000000 2562.000000 1301.000000 ...
mean 8.060500 1.904372 1 2145.556998 1.376070e+08 1.162705 3.986729 937.841814 0.974311 3.986729 6286.786472 1.257511 3.986729 912.802690 1.022829 3.986729 1802.606457 1.297855 3.986729 45.433666 ...
std 28.491198 1.682486 0 812.704882 6.394598e+08 3.968624 11.490817 20668.120665 0.222514 11.490817 225806.198274 6.742117 11.490817 23734.531274 0.259089 11.490817 42859.686823 7.536560 11.490817 466.808539 ...
min 1.000000 1.000000 1 748.403730 0.000000e+00 0.001000 1.000000 0.000000 0.015000 1.000000 0.000000 0.001000 1.000000 0.000000 0.000000 1.000000 0.000000 0.001000 1.000000 0.000000 ...
25% 2.000000 1.000000 1 1522.839555 4.895750e+06 0.596000 1.000000 6.800000 0.863250 1.000000 3.600000 0.681000 1.000000 6.100000 0.882000 1.000000 4.000000 0.632000 1.000000 6.300000 ...
50% 3.000000 1.000000 1 1991.594115 1.767500e+07 0.862500 2.000000 15.200000 0.978000 2.000000 7.400000 0.917500 2.000000 13.400000 1.017000 2.000000 8.800000 0.822000 2.000000 14.400000 ...
75% 6.000000 2.000000 1 2568.155660 6.139750e+07 1.399750 3.000000 29.900000 1.094750 3.000000 14.000000 1.332000 3.000000 26.900000 1.156000 3.000000 16.000000 1.422000 3.000000 28.800000 ...
max 977.000000 32.000000 1 5778.788230 1.318000e+10 196.158000 268.000000 558711.600000 3.013000 268.000000 8144706.400000 338.138000 268.000000 829247.700000 4.507000 268.000000 1194366.200000 378.199000 268.000000 15154.400000 ...

8 rows × 43 columns


In [11]:
from scipy import stats

In [12]:
def extract_groups(row,ctrl_group,*groups):
    group_values = []
    ratio_suffix = "/("+"+".join(sorted(ctrl_group))+")"
    #Process control group
    temp_group = []
    for label in ctrl_group:
        temp_group.append( row[label+ratio_suffix] )
    group_values.append(tuple(temp_group))
    
    #Process rest of groups
    for group in groups:
        temp_group = []
        for label in group:
            temp_group.append( row[label+ratio_suffix] )
        group_values.append(tuple(temp_group))
    return group_values

In [13]:
#Groups
g_ctrl = ("128_C","129_C","127_C","130_C")
g_1h = ("126","127_N")
g_2h = ("128_N","129_N")
g_4h = ("130_N","131")

#Control group must be the first for extract_groups function to work
groups = ( g_ctrl, g_1h, g_2h, g_4h )

In [14]:
P_VALUE =1 
data_phospho["anova_pval"]  = data_phospho.apply(lambda row: stats.f_oneway( *extract_groups(row,*groups) )[P_VALUE],axis=1)
data_nophospho["anova_pval"]= data_nophospho.apply(lambda row: stats.f_oneway( *extract_groups(row,*groups))[P_VALUE],axis=1)

In [15]:
P_VALUE =1 
data_phospho["kruskal_pval"] = data_phospho.apply(lambda row: stats.mstats.kruskalwallis( *extract_groups(row,*groups))[P_VALUE],axis=1)
data_nophospho["kruskal_pval"] = data_nophospho.apply(lambda row: stats.mstats.kruskalwallis( *extract_groups(row,*groups))[P_VALUE],axis=1)

In [28]:
#Phospho
alpha = 0.05
n_samples = len(data_phospho)
bonf_corrected_alpha = alpha/n_samples
with open("phospho_anova_accessions_alpha0.05_bonferroni.tsv","w") as phospho_anova_fh:
    data_phospho.ix[ data_phospho["anova_pval"] < bonf_corrected_alpha ].to_csv(phospho_anova_fh,sep="\t",index=None)

In [ ]: