In [1]:
import pandas as pd
import re
In [2]:
data = pd.read_table("ElenaP_20140213_MCF7_MG132_TMT10plex_1mgIPG25-37_fr12-72_peps.txt")
In [3]:
data.describe()
Out[3]:
In [4]:
print data.columns
In [5]:
#Identify ratio columns for anova
ratio_cols = []
for col in data.columns.tolist():
if re.search(r"^[0-9_CN]+/\([0-9_CN]+?(\+[0-9_CN]+?)*\)$",col):
ratio_cols.append(col)
print ratio_cols
print len(ratio_cols)
In [6]:
#Remove rows which have NA data in the ratio columns
na_array = np.zeros( len(data), dtype=bool)
for col in ratio_cols:
na_array |= pd.isnull(data[col])
data_clean = data[ ~na_array]
print "Removed rows with NA values: "+ str(np.sum(na_array))
In [7]:
data_clean.describe()
Out[7]:
In [8]:
#Count number of phopho modifications per peptide, and create two groups, with phospho and without phospho
data_clean["phospho_count"] = data_clean["Modifications"].map( lambda x: x.count("Phospho"))
data_phospho = data_clean.ix[ data_clean["phospho_count"] > 0 ]
data_nophospho = data_clean.ix[ data_clean["phospho_count"] == 0 ]
In [9]:
print len(data_phospho)
print len(data_nophospho)
In [10]:
data_nophospho.describe()
Out[10]:
In [11]:
from scipy import stats
In [12]:
def extract_groups(row,ctrl_group,*groups):
group_values = []
ratio_suffix = "/("+"+".join(sorted(ctrl_group))+")"
#Process control group
temp_group = []
for label in ctrl_group:
temp_group.append( row[label+ratio_suffix] )
group_values.append(tuple(temp_group))
#Process rest of groups
for group in groups:
temp_group = []
for label in group:
temp_group.append( row[label+ratio_suffix] )
group_values.append(tuple(temp_group))
return group_values
In [13]:
#Groups
g_ctrl = ("128_C","129_C","127_C","130_C")
g_1h = ("126","127_N")
g_2h = ("128_N","129_N")
g_4h = ("130_N","131")
#Control group must be the first for extract_groups function to work
groups = ( g_ctrl, g_1h, g_2h, g_4h )
In [14]:
P_VALUE =1
data_phospho["anova_pval"] = data_phospho.apply(lambda row: stats.f_oneway( *extract_groups(row,*groups) )[P_VALUE],axis=1)
data_nophospho["anova_pval"]= data_nophospho.apply(lambda row: stats.f_oneway( *extract_groups(row,*groups))[P_VALUE],axis=1)
In [15]:
P_VALUE =1
data_phospho["kruskal_pval"] = data_phospho.apply(lambda row: stats.mstats.kruskalwallis( *extract_groups(row,*groups))[P_VALUE],axis=1)
data_nophospho["kruskal_pval"] = data_nophospho.apply(lambda row: stats.mstats.kruskalwallis( *extract_groups(row,*groups))[P_VALUE],axis=1)
In [28]:
#Phospho
alpha = 0.05
n_samples = len(data_phospho)
bonf_corrected_alpha = alpha/n_samples
with open("phospho_anova_accessions_alpha0.05_bonferroni.tsv","w") as phospho_anova_fh:
data_phospho.ix[ data_phospho["anova_pval"] < bonf_corrected_alpha ].to_csv(phospho_anova_fh,sep="\t",index=None)
In [ ]: