In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
In [7]:
con = pd.read_csv("../main_HUMAN.csv",index_col=0)
con.head()
Out[7]:
In [8]:
con["entropy"].hist(bins=50,color='grey')
Out[8]:
In [9]:
con["NumDBsContributing"].hist(bins=13,color='grey')
Out[9]:
In [10]:
con["Bimodality"].hist(bins=50,color='grey')
Out[10]:
Now we define 5 different datasets with different amounts of quality filter, ranged from least to most stringent
- All proteins included
- Entropy: < 1
- HGT: No filter
- NumDBsContributing: No Filter
- Bimodality: No Filter
- Entropy: No Filter
- HGT: Remove flagged proteins
- NumDBsContributing: No Filter
- Bimodality: No Filter
- Entropy: No Filter
- HGT: No Filter
- NumDBsContributing: > 3
- Bimodality: No Filter
- Entropy: No Filter
- HGT: No Filter
- NumDBsContributing: No Filter
- Bimodality: < 5
- Entropy: < 1
- HGT: Remove flagged proteins
- NumDBsContributing: > 3
- Bimodality: < 5
In [13]:
d1_archs = [prot for prot in con[con["modeAge"] == "Euk_Archaea"].index]
d1_bacs = [prot for prot in con[con["modeAge"] == "Euk+Bac"].index]
d2_archs = [prot for prot in con[(con["modeAge"] == "Euk_Archaea")
& (con["entropy"]<1)].index]
d2_bacs = [prot for prot in con[(con["modeAge"] == "Euk+Bac")
& (con["entropy"]<1)].index]
d3_archs = [prot for prot in con[(con["modeAge"] == "Euk_Archaea") &
(con["HGT_flag"] == False)].index]
d3_bacs = [prot for prot in con[(con["modeAge"] == "Euk+Bac") &
(con["HGT_flag"] == False)].index]
d4_archs = [prot for prot in con[(con["modeAge"] == "Euk_Archaea") &
(con["NumDBsContributing"] > 3)].index]
d4_bacs = [prot for prot in con[(con["modeAge"] == "Euk+Bac") &
(con["NumDBsContributing"] > 3)].index]
d5_archs = [prot for prot in con[(con["modeAge"] == "Euk_Archaea") &
(con["Bimodality"] < 5)].index]
d5_bacs = [prot for prot in con[(con["modeAge"] == "Euk+Bac") &
(con["Bimodality"] < 5)].index]
d6_archs = [prot for prot in con[(con["modeAge"] == "Euk_Archaea") &
(con["entropy"]<1) &
(con["HGT_flag"] == False) &
(con["Bimodality"] < 5) &
(con["NumDBsContributing"] > 3)].index]
d6_bacs = [prot for prot in con[(con["modeAge"] == "Euk+Bac") &
(con["entropy"]<1) &
(con["HGT_flag"] == False) &
(con["Bimodality"] < 5) &
(con["NumDBsContributing"] > 3)].index]
In [14]:
## Write out the data files for submission to g:Profiler
archs = [d1_archs,d2_archs,d3_archs,d4_archs,d5_archs,d6_archs]
bacs = [d1_bacs,d2_bacs,d3_bacs,d4_bacs,d5_bacs,d6_bacs]
for index,prots in enumerate(archs):
with open("d%d_archs.txt" % (index+1),'w') as out:
for i in prots:
out.write(i+"\n")
for index,prots in enumerate(bacs):
with open("d%d_bacs.txt" % (index+1),'w') as out:
for i in prots:
out.write(i+"\n")
Enrichment values were calculated with g:Profiler (http://biit.cs.ut.ee/gprofiler/index.cgi)
Now we get the gProfiler output for each dataset, and see how the p-values change with trimming.
In [16]:
infiles = !ls *enrichment.txt
infiles
Out[16]:
In [18]:
dsets = dict(zip(
["_".join(i.split("_")[:2]) for i in infiles],
[pd.read_table(i) for i in infiles]))
dsets['d1_archs'].head()
Out[18]:
In [19]:
for df in dsets:
dsets[df].sort(["t type","p-value"],inplace=True)
In [20]:
## Write data files once they've been sorted
for df in dsets:
dsets[df].to_csv(df+"_sorted.csv")
First define a function to munge the data. It selects one of the two kingdoms, 'bacs' or 'archs' (which is all we are investigating here), and one of the enrichment databases or GO terms (default "BP": biological process).
The p-values are converted to negative log10. It also adds a column that is the difference between dataset 1 and dataset 5.
In [21]:
def pValue_series(data_sets,kingdom='archs',t_type="BP"):
is_first = True
for df in sorted(data_sets.keys()):
if kingdom not in df: # ignore other kingdom, 'arch' or 'bac'
continue
trimdf = data_sets[df][data_sets[df]["t type"] == t_type]
trimdf.loc[:,"name"] = trimdf["t name"].map(lambda x: x.strip())
trimdf.loc[:,"log p-value"] = trimdf["p-value"].map(lambda x: -(np.log10(x)))
if is_first:
pvalues = trimdf[["name","log p-value"]].set_index("name")
pvalues.columns = ["log p-value" + "_" + df]
is_first = False
else:
temp = trimdf[["name","log p-value"]].set_index("name")
temp.columns = ["log p-value" + "_" + df]
pvalues = pd.concat([pvalues,temp],axis=1,copy=False)
pvalues.loc[:,"diff"] = pvalues.ix[:,0] - pvalues.ix[:,-1]
pvalues.sort("diff",inplace=True)
return pvalues
In [23]:
archs_pvalues_BP = pValue_series(dsets)
archs_pvalues_BP.head(n=10)
Out[23]:
In [24]:
bacs_pvalues_BP = pValue_series(dsets,'bacs')
In [25]:
archs_pvalues_CC = pValue_series(dsets,'archs',"CC")
bacs_pvalues_CC = pValue_series(dsets,'bacs',"CC")
In [26]:
#Write out p-value series
archs_pvalues_BP.to_csv("archs_p-values_BP.csv")
bacs_pvalues_BP.to_csv("bacs_p-values_BP.csv")
archs_pvalues_CC.to_csv("archs_p-values_CC.csv")
bacs_pvalues_CC.to_csv("bacs_p-values_CC.csv")
In [28]:
def plot_it(df,term,style='-',label=None):
df = df.fillna(0)
row = df.ix[term,:-1]
row.index = [i.split("_")[-2] for i in row.index]
row.plot(style=style,label=label,color='black',linewidth=2)
plt.xticks(rotation=70)
plt.xlabel("Dataset")
plt.ylabel("-log10(p-value)")
In [40]:
plot_it(archs_pvalues_BP,"RNA metabolic process",label="RNA metabolic process (BP)")
plot_it(archs_pvalues_BP,"ribosome biogenesis",style='--',label="ribosome biogenesis (BP)")
plot_it(archs_pvalues_CC,"nucleolus",style=':',label="nucleolus (CC)")
plot_it(archs_pvalues_CC,"cytosolic large ribosomal subunit",style='-.',label="cyto. large ribosomal subunit (CC)")
plt.legend(loc=0,prop={'size':9})
ax = plt.gca()
ax.set_ylim([0,50])
#plt.savefig("Archaea_p-values.svg")
In [41]:
plot_it(bacs_pvalues_BP,"amide biosynthetic process",label="amide biosynthetic process (BP)")
plot_it(bacs_pvalues_BP,"metabolic process",style='--',label="metabolic process (BP)")
plot_it(bacs_pvalues_CC,"intracellular organelle",style='-.',label="intracellular organelle (CC)")
plot_it(bacs_pvalues_BP,"catabolic process",style=':',label="mitochondrial membrane (BP)")
plt.legend(loc=2,prop={'size':9})
ax = plt.gca()
ax.set_ylim([0,100])
#plt.savefig("Bacteria_p-values.svg")
In [43]:
fig,axes = plt.subplots(2,2)
dfs_names = zip(["Euk_Archaea (BP)", "Euk+Bacteria (BP)","Euk_Archaea (CC)","Euk+Bacteria (CC)"],
[archs_pvalues_BP,bacs_pvalues_BP,archs_pvalues_CC,bacs_pvalues_CC])
# Greys
# flier_colors = ["#252525","#252525","#636363","#636363","#969696","#969696","#bdbdbd","#bdbdbd","#d9d9d9","#d9d9d9"]
flier_colors = ["#990000","#990000","#d7301f","#d7301f","#ef6548","#ef6548","#fc8d59","#fc8d59","#fdbb84","#fdbb84"]
index=0
for name,df in dfs_names:
colors = dict(boxes='black',caps='black',medians='black',whiskers='grey')
ax = axes.flat[index]
df = df.dropna()
df = df.ix[:,:-1]
df.columns = [i.split("_")[1] for i in df.columns]
box1 = df.plot(kind='box',ax=ax,notch=True,sym=".",color=colors,return_type='dict',bootstrap=5000)
for col,flier in zip(flier_colors,box1['fliers']):
plt.setp(flier,color=col)
ax.set_title(name)
ax.get_xaxis().set_ticks([])
index += 1
#plt.savefig("avg_p-values.svg")
In [ ]: