In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from itertools import combinations
%matplotlib inline
In [56]:
## Suppress copious FutureWarnings from pandas
import warnings
warnings.filterwarnings('ignore')
In [6]:
# Protein lengths are from Uniprot proteome. Number of domains was inferred with hmmer. Similarities here are derived
# from pairwise alignments (needleman-wunsch) between one-to-one human and mouse orthologs, defined by PhylomeDB.
# We look atHuman-Yeast pairs below, but chose to use Mouse here because there are more one-to-ones.
lengths = pd.read_csv("lengthMapping.csv",index_col=0,names=["protLength"]).sort_index()
numDomains = pd.read_csv("numDomains.csv",index_col=0).sort_index()
mouse_human = pd.read_csv("h-m_similarities.csv",index_col=0,names=["Hs-Mm_sim"]).sort_index()
In [7]:
nodeStats = pd.read_csv("../../nodeStats/nodeStats_HUMAN.csv",index_col=0).sort_index()[["Bimodality","NodeError"]]
In [8]:
lossTaxa = pd.read_csv("../../Errors/lossStats_HUMAN.csv",index_col=0).sort_index()[["mean"]]
lossTaxa.columns = ["meanLossTaxa"]
In [9]:
fracOverSplit = pd.read_csv("../../Errors/HUMAN_LDO_summary.csv",index_col=0).sort_index()
fracOverSplit.columns = ["fracOverSplit"]
In [55]:
all_stats = pd.concat([lengths,numDomains,mouse_human,nodeStats,lossTaxa,fracOverSplit],axis=1)
all_stats.dropna(inplace=True)
all_stats = all_stats.convert_objects(convert_numeric=True)
all_stats.columns = ["Protein Length","# Domains","Hs-Mm_evoSim","Bimodality",
"Node Error","Avg. Loss Taxa","% Oversplit"]
all_stats.head()
Out[55]:
In [26]:
fig, axes = plt.subplots(3,3,sharey=False)
index = 0
for col in all_stats.columns:
axis = axes.flat[index]
all_stats.plot(ax=axis,kind='scatter',x=col,y='Node Error',color='grey',title=col)
axis.set_ylabel('')
axis.set_xticklabels([])
index += 1
#fig.savefig("errorScatterPlot.png",dpi=200)
In [27]:
# Get Spearman rank correlations between all variables
correlations = all_stats.corr("spearman")
correlations
Out[27]:
In [28]:
absCorrs = correlations.applymap(np.abs)
absCorrs.drop("Node Error",axis=1).ix["Node Error",].plot(
kind='bar',color='grey',ylim=(0,1),title="Correlation with Node Error")
plt.ylabel("Absolute Spearman corr. coef.")
#plt.savefig("correlation_barPlot.svg")
Out[28]:
In [29]:
# correlations.to_csv("errorCorrelations.csv")
In [43]:
yeast_human = pd.read_csv("h-y_similarities.csv",index_col=0,names=["Hs-Sc_sim"]).sort_index()
In [57]:
error_yeastSims = pd.concat([nodeStats,yeast_human],axis=1).dropna()[["Hs-Sc_sim","NodeError"]]
error_yeastSims = error_yeastSims.convert_objects(convert_numeric=True)
print "Number of alignments: %d" % len(error_yeastSims)
In [52]:
error_yeastSims.corr()
Out[52]:
In [53]:
error_yeastSims.plot(kind='scatter',y="NodeError",x="Hs-Sc_sim",color='grey',title="Yeast-Human Distances Scatter")
#plt.savefig("Yeast-Human_dists_scatter.png",dpi=600)
Out[53]:
Does the limited sensitivity of homology search algorithms affect the age-call? Prediction of Moyers and Zhang (2015,2016) is that quickly evolving proteins and short proteins will be called at a younger age incorrectly.
Takeaway from the below is that protein length and evolutionary rate may affect some of the proteins in the "Mammal" age class. So we cannot rule out that some of these are false negatives due to limited sensitivity in the initial homology search.
In [59]:
mouse_human.hist(bins=50,color='grey')
Out[59]:
In [62]:
lengths.hist(bins=50,color='grey')
plt.yscale("log")
In [36]:
conAges = pd.read_csv("../../Main/main_HUMAN.csv",index_col=0)["modeAge"]
conAges.head()
Out[36]:
In [37]:
statsAges = all_stats.join(conAges).sort("modeAge")
statsAges.head()
Out[37]:
In [44]:
ages = ["Cellular_organisms","Euk+Bac","Euk_Archaea","Eukaryota",
"Opisthokonta","Eumetazoa","Vertebrata","Mammalia"]
ageLengths = {}
for i in ages:
df = statsAges[statsAges["modeAge"] == i]
assert len(df) > 50 # just checkin'
ageLengths[i] = df["Protein Length"].mean()
In [45]:
ax = pd.Series(ageLengths,index=ages).plot(color='black')
plt.xticks(rotation=70)
ax.set_ylabel("Average Protein Length")
ax.set_title("Average Protein Lengths in each Age Category")
#plt.savefig("lengthsXage.svg")
Out[45]:
In [62]:
ageSims = {}
for i in ages:
df = statsAges[statsAges["modeAge"] == i]
assert len(df) > 50 # just checkin'
ageSims[i] = df["Hs-Mm_evoSim"].mean()
ax = pd.Series(ageSims,index=ages).plot(color='black')
plt.xticks(rotation=70)
ax.set_ylabel("Average Hs-Mm Similarity")
ax.set_title("Hs-Mm Evolutionary Similarity in each Age Category")
#plt.savefig("Hs-Mm_similarityXage.svg")
Out[62]:
In [56]:
# Create series for Mann-Whitney U-test
ageSimsSets = {}
for i in ages:
df = statsAges[statsAges["modeAge"] == i]
assert len(df) > 50 # just checkin'
ageSimsSets[i] = df["Hs-Mm_evoSim"]
ageSimsSets["Mammalia"].head()
Out[56]:
In [103]:
## Mann-Whitney U-test
f,p = stats.mannwhitneyu(ageSimsSets["Vertebrata"],ageSimsSets["Mammalia"])
print "p-val Mammals and Vertebrate classes have same mean: %s" % p
In [104]:
ageCatSimsDF = pd.DataFrame(ageSimsSets,columns=ages)
In [105]:
ax = ageCatSimsDF.boxplot(sym=".")
plt.ylim(20,110)
plt.xticks(rotation=70)
Out[105]:
In [ ]: