In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
False positives are defined as algorithms that, for a given gene, infer an outsized number of losses for that orthogroup.
My programs output, for each gene, the mean number of taxa that the algorithms inferred to have lost the orthogroup, and the variance of this number.
They also identify algorithms that infer losses in an outsized number of taxa (2 standard deviations above the mean). Any such algorithms are listed in the outliers column.
In [2]:
# Loss statistics table; the first CSV column is used as the row index.
stats2 = pd.read_csv(
    "lossStats_HUMAN.csv",
    index_col=0,
)
In [3]:
# Only "outliers" needs a fill value: rows with no entry get the placeholder 0,
# which later cells compare against. Missing "mean"/"variance" values stay NaN
# (filling NaN with NaN changes nothing), so those columns are not listed.
# Reassigning instead of inplace=True keeps the cell free of in-place mutation.
stats2 = stats2.fillna({"outliers": 0})
stats2.head()
Out[3]:
In [8]:
# Histogram of the "variance" column over every row of stats2.
ax = stats2.loc[:, "variance"].hist(
    color='grey',
    bins=50,
)
ax.set_title("Variance histogram, all genes")
ax.set_ylabel("Number of genes")
#plt.savefig("variance_histogram.svg")
Out[8]:
In [9]:
# Keep only the rows whose "outliers" entry is not the 0 placeholder,
# then draw the histogram of "variance" for that subset.
stats_outliers = stats2.loc[stats2["outliers"].ne(0)]
ax = stats_outliers.loc[:, "variance"].hist(
    color='grey',
    bins=50,
)
ax.set_title("Variance, genes with outliers")
ax.set_ylabel("Number of genes")
Out[9]:
In [11]:
# Histogram of the "mean" column over every row of stats2.
ax = stats2.loc[:, "mean"].hist(
    color='grey',
    bins=50,
)
ax.set_title("Histogram of mean values, all genes")
ax.set_ylabel("Number of genes")
#plt.savefig("mean_histogram.svg")
Out[11]:
In [12]:
# Histogram of the "mean" column, restricted to the rows in stats_outliers.
ax = stats_outliers.loc[:, "mean"].hist(
    color='grey',
    bins=50,
)
ax.set_title("Mean, genes with outliers")
ax.set_ylabel("Number of genes")
Out[12]:
In [13]:
# Count the entries listed in "outliers" for each row. A cell is either a
# whitespace-separated string of names or the integer placeholder 0.
# str.split() with no argument is used rather than split(" ") so that repeated
# or trailing spaces do not produce empty tokens that inflate the count; this
# is the same tokenisation used when the names are tallied per algorithm.
stats2['numOutliers'] = stats2['outliers'].map(
    lambda names: 0 if names == 0 else len(names.split())
)
stats2.head()
Out[13]:
In [14]:
# How many rows have each distinct "numOutliers" value.
stats2.loc[:, "numOutliers"].value_counts()
Out[14]:
In [15]:
# Flatten every whitespace-separated token of the "outliers" column into one
# list, then tally how often each token occurs.
outlier_tokens = []
for entry in stats2["outliers"]:
    outlier_tokens.extend(str(entry).split())
FalsePos = pd.Series(outlier_tokens).value_counts()
# Drop the '0' token (don't care about these).
FalsePos = FalsePos.loc[FalsePos.index != '0']
FalsePos
Out[15]:
In [16]:
# LDO results table; the first CSV column is used as the row index.
ldos = pd.read_csv(
    "HUMAN_LDO_results.csv",
    index_col=0,
)
ldos.head()
Out[16]:
In [18]:
# Number of True entries in each column of ldos, largest first.
# This replaces the removed `.ix` indexer and `Series.sort(inplace=True)`, and
# the deprecated top-level `pd.value_counts`, so the cell runs on current
# pandas. Counting the True cells directly also yields 0 (instead of NaN or a
# KeyError) for a column that contains no True value.
FalseNeg = ldos.eq(True).sum().sort_values(ascending=False)
FalseNeg
Out[18]:
In [19]:
# Row order for the combined table; labels missing from either series
# end up as NaN after the reindex.
dbs = [
    "InParanoid",
    "InParanoidCore",
    "OMA_Groups",
    "OMA_Pairs",
    "PANTHER8_LDO",
    "RSD",
    "EggNOG",
    "Orthoinspector",
    "Hieranoid_2",
    "EnsemblCompara_v2",
    "Metaphors",
    "PhylomeDB",
    "PANTHER8_all",
]
# One column per error type, aligned on the labels of the two series.
errors = pd.DataFrame(
    {"FalsePositive": FalsePos, "FalseNegative": FalseNeg}
).reindex(dbs)
errors.head()
Out[19]:
In [20]:
# errors.to_csv("errors_byDatabase.csv")
In [21]:
# Side-by-side bars on twin y-axes: FalseNegative on the left axis,
# FalsePositive on the right, sharing the same x positions.
width = .35
fig, ax1 = plt.subplots()

errors["FalseNegative"].plot(
    ax=ax1, kind='bar', position=1, width=width, color='grey'
)
ax1.set_ylabel("Number Genes False Negative")

ax2 = ax1.twinx()
errors["FalsePositive"].plot(
    ax=ax2, kind='bar', position=0, width=width, color='black'
)
ax2.set_ylabel("Number Genes False Positive")

# Turn off grid lines on both axes of both Axes objects.
for axes in (ax1, ax2):
    axes.yaxis.grid(False)
    axes.xaxis.grid(False)
#plt.savefig("errors_byDatabase.svg")
In [23]:
# Scale each column of errors by its own column total.
normErrors = errors.div(errors.sum(), axis="columns")
# Per-row sum of the two scaled columns (NaN in either propagates).
normErrors["sumErrors"] = normErrors["FalsePositive"].add(normErrors["FalseNegative"])
# Rescale that sum by its total.
normErrors["normSum"] = normErrors["sumErrors"].div(normErrors["sumErrors"].sum())
normErrors.sum()
Out[23]:
In [24]:
# Bar chart of the "normSum" column, one bar per row of normErrors.
normErrors.loc[:, "normSum"].plot(color='grey', kind='bar')
#plt.savefig("totalErrors.svg")
Out[24]:
In [ ]: