In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
# Read the table; the first CSV column becomes the row index.
# NOTE(review): later cells look rows up by index in outlierD and treat the
# columns as trimmable "algorithms" -- presumably genes x algorithms; confirm.
lossTaxa = pd.read_csv(
    "lossTaxa_HUMAN.csv",
    index_col=0,
)
lossTaxa.head(n=5)
Out[2]:
In [3]:
# Map genes to trimmed algorithms
# outlierD maps the first CSV field of each data row to the whitespace-split
# tokens of its fourth field. Rows whose fourth field is empty are left out.
outlierD = {}
with open("lossStats_HUMAN.csv") as f:
    f.readline()  # skip header
    for line in f:
        fields = line.strip().split(",")
        # A blank or short row (e.g. a trailing empty line) has no fourth
        # field at all; treat it like an empty one instead of raising
        # IndexError.
        if len(fields) < 4 or fields[3] == '':
            continue
        outlierD[fields[0]] = fields[3].split()
# Show one entry as a sanity check. dict.keys() cannot be indexed on
# Python 3, so take the first key through an iterator (also valid on
# Python 2).
outlierD[next(iter(outlierD))]
Out[3]:
In [4]:
# Per-row mean over the columns of lossTaxa, leaving out whatever outlierD
# lists for that row. dbsTrimmed counts how many entries were listed in total.
# NOTE(review): outlierD keys are strings read from a CSV; if read_csv parsed
# the lossTaxa index as integers the lookup below never matches -- confirm.
dbsTrimmed = 0

# Columns added further down in this notebook must not feed back into the mean
# when this cell is re-run on an already-augmented frame.
derived_cols = ("Avg", "HGT_flag")
all_dbs = [col for col in lossTaxa.columns if col not in derived_cols]

avg_by_row = {}
for index, row in lossTaxa.iterrows():
    trimmed = outlierD.get(index, ())
    dbsTrimmed += len(trimmed)
    if trimmed:
        dbs = [col for col in all_dbs if col not in trimmed]
    else:
        dbs = all_dbs
    avg_by_row[index] = row[dbs].mean()  # only include algorithms that have not been trimmed

# Build the Series once with an explicit dtype: a bare pd.Series() has an
# ambiguous/deprecated default dtype, and growing it label by label is slow.
avgs = pd.Series(avg_by_row, dtype=float)
dbsTrimmed
Out[4]:
In [5]:
# Attach the per-row averages as column "Avg" (aligned on the row index),
# then preview the result.
lossTaxa = lossTaxa.assign(Avg=avgs)
lossTaxa.head(n=5)
Out[5]:
In [6]:
def floatRange(start, stop, step):
    """Yield start, start + step, start + 2*step, ... up to and including stop.

    Each value is computed as ``start + k * step`` rather than by repeated
    addition, so rounding error does not accumulate and an endpoint that is
    a whole number of steps away is not silently dropped.

    Parameters:
        start: first value produced.
        stop: inclusive upper bound; nothing is produced if start > stop.
        step: positive increment.

    Raises:
        ValueError: if step is not positive (the running-sum version would
            loop forever in that case).
    """
    if step <= 0:
        raise ValueError("step must be positive")
    if start > stop:
        return
    # Small tolerance so that e.g. 0.95 / 0.05 counts as 19 whole steps even
    # when the float quotient lands just below 19.
    n_steps = int((stop - start) / step + 1e-9)
    for k in range(n_steps + 1):
        # Clamp so a value that overshoots by one ulp never exceeds stop.
        yield min(start + k * step, stop)

# With the old running-sum generator, floatRange(0,1,.05) stopped near 0.95:
# twenty additions of 0.05 give 1.0000000000000002, which fails `<= 1`.
# The next cell reads quantiles.iloc[-1] as the top listed quantile, so request
# 0.00 .. 0.95 explicitly and keep the same 20 quantiles.
quantile_steps = list(floatRange(0, .95, .05))
quantiles = lossTaxa["Avg"].quantile(quantile_steps)
quantiles
Out[6]:
In [7]:
# Value of the last (highest) quantile computed above.
# NOTE(review): presumably the source of the 24.92 cut-off hard-coded in the
# following cells -- confirm.
quantiles.iloc[len(quantiles) - 1]
Out[7]:
In [8]:
# Histogram of the "Avg" column with a vertical marker at the 24.92 cut-off.
ax = lossTaxa["Avg"].hist(bins=50, color='grey')
bline = ax.axvline(24.92, color='black', label="95th percentile")
ax.legend()
#plt.savefig("AvgLossTaxa_distribution.svg")
In [9]:
# Flag rows whose "Avg" is at or above the 24.92 cut-off (the value the
# histogram cell labels "95th percentile").
at_or_above_cutoff = lossTaxa["Avg"] >= 24.92
lossTaxa["HGT_flag"] = at_or_above_cutoff
lossTaxa.head(n=5)
Out[9]:
In [9]:
# Write the frame, including its row index, to HGTFlag_HUMAN.csv.
lossTaxa.to_csv(path_or_buf="HGTFlag_HUMAN.csv", index=True)
In [ ]: