In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import csv
from collections import defaultdict
import matplotlib.pyplot as plt
df = pd.read_csv("../data/references/miniGWG_darth/shear100/sheared_bayes.txt", sep="\t", header=None)
df.head()
Out[1]:
In [2]:
uniqueness_rate_per_level = np.zeros(8, dtype=float)
# Total number of possible hits (column 9)
total_hits = df[9].sum()
for i in range(1, 9):
    # Number of unique hits at this taxonomic level (column i)
    num_hits = df[i].sum()
    # Uniqueness rate at this level
    uniqueness_rate_per_level[i-1] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, uniqueness_rate_per_level)))
In [3]:
# Sum each per-level hit column (columns 1-8)
colsums = df.iloc[:, 1:9].sum()
# Total number of unique hits across all levels
num_hits = colsums.sum()
# Total number of possible hits
total_hits = df[9].sum()
# Overall uniqueness rate
num_hits/total_hits
Out[3]:
The kmer size is too large. For this experiment we sheared 100 bp reads with a step size of 50, and they just aren't unique enough at the strain level: almost .87% of these reads are unique at the strain level. Looking across all levels, the uniqueness only adds up to 89%, with the highest per-level uniqueness at the genus level, at 1%.
Is it feasible for us to decrease the kmer size to 32 bp, or maybe even to 16 bp?
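As a sanity check, the per-level loop above can be collapsed into one vectorized expression. This is only a minimal sketch of the same calculation, assuming the column layout used above (columns 1-8 hold the per-level unique hit counts, column 9 the total possible hits):
# Vectorized per-level uniqueness: each per-level hit column divided by the total hits
per_level_rates = df.iloc[:, 1:9].sum() / df[9].sum()
display(list(zip(levels, per_level_rates)))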
In [4]:
# Get the size of the files
files_in_darth = !ls -alh "../data/references/miniGWG_darth/"
display(files_in_darth)
So the sheared file would be ~200 GB with a 32 bp or 16 bp sliding window. Is this feasible with embalmer?
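For a rough check on numbers like this, the size of a sheared reference is roughly (number of windows) x (window length), which works out to about window/step times the reference size. The sketch below is illustrative only: sheared_bases and reference_bp are hypothetical placeholders (not the actual miniGWG size), it assumes the step stays at half the window as in the 100/50 shear, and it ignores FASTA headers and per-contig edge effects.
def sheared_bases(reference_bp, window, step):
    # Approximate number of sliding windows across the reference (edge effects ignored)
    n_windows = (reference_bp - window) // step + 1
    # Each window emits `window` bases of sequence
    return n_windows * window

# Hypothetical reference size, purely for illustration
reference_bp = 10e9
for window, step in [(100, 50), (32, 16), (16, 8)]:
    ratio = sheared_bases(reference_bp, window, step) / reference_bp
    print("window %d, step %d: ~%.1fx the reference size" % (window, step, ratio))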
In [5]:
df_32 = pd.read_csv("../data/references/miniGWG_darth/shear32/sheared_bayes.txt", sep="\t", header=None)
uniqueness_rate_per_level = np.zeros(8, dtype=float)
# Total number of possible hits (column 9)
total_hits = df_32[9].sum()
for i in range(1, 9):
    # Number of unique hits at this taxonomic level (column i)
    num_hits = df_32[i].sum()
    # Uniqueness rate at this level
    uniqueness_rate_per_level[i-1] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, uniqueness_rate_per_level)))
In [6]:
# Distribution of strain-level uniqueness (column 8 / column 9) for the 32 bp shear
plt.hist(df_32[8]/df_32[9])
plt.show()
In [7]:
# Same distribution for the 100 bp shear
plt.hist(df[8]/df[9])
plt.show()
In [8]:
# Taxa whose strain-level uniqueness is at most 1%
low_uniqueness = df[df[8]/df[9] <= .01]
low_uniqueness[0]
Out[8]:
In [9]:
from collections import Counter
# Kingdom-level counts for all taxa vs. the low-uniqueness subset
kingdom_freq_hi = Counter([_.split(';')[0] for _ in df[0]])
kingdom_freq_low = Counter([_.split(';')[0] for _ in low_uniqueness[0]])
print(kingdom_freq_hi)
print(kingdom_freq_low)
# Fraction of each kingdom that falls in the low-uniqueness subset
for key, value in kingdom_freq_hi.items():
    print("%s\t%f" % (key, kingdom_freq_low[key]/float(value)))
In [10]:
# Sum each per-level hit column (columns 1-8)
colsums = df_32.iloc[:, 1:9].sum()
# Total number of unique hits across all levels
num_hits = colsums.sum()
# Total number of possible hits
total_hits = df_32[9].sum()
# Overall uniqueness rate
num_hits/total_hits
Out[10]:
In [11]:
# hits_dict = defaultdict(list)
# with open("../data/references/miniGWG_darth/shear32/embalmer_align.b6") as inf:
# csv_inf = csv.reader(inf, delimiter="\t")
# for row in csv_inf:
# if 't__' in row[-1]:
# hits_dict[row[-1]].append(int(row[8]))
# with open("./hits_dict.keys", "w") as outf:
# for key, value in hits_dict.items():
In [12]:
#len(list(hits_dict.keys()))
In [13]:
#darth = pd.read_csv("../data/references/miniGWG_darth/shear100/embalmer_otutable.fixed.txt", sep="\t", index_col=0)
In [25]:
import csv
# with open("../data/references/miniGWG_darth/shear100/embalmer_otutable.fixed.txt") as inf:
# csv_inf = csv.reader(inf, delimiter="\t")
# columns = next(csv_inf)
# columns = dict(zip(columns[1:], range(len(columns))))
# indptr = [0]
# indices = np.array([], dtype=int)
# data = np.array([], dtype=int)
# names = []
# for ix, row in enumerate(csv_inf):
# if ix % 1000 == 0:
# print(ix)
# names.append(row[0])
# np_row = np.array(row[1:], dtype=int)
# temp_indx = [np_row > 0]
# data = np.concatenate((data, np_row[temp_indx]))
# indices = np.concatenate((indices, np.where(temp_indx)[1]))
# indptr.append(indices.shape[0])
In [30]:
from scipy.sparse import csr_matrix
# csr = csr_matrix((data, indices, indptr), dtype=int)

def save_sparse_csr(filename, array):
    # Save a CSR matrix as an .npz archive of its raw components
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    # Rebuild a CSR matrix from an .npz archive written by save_sparse_csr
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

# save_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_otutable.fixed.npz", csr)
In [67]:
# Build a sparse matrix (taxa x samples) from the taxatable and save it to disk
with open("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.txt") as inf:
    csv_inf = csv.reader(inf, delimiter="\t")
    columns = next(csv_inf)
    columns = dict(zip(columns[1:], range(len(columns))))
    indptr = [0]
    indices = np.array([], dtype=int)
    data = np.array([], dtype=int)
    names = []
    for ix, row in enumerate(csv_inf):
        if ix % 1000 == 0:
            print(ix)
        names.append(row[0])
        np_row = np.array(row[1:], dtype=int)
        # Keep only the non-zero counts in this row
        temp_indx = np_row > 0
        data = np.concatenate((data, np_row[temp_indx]))
        indices = np.concatenate((indices, np.where(temp_indx)[0]))
        indptr.append(indices.shape[0])
taxatable_csr = csr_matrix((data, indices, indptr), dtype=int)
save_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.npz", taxatable_csr)
np.savez("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.rows.npz", np.array(names))
np.savez("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.columns.npz", np.array(sorted(columns, key=columns.get)))
In [155]:
# Reload the sparse taxatable and its row/column labels
csr = load_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.npz")
rows = np.load("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.rows.npz")['arr_0']
rows_dict = dict(zip(rows, range(len(rows))))
columns = np.load("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.columns.npz")['arr_0']
columns_dict = dict(zip(columns, range(len(columns))))
In [156]:
print(csr.shape)
In [207]:
np.sum(csr[rows_dict['k__Bacteria']])
Out[207]:
In [158]:
df.shape
Out[158]:
In [134]:
# Recompute the totals from the sheared_bayes table for comparison
colsums = df.iloc[:, 1:9].sum()
num_hits = colsums.sum()
total_hits = df[9].sum()
# Total counts in the sparse taxatable
np.sum(csr)
Out[134]:
In [260]:
# Count hits per taxonomic level, using the depth of each taxonomy string
unique_count = np.zeros(8)
for row, name in zip(csr, rows):
    level = name.count(';')
    unique_count[level] += np.sum(row)
print(np.sum(csr, axis=1).mean())
In [261]:
unique_count.sum()
Out[261]:
In [262]:
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, unique_count/np.sum(unique_count))))
In [263]:
# Load the accession-to-taxonomy mapping
with open("../data/references/miniGWG_darth/miniGWG_darth.tax") as inf:
    csv_inf = csv.reader(inf, delimiter='\t')
    name2taxonomy = dict(csv_inf)
# Transpose so that rows are samples and columns are taxa
csr = csr.T
In [264]:
# Build the BayesMat: rows are taxa, columns 0-7 are per-level hit counts, column 8 is total hits
tax2index = np.unique(list(name2taxonomy.values()))
bayes_mat = np.zeros((tax2index.shape[0], 9), dtype=int)
tax2index = dict(zip(tax2index, range(tax2index.shape[0])))
row_names = []
for key, value in columns_dict.items():
    # Convert the sample name back to its accession to look up its taxonomy
    key = key.split('.')
    key = key[0] + "_" + key[1] + "." + key[2]
    tax = name2taxonomy[key]
    values = csr.getrow(value)
    names = rows[values.indices]
    row_names.append(tax)
    for name, count in zip(names, values.data):
        if name != "":
            bayes_mat[tax2index[tax], name.count(';')] += count
    bayes_mat[tax2index[tax], 8] += np.sum(values)
In [265]:
df_emb_100 = pd.DataFrame(bayes_mat, index=sorted(tax2index, key=tax2index.get))
In [266]:
df_emb_100.iloc[:, 8].sum()
Out[266]:
In [267]:
uniqueness_rate_per_level = np.zeros(8, dtype=float)
# Total number of possible hits (column 8)
total_hits = df_emb_100.iloc[:, 8].sum()
for i in range(0, 8):
    # Number of unique hits at this taxonomic level
    num_hits = df_emb_100.iloc[:, i].sum()
    # Uniqueness rate at this level
    uniqueness_rate_per_level[i] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, uniqueness_rate_per_level)))
In [268]:
read_counts_100 = pd.read_csv('../data/references/miniGWG_darth/shear100/sheared_read_counts.txt', sep="\t", header=None)
In [270]:
print(df.shape)
print(df_emb_100.shape)
df_emb_100.to_csv('../data/references/miniGWG_darth/shear100/sheared_bayes.fixed.txt', sep="\t", header=None)
In [304]:
# Distribution of uniqueness
plt.hist(df_emb_100.iloc[:, 1:8].sum(axis=1)/df_emb_100[8])
plt.show()
In [301]:
# Taxa with less than 1% overall uniqueness, broken down by kingdom
low_uniqueness = df_emb_100[df_emb_100.iloc[:, :8].sum(axis=1)/df_emb_100[8] < .01]
kingdom_freq_low = Counter([_.split(';')[0] for _ in low_uniqueness.index])
kingdom_freq_all = Counter([_.split(';')[0] for _ in df_emb_100.index])
print(kingdom_freq_low)
print(kingdom_freq_all)
# Fraction of each kingdom that falls in the low-uniqueness subset
for key, value in kingdom_freq_all.items():
    print("%s\t%f" % (key, kingdom_freq_low[key]/float(value)))