In [1]:

    
import pandas as pd
import numpy as np
from IPython.display import display
import csv
from collections import defaultdict
import matplotlib.pyplot as plt
df = pd.read_csv("../data/references/miniGWG_darth/shear100/sheared_bayes.txt", sep="\t", header=None)
df.head()









    



/export/scratch/miniconda3/envs/analysis_SHOGUN/lib/python3.5/site-packages/matplotlib/__init__.py:841: UserWarning: Found matplotlib configuration in ~/.matplotlib/. To conform with the XDG base directory standard, this configuration location has been deprecated on Linux, and the new location is now /home/grad00/hillm096/.config/matplotlib/. Please move your configuration there to ensure that matplotlib will continue to find it in the future.
  _get_xdg_config_dir())






    Out[1]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
    
  
  
    
      0
      k__Bacteria;p__Proteobacteria;c__Alphaproteoba...
      0
      1
      48
      20
      12
      241
      2
      92971
      93299
    
    
      1
      k__Bacteria;p__Deinococcus-Thermus;c__Deinococ...
      25
      0
      0
      0
      0
      25
      0
      88888
      89027
    
    
      2
      k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...
      0
      20
      45
      41
      29
      928
      0
      124865
      125928
    
    
      3
      k__Bacteria;p__Firmicutes;c__Erysipelotrichia;...
      56
      837
      0
      0
      304
      2
      0
      94217
      96069
    
    
      4
      k__Viruses;p__ssRNA_viruses;c__;o__;f__Virgavi...
      0
      0
      0
      0
      0
      0
      0
      129
      129



In [2]:

    
uniqueness_rate_per_level = np.zeros(8, dtype=float)
for i in range(1, 9):
    # Sum all of the columns
    colsums = df[i].sum()
    # Take the sum of those columns
    num_hits = colsums.sum()
    # Total number of possible hits
    total_hits = df[9].sum()
    # Uniqueness Rate
    uniqueness_rate_per_level[i-1] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, uniqueness_rate_per_level)))









    





[('kingdom', 0.00024427088981178973),
 ('phylum', 0.00045203286459036219),
 ('class', 0.00037099261803100767),
 ('order', 0.0013770542337594339),
 ('family', 0.0045528001044508127),
 ('genus', 0.010950355120068945),
 ('species', 0.0039106679441064454),
 ('strain', 0.87604409173555464)]



In [3]:

    
# Sum all of the columns
colsums = df.iloc[:,1:9].sum()
# Take the sum of those columns
num_hits = colsums.sum()
# Total number of possible hits
total_hits = df[9].sum()
# Uniqueness Rate
num_hits/total_hits









    Out[3]:





0.89790226551037344

The kmer size is too large. For this experiment, we sheared 100 bps with a step size of 50. They just aren't unique enough at the strain level. Almost .87% of these reads are unique at the strain level. If we look at all levels, the uniqueness gets distributed only to 89%, with the highest uniquness being at the genus level with 1%.

Is it feasible for use to decrease the size of the kmers to 32bps? Or maybe even to 16bps?



In [4]:

    
# Get the size of the files
files_in_darth = !ls -alh "../data/references/miniGWG_darth/"
display(files_in_darth)









    





['total 34G',
 'drwxrwx--- 4 hillm096 knightslab  401 Jul  3 13:05 .',
 'drwxrwx--- 6 hillm096 knightslab  139 Jun 30 10:46 ..',
 '-rw-r----- 1 hillm096 knightslab 7.6G Jul  3 12:19 miniGWG_darth_15.but',
 '-rw-r----- 1 hillm096 knightslab 1.3M Jul  3 12:19 miniGWG_darth_15.but.log',
 '-rw-r----- 1 hillm096 knightslab  13G Jul  2 08:45 miniGWG_darth_15.fna',
 '-rw-r----- 1 hillm096 knightslab 7.9G Jun 22 16:23 miniGWG_darth_15.gg.utr',
 '-rw-r----- 1 hillm096 knightslab 1.5M Jun 22 16:23 miniGWG_darth_15.gg.utr.gg.log',
 '-rw-r----- 1 hillm096 knightslab  24M Jun 22 15:06 miniGWG_darth_15.tax',
 'lrwxrwxrwx 1 hillm096 knightslab   40 Apr 25 17:45 miniGWG_darth.acx -> /project/flatiron2/sop/miniGWG_darth.acx',
 'lrwxrwxrwx 1 hillm096 knightslab   40 Apr 25 17:45 miniGWG_darth.edx -> /project/flatiron2/sop/miniGWG_darth.edx',
 '-rw-r----- 1 hillm096 knightslab  16M Jun 22 15:09 miniGWG_darth.tax',
 'drwxr-x--- 2 hillm096 knightslab  496 Jun 30 10:47 shear100',
 'drwxr-x--- 2 hillm096 knightslab  182 Jun 30 10:49 shear32']

So it would be ~200GB sheared file with a 32bps and 16bps sliding window. Is this feasible with embalmer?



In [5]:

    
df_32 = pd.read_csv("../data/references/miniGWG_darth/shear32/sheared_bayes.txt", sep="\t", header=None)
uniqueness_rate_per_level = np.zeros(8, dtype=float)
for i in range(1, 9):
    # Sum all of the columns
    colsums = df_32[i].sum()
    # Take the sum of those columns
    num_hits = colsums.sum()
    # Total number of possible hits
    total_hits = df_32[9].sum()
    # Uniqueness Rate
    uniqueness_rate_per_level[i-1] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, uniqueness_rate_per_level)))









    





[('kingdom', 0.0013420657585268855),
 ('phylum', 0.0011408068226012689),
 ('class', 0.001393227253969136),
 ('order', 0.001940149641640013),
 ('family', 0.0064696347476648252),
 ('genus', 0.014771946859077095),
 ('species', 0.0041695190973051882),
 ('strain', 0.68654350753079096)]



In [6]:

    
# Distribution of uniqueness
plt.hist(df_32[8]/df_32[9])
plt.show()



In [7]:

    
plt.hist(df[8]/df[9])
plt.show()



In [8]:

    
low_uniqueness = df[df[8]/df[9] <= .01]
low_uniqueness[0]









    Out[8]:





14       k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...
17       k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
25       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
27       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
30       k__Bacteria;p__Proteobacteria;c__Betaproteobac...
37       k__Bacteria;p__Proteobacteria;c__Alphaproteoba...
38       k__Bacteria;p__Proteobacteria;c__Deltaproteoba...
44       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
50       k__Bacteria;p__Firmicutes;c__Clostridia;o__The...
63       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
70       k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
100      k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
107      k__Bacteria;p__Proteobacteria;c__Betaproteobac...
125      k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
131      k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...
137      k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...
139      k__Bacteria;p__Actinobacteria;c__Actinobacteri...
149      k__Bacteria;p__Actinobacteria;c__Actinobacteri...
155      k__Bacteria;p__Actinobacteria;c__Actinobacteri...
171      k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
176      k__Bacteria;p__Cyanobacteria;c__;o__Nostocales...
181      k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
192      k__Bacteria;p__Proteobacteria;c__Betaproteobac...
193      k__Bacteria;p__Actinobacteria;c__Coriobacterii...
195      k__Archaea;p__Euryarchaeota;c__Methanococci;o_...
196      k__Bacteria;p__Proteobacteria;c__Alphaproteoba...
197      k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
204      k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
210      k__Bacteria;p__Firmicutes;c__Clostridia;o__The...
211      k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
                               ...                        
5080     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
5090     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
5161     k__Viruses;p__unclassified_bacterial_viruses;c...
5362     k__Viruses;p__unclassified_bacterial_viruses;c...
5363     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
5519     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
5806     k__Viruses;p__ssRNA_viruses;c__;o__;f__Flavivi...
5826     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
5827     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
5992     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
6037     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
6210     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
6548     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
6757     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
6759     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
7818     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
7900     k__Viruses;p__ssRNA_viruses;c__;o__Picornavira...
7934     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
7958     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
8255     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
8540     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
8622     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
9340     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
9982     k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
10040    k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
10071    k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
10134    k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
10215    k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
10234    k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
10236    k__Viruses;p__dsDNA_viruses,_no_RNA_stage;c__;...
Name: 0, dtype: object



In [9]:

    
from collections import Counter
kingdom_freq_hi = Counter([_.split(';')[0] for _ in df[0]])
kingdom_freq_low = Counter([_.split(';')[0] for _ in low_uniqueness[0]])
print(kingdom_freq_hi)
print(kingdom_freq_low)
for key, value in kingdom_freq_hi.items():
    print("%s\t%f" % (key, kingdom_freq_low[key]/np.float(value)))









    



Counter({'k__Viruses': 7131, 'k__Bacteria': 2903, 'k__Archaea': 161, 'k__Viroids': 46})
Counter({'k__Bacteria': 264, 'k__Viruses': 62, 'k__Archaea': 7})
k__Archaea	0.043478
k__Bacteria	0.090940
k__Viruses	0.008694
k__Viroids	0.000000



In [10]:

    
# Sum all of the columns
colsums = df_32.iloc[:,1:9].sum()
# Take the sum of those columns
num_hits = colsums.sum()
# Total number of possible hits
total_hits = df_32[9].sum()
# Uniqueness Rate
num_hits/total_hits









    Out[10]:





0.71777085771157545



In [11]:

    
# hits_dict = defaultdict(list)
# with open("../data/references/miniGWG_darth/shear32/embalmer_align.b6") as inf:
#     csv_inf = csv.reader(inf, delimiter="\t")
#     for row in csv_inf:
#         if 't__' in row[-1]:
#             hits_dict[row[-1]].append(int(row[8]))

# with open("./hits_dict.keys", "w") as outf:
#     for key, value in hits_dict.items():



In [12]:

    
#len(list(hits_dict.keys()))



In [13]:

    
#darth = pd.read_csv("../data/references/miniGWG_darth/shear100/embalmer_otutable.fixed.txt", sep="\t", index_col=0)



In [25]:

    
import csv
# with open("../data/references/miniGWG_darth/shear100/embalmer_otutable.fixed.txt") as inf:
#     csv_inf = csv.reader(inf, delimiter="\t")
#     columns = next(csv_inf)
#     columns = dict(zip(columns[1:], range(len(columns))))
#     indptr = [0]
#     indices = np.array([], dtype=int)
#     data = np.array([], dtype=int)
#     names = []
#     for ix, row in enumerate(csv_inf):
#         if ix % 1000 == 0:
#             print(ix)
#         names.append(row[0])
#         np_row = np.array(row[1:], dtype=int)
#         temp_indx = [np_row > 0]
#         data = np.concatenate((data, np_row[temp_indx]))
#         indices = np.concatenate((indices, np.where(temp_indx)[1]))
#         indptr.append(indices.shape[0])



In [30]:

    
from scipy.sparse import csr_matrix
# csr = csr_matrix((data, indices, indptr), dtype=int)

def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

# save_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_otutable.fixed.npz", csr)



In [67]:

    
# load csr
with open("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.txt") as inf:
    csv_inf = csv.reader(inf, delimiter="\t")
    columns = next(csv_inf)
    columns = dict(zip(columns[1:], range(len(columns))))
    indptr = [0]
    indices = np.array([], dtype=int)
    data = np.array([], dtype=int)
    names = []
    for ix, row in enumerate(csv_inf):
        if ix % 1000 == 0:
            print(ix)
        names.append(row[0])
        np_row = np.array(row[1:], dtype=int)
        temp_indx = [np_row > 0]
        data = np.concatenate((data, np_row[temp_indx]))
        indices = np.concatenate((indices, np.where(temp_indx)[1]))
        indptr.append(indices.shape[0])

taxatable_csr = csr_matrix((data, indices, indptr), dtype=int)
save_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.npz", taxatable_csr)
np.savez("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.rows.npz", np.array(names))
np.savez("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.columns.npz", np.array(sorted(columns, key=columns.get)))









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-67-3198e0cc862e> in <module>()
     21 save_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.npz", taxatable_csr)
     22 np.savez("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.rows.npz", np.array(names))
---> 23 np.savez("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.columns.npz", np.array(sorted(columns, key=columns.get)))

AttributeError: 'numpy.ndarray' object has no attribute 'get'



In [155]:

    
csr = load_sparse_csr("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.npz")
rows = np.load("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.rows.npz")['arr_0']
rows_dict = dict(zip(rows, range(len(rows))))
columns = np.load("../data/references/miniGWG_darth/shear100/embalmer_taxatable.fixed.columns.npz")['arr_0']
columns_dict = dict(zip(columns, range(len(columns))))



In [156]:

    
print(csr.shape)









    



(12098, 100880)



In [207]:

    
np.sum(csr[rows_dict['k__Bacteria']])









    Out[207]:





4435



In [158]:

    
df.shape









    Out[158]:





(10241, 10)



In [134]:

    
# Sum all of the columns
colsums = df.iloc[:,1:9].sum()
# Take the sum of those columns
num_hits = colsums.sum()
# Total number of possible hits
total_hits = df[9].sum()
# Uniqueness Rate
np.sum(csr)









    Out[134]:





272730016



In [260]:

    
unique_count = np.zeros(8)
for row, name in zip(csr, rows):
    level = name.count(';')
    unique_count[level] += np.sum(row)
    
print(np.sum(csr, axis=1).mean())









    



22543.3969251



In [261]:

    
unique_count.sum()









    Out[261]:





272730016.0



In [262]:

    
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, unique_count/np.sum(unique_count))))









    





[('kingdom', 0.00059829131532042291),
 ('phylum', 0.00055417075911439102),
 ('class', 0.00032945401946516951),
 ('order', 0.0012051552110787834),
 ('family', 0.0046517285431464939),
 ('genus', 0.0083225309531019861),
 ('species', 0.37079631161683357),
 ('strain', 0.6135423575819392)]



In [263]:

    
# Load up the accession2taxonomy
with open("../data/references/miniGWG_darth/miniGWG_darth.tax") as inf:
    csv_inf = csv.reader(inf, delimiter='\t')
    name2taxonomy = dict(csv_inf)
# Transpose to be samples by indices
csr = csr.T



In [264]:

    
# Build the BayesMat
tax2index = np.unique(list(name2taxonomy.values()))
bayes_mat = np.zeros((tax2index.shape[0], 9), dtype=int)
tax2index = dict(zip(tax2index, range(tax2index.shape[0])))
row_names = []
for key, value in columns_dict.items():
    key = key.split('.')
    key = key[0] + "_" + key[1] + "." + key[2]
    tax = name2taxonomy[key]
    values = csr.getrow(value)
    names = rows[values.indices]
    row_names.append(tax)
    for name, value in zip(names, values.data):
        if not name is "":
            bayes_mat[tax2index[tax], name.count(';')] += value
    bayes_mat[tax2index[tax], 8] += np.sum(values)



In [265]:

    
df_emb_100 = pd.DataFrame(bayes_mat, index=sorted(tax2index, key=tax2index.get))



In [266]:

    
df_emb_100.iloc[:, 8].sum()









    Out[266]:





272730016



In [267]:

    
uniqueness_rate_per_level = np.zeros(8, dtype=float)
for i in range(0, 8):
    # Take the sum of those columns
    num_hits =  df_emb_100.iloc[:, i].sum()
    # Total number of possible hits
    total_hits = df_emb_100.iloc[:, 8].sum()
    # Uniqueness Rate
    uniqueness_rate_per_level[i] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
display(list(zip(levels, uniqueness_rate_per_level)))









    





[('kingdom', 0.00059829131532042291),
 ('phylum', 0.00055417075911439102),
 ('class', 0.00032945401946516951),
 ('order', 0.0012051552110787834),
 ('family', 0.0046517285431464939),
 ('genus', 0.0083225309531019861),
 ('species', 0.37079631161683357),
 ('strain', 0.6135423575819392)]



In [268]:

    
read_counts_100 = pd.read_csv('../data/references/miniGWG_darth/shear100/sheared_read_counts.txt', sep="\t", header=None)



In [270]:

    
print(df.shape)
print(df_emb_100.shape)
df_emb_100.to_csv('../data/references/miniGWG_darth/shear100/sheared_bayes.fixed.txt', sep="\t", header=None)









    



(10241, 10)
(10635, 9)



In [304]:

    
# Distribution of uniqueness
plt.hist(df_emb_100.iloc[:, 1:8].sum(axis=1)/df_emb_100[8])
plt.show()



In [301]:

    
low_uniqueness = df_emb_100[df_emb_100.iloc[:, :8].sum(axis=1)/df_emb_100[8] < .01]

kingdom_freq_low = Counter([_.split(';')[0] for _ in low_uniqueness.index])
kingdom_freq_all = Counter([_.split(';')[0] for _ in df_emb_100.index])
print(kingdom_freq_low)
print(kingdom_freq_all)
for key, value in kingdom_freq_all.items():
    print("%s\t%f" % (key, kingdom_freq_low[key]/np.float(value)))









    



Counter()
Counter({'k__Viruses': 7133, 'k__Bacteria': 3290, 'k__Archaea': 166, 'k__Viroids': 46})
k__Archaea	0.000000
k__Bacteria	0.000000
k__Viruses	0.000000
k__Viroids	0.000000

	0	1	2	3	4	5	6	7	8	9
0	k__Bacteria;p__Proteobacteria;c__Alphaproteoba...	0	1	48	20	12	241	2	92971	93299
1	k__Bacteria;p__Deinococcus-Thermus;c__Deinococ...	25	0	0	0	0	25	0	88888	89027
2	k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...	0	20	45	41	29	928	0	124865	125928
3	k__Bacteria;p__Firmicutes;c__Erysipelotrichia;...	56	837	0	0	304	2	0	94217	96069
4	k__Viruses;p__ssRNA_viruses;c__;o__;f__Virgavi...	0	0	0	0	0	0	0	129	129