In [104]:
import pandas as pd
import numpy as np
from IPython.display import display
import csv
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import os

if not os.path.isdir("/dev/shm/shearing"):
    os.makedirs("/dev/shm/shearing")

In [8]:
%%time
shear_results = "../data/references/rep82/shear_results.b6"
shear_results_fixed = "/dev/shm/shearing/shear_results.fixed.b6"
!sed 's/_/./1' {shear_results} > {shear_results_fixed}


CPU times: user 1min 6s, sys: 12.4 s, total: 1min 18s
Wall time: 10min 42s

In [12]:
%%time
otu_table = "/dev/shm/shearing/shear_results.otu.txt"
taxa_table = "/dev/shm/shearing/shear_results.taxatable.txt"
!embalmulate {shear_results_fixed} {otu_table}


Parsed 406474006 reads [12976 samples, 0 taxa, 12958 refs]. Collating...
CPU times: user 1min 15s, sys: 14.3 s, total: 1min 30s
Wall time: 15min 26s

In [119]:
with open(otu_table) as inf:
    csv_inf = csv.reader(inf, delimiter="\t")
    columns = next(csv_inf)
    columns = dict(zip(columns[1:], range(len(columns))))
    indptr = [0]
    indices = np.array([], dtype=int)
    data = np.array([], dtype=int)
    names = []
    for ix, row in enumerate(csv_inf):
        if ix % 1000 == 0:
            print(ix)
        names.append(row[0])
        np_row = np.array(row[1:], dtype=int)
        temp_indx = [np_row > 0]
        data = np.concatenate((data, np_row[temp_indx]))
        indices = np.concatenate((indices, np.where(temp_indx)[1]))
        indptr.append(indices.shape[0])


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000

In [120]:
from scipy.sparse import csr_matrix

# samples by taxonomy
csr = csr_matrix((data, indices, indptr), dtype=int).T

In [121]:
# Load up the accession2taxonomy
with open("../data/references/rep82/rep82.tax") as inf:
    csv_inf = csv.reader(inf, delimiter='\t')
    name2taxonomy = dict(csv_inf)

In [122]:
cols_tax = [name2taxonomy[name] for name in names]
rows_tax = [name2taxonomy[_.replace(".", "_", 1)] for _ in sorted(columns, key=columns.get)]

In [123]:
def index_lca(str1, str2):
    for i, (s1, s2) in enumerate(zip(str1.split(';'), str2.split(';'))):
        if s1 != s2:
            return i
    return 8

dat = np.zeros((len(rows_tax), 9), dtype=int)
for i, row_name in enumerate(rows_tax):
    row = csr.getrow(i)
    for j, indx in enumerate(row.indices):
        dat[i, index_lca(rows_tax[i], cols_tax[indx])] += row.data[j]

In [124]:
dat[:, 0].sum()


Out[124]:
193728

In [125]:
import pandas as pd
df = pd.DataFrame(dat, index=rows_tax)

In [126]:
df['sum'] = dat.sum(axis=1)

In [127]:
df.head()


Out[127]:
0 1 2 3 4 5 6 7 8 sum
k__Viruses;p__ssRNA viruses;c__ssRNA positive-strand viruses, no DNA stage;o__;f__;g__;s__Phytophthora infestans RNA virus 1;t__ 0 0 0 0 0 0 0 0 120 120
k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Planctomycetales;f__Gemmataceae;g__Gemmata;s__Gemmata obscuriglobus;t__Gemmata obscuriglobus UQM 2246 8 97 0 0 0 0 0 0 181382 181487
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Bacillus;s__Bacillus sp. FJAT-14578;t__ 1 3 0 0 16 20 139 0 111122 111301
k__Viruses;p__dsDNA viruses, no RNA stage;c__Caudovirales;o__;f__Podoviridae;g__;s__Bacillus phage Stitch;t__ 0 1 0 0 0 0 16 0 469 486
k__Viruses;p__ssRNA viruses;c__ssRNA positive-strand viruses, no DNA stage;o__;f__Virgaviridae;g__Tobamovirus;s__Hibiscus latent Singapore virus;t__ 0 0 0 0 0 0 0 0 129 129

In [128]:
df.drop(0, axis=1, inplace=True)

In [129]:
df.head()


Out[129]:
1 2 3 4 5 6 7 8 sum
k__Viruses;p__ssRNA viruses;c__ssRNA positive-strand viruses, no DNA stage;o__;f__;g__;s__Phytophthora infestans RNA virus 1;t__ 0 0 0 0 0 0 0 120 120
k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Planctomycetales;f__Gemmataceae;g__Gemmata;s__Gemmata obscuriglobus;t__Gemmata obscuriglobus UQM 2246 97 0 0 0 0 0 0 181382 181487
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Bacillus;s__Bacillus sp. FJAT-14578;t__ 3 0 0 16 20 139 0 111122 111301
k__Viruses;p__dsDNA viruses, no RNA stage;c__Caudovirales;o__;f__Podoviridae;g__;s__Bacillus phage Stitch;t__ 1 0 0 0 0 16 0 469 486
k__Viruses;p__ssRNA viruses;c__ssRNA positive-strand viruses, no DNA stage;o__;f__Virgaviridae;g__Tobamovirus;s__Hibiscus latent Singapore virus;t__ 0 0 0 0 0 0 0 129 129

In [130]:
df.to_csv("../data/references/rep82/sheared_bayes.txt", header=False, sep="\t")

In [131]:
uniqueness_rate_per_level = np.zeros(8, dtype=float)
for i in range(0, 8):
    # Take the sum of those columns
    num_hits =  df.iloc[:, i].sum()
    # Total number of possible hits
    total_hits = df['sum'].sum()
    # Uniqueness Rate
    uniqueness_rate_per_level[i] = num_hits/total_hits
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
list(zip(levels, uniqueness_rate_per_level))
print(uniqueness_rate_per_level.sum())


0.999523393877

In [132]:
# Distribution of uniqueness
plt.hist(df.iloc[:, 1:8].sum(axis=1)/df['sum'])
plt.show()



In [133]:
low_uniqueness = df[df.iloc[:, 1:8].sum(axis=1)/df['sum'] < 1.1]
print(low_uniqueness)
kingdom_freq_low = Counter([_.split(';')[0] for _ in low_uniqueness.index])
print(kingdom_freq_low)


                                                      1   2    3    4      5  \
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...    0   0    0    0      0   
k__Bacteria;p__Planctomycetes;c__Planctomycetia...   97   0    0    0      0   
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacilla...    3   0    0   16     20   
k__Viruses;p__dsDNA viruses, no RNA stage;c__Ca...    1   0    0    0      0   
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...    0   0    0    0      0   
k__Viruses;p__dsRNA viruses;c__Totiviridae;o__;...    0   0    0    0      0   
k__Viruses;p__dsRNA viruses;c__unclassified dsR...    0   0    0    0      0   
k__Viruses;p__unclassified viruses;c__Torulaspo...    0   0    0    0      0   
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...    0   0    0    0      0   
k__Bacteria;p__Proteobacteria;c__Epsilonproteob...   13   0    0    0     10   
k__Bacteria;p__Proteobacteria;c__Gammaproteobac...    0   0    4    0      0   
k__Bacteria;p__Firmicutes;c__Clostridia;o__Ther...    0   4    1    2    203   
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clos...    1   1   13   32     39   
k__Bacteria;p__Actinobacteria;c__Actinobacteria...   31   0    7    0      5   
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacilla...    0   1    3    8      0   
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...    0   0    0    0      0   
k__Viruses;p__dsDNA viruses, no RNA stage;c__He...    0   0    0    0      0   
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacilla...    0   0    5   17     22   
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...    0   0    0    0      0   
k__Viruses;p__dsDNA viruses, no RNA stage;c__He...    0   0    0    0      0   
k__Bacteria;p__Bacteroidetes;c__;o__Bacteroidet...    0   0    0    0      0   
k__Bacteria;p__Tenericutes;c__Mollicutes;o__Myc...    0   0    0    0      0   
k__Bacteria;p__Actinobacteria;c__Actinobacteria...    0   0    5    0      4   
k__Bacteria;p__Actinobacteria;c__Actinobacteria...    0   0   17    0      4   
k__Bacteria;p__Deinococcus-Thermus;c__Deinococc...    0   0    0    0      0   
k__Bacteria;p__Firmicutes;c__Clostridia;o__Ther...   46   2  621    1      1   
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...    0   0    0    0      0   
k__Bacteria;p__Proteobacteria;c__Gammaproteobac...  109  26  356  453  35598   
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clos...    0   1    0   20      0   
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...    0   0    0    0      0   
...                                                 ...  ..  ...  ...    ...   
k__Viroids;p__;c__;o__;f__;g__;s__Rubber viroid...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__;g__;s__Cherry leaf s...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Avsunviroidae;g__Avsu...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Posp...    0   0    0    0      0   
k__Viruses;p__unclassified viruses;c__Lake Sara...    0   0    0    0      0   
k__Viruses;p__ssDNA viruses;c__unclassified ssD...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Apsc...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Posp...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Host...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Coca...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Apsc...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__;g__;s__Apple fruit c...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Coca...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Coca...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Apsc...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Avsunviroidae;g__Pela...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__;g__;s__Apple hammerh...    0   0    0    0      0   
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   
k__Viruses;p__Satellites;c__Satellite Nucleic A...    0   0    0    0      0   

                                                       6  7       8     sum  
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...     0  0     120     120  
k__Bacteria;p__Planctomycetes;c__Planctomycetia...     0  0  181382  181487  
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacilla...   139  0  111122  111301  
k__Viruses;p__dsDNA viruses, no RNA stage;c__Ca...    16  0     469     486  
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...     0  0     129     129  
k__Viruses;p__dsRNA viruses;c__Totiviridae;o__;...     0  0     120     120  
k__Viruses;p__dsRNA viruses;c__unclassified dsR...     0  0      36      36  
k__Viruses;p__unclassified viruses;c__Torulaspo...     0  0      34      34  
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...     0  0     242     242  
k__Bacteria;p__Proteobacteria;c__Epsilonproteob...    33  0   34169   34225  
k__Bacteria;p__Proteobacteria;c__Gammaproteobac...   127  0   91953   92084  
k__Bacteria;p__Firmicutes;c__Clostridia;o__Ther...     0  0   53578   53788  
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clos...     0  0   91465   91551  
k__Bacteria;p__Actinobacteria;c__Actinobacteria...   874  0  162362  163279  
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacilla...     0  0   55565   55577  
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...     0  0     185     185  
k__Viruses;p__dsDNA viruses, no RNA stage;c__He...     0  0    4058    4058  
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacilla...    21  0   76201   76266  
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...     0  0     222     222  
k__Viruses;p__dsDNA viruses, no RNA stage;c__He...     6  0    3129    3135  
k__Bacteria;p__Bacteroidetes;c__;o__Bacteroidet...     0  0   65232   65232  
k__Bacteria;p__Tenericutes;c__Mollicutes;o__Myc...    36  0   16837   16873  
k__Bacteria;p__Actinobacteria;c__Actinobacteria...   898  0  162274  163181  
k__Bacteria;p__Actinobacteria;c__Actinobacteria...    89  0  116766  116876  
k__Bacteria;p__Deinococcus-Thermus;c__Deinococc...   256  0   92607   92863  
k__Bacteria;p__Firmicutes;c__Clostridia;o__Ther...   674  0   58060   59405  
k__Viruses;p__ssRNA viruses;c__ssRNA positive-s...     0  0     196     196  
k__Bacteria;p__Proteobacteria;c__Gammaproteobac...  3114  0   52027   92142  
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clos...     0  0   53485   53506  
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...     7  0   58737   58744  
...                                                  ... ..     ...     ...  
k__Viroids;p__;c__;o__;f__;g__;s__Rubber viroid...     0  0       7       7  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       4       4  
k__Viroids;p__;c__;o__;f__;g__;s__Cherry leaf s...     0  0       8       8  
k__Viroids;p__;c__;o__;f__Avsunviroidae;g__Avsu...     0  0       4       4  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Posp...     0  0       6       6  
k__Viruses;p__unclassified viruses;c__Lake Sara...     0  0      16      16  
k__Viruses;p__ssDNA viruses;c__unclassified ssD...     0  0      32      32  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Apsc...     0  0       6       6  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...     0  0       5       5  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       6       6  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       9       9  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Posp...     0  0       7       7  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       7       7  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       7       7  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       4       4  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Host...     0  0       6       6  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Coca...     0  0       5       5  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...     0  0       6       6  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Apsc...     0  0       7       7  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       9       9  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...     0  0       7       7  
k__Viroids;p__;c__;o__;f__;g__;s__Apple fruit c...     0  0       7       7  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Coca...     0  0       4       4  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Coca...     0  0       5       5  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Apsc...     0  0       5       5  
k__Viroids;p__;c__;o__;f__Avsunviroidae;g__Pela...     0  0       7       7  
k__Viroids;p__;c__;o__;f__;g__;s__Apple hammerh...     0  0       8       8  
k__Viroids;p__;c__;o__;f__Pospiviroidae;g__Cole...     0  0       5       5  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       5       5  
k__Viruses;p__Satellites;c__Satellite Nucleic A...     0  0       6       6  

[12976 rows x 9 columns]
Counter({'k__Viruses': 7194, 'k__Bacteria': 4884, 'k__BacteriaPlasmid': 577, 'k__Archaea': 238, 'k__Viroids': 46, 'k__ArchaeaPlasmid': 37})