In [1]:
%%time
base = "/project/flatiron2/analysis_SHOGUN/data/references/linear"
fna = "{}/rep82_combined.fna".format(base)
tax = "{}/rep82_combined.tax".format(base)
with open(fna) as inf:
    with open(fna[:-4] + ".fixed.fna", "w") as outf:
        for line in inf:
            line = line.rstrip()
            if line[0] == ">":
                row = line.split()
                outf.write(row[0] + "\n")
            else:
                outf.write(line + "\n")


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-1-3ef63cce8835> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', 'base = "/project/flatiron2/analysis_SHOGUN/data/references/linear"\nfna = "{}/rep82_combined.fna".format(base)\ntax = "{}/rep82_combined.tax".format(base)\nwith open(fna) as inf:\n    with open(fna[:-4] + ".fixed.fna", "w") as outf:\n        for line in inf:\n            line = line.rstrip()\n            if line[0] == ">":\n                row = line.split()\n                outf.write(row[0] + "\\n")\n            else:\n                outf.write(line + "\\n")')

/export/scratch/miniconda3/envs/analysis_SHOGUN/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
   2113             magic_arg_s = self.var_expand(line, stack_depth)
   2114             with self.builtin_trap:
-> 2115                 result = fn(magic_arg_s, cell)
   2116             return result
   2117 

<decorator-gen-59> in time(self, line, cell, local_ns)

/export/scratch/miniconda3/envs/analysis_SHOGUN/lib/python3.5/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/export/scratch/miniconda3/envs/analysis_SHOGUN/lib/python3.5/site-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns)
   1183         else:
   1184             st = clock2()
-> 1185             exec(code, glob, local_ns)
   1186             end = clock2()
   1187             out = None

<timed exec> in <module>()

FileNotFoundError: [Errno 2] No such file or directory: '/project/flatiron2/analysis_SHOGUN/data/references/linear/rep82_combined.fna'

In [6]:
with open(tax) as inf:
    with open(tax[:-4] + ".fixed.tax", "w") as outf:
        for line in inf:
            line = line.rstrip()
            row = line.split("\t")
            row[0] = row[0].split()[0]
            outf.write("\t".join(row) + "\n")

In [ ]:
%%capture capt
!/export/scratch/ben/bin/utree-build_gg {base}/rep82_combined.fixed.fna {base}/rep82_combined.fixed.tax {base}/rep82_combined.fixed.ubt 48 16

In [11]:
print(capt.stdout)


Using up to 48 threads.
Tree initialized.
Setting overlap to 16
Parsed map. 1948713 bytes, 12976 lines.
Done with sequence parse: 1262899870 k-mers made
Refining tree...
File parsed.
Total nodes in tree: 1259261702 [14563 labels]
Tree written.


In [14]:
# Number of taxa
!wc -l {tax}


12976 /project/flatiron2/analysis_SHOGUN/data/references/linear/rep82_combined.tax

In [15]:
# Number of Plasmids
!grep "Plasmid" {tax} | wc -l


614

In [17]:
# Compress the UTree DB
!/export/scratch/ben/bin/utree-compress {base}/rep82_combined.fixed.ubt {base}/rep82_combined.fixed.ctr


Nodes in input tree: 1259261702 (PACKSIZE=32, CNTTYPE=NA, IXTYPE=uint16_t, el=10)
Using 32-bit counters
Total nodes in tree: 1259260685 [14539 labels]

In [27]:
def read_fasta(fh):
    """
    :return: tuples of (title, seq)
    """
    title = None
    data = None
    for line in fh:
        if line[0] == ">":
            if title:
                yield (title.strip(), data)
            title = line[1:]
            data = ''
        else:
            data += line.strip()
    if not title:
        yield None
    yield (title.strip(), data)

In [28]:
%%time
from re import sub
with open(base + "/rep82_combined.dusted.fna") as inf:
    with open(base + "/rep82_combined.dusted.hard.fna", "w") as outf:
        for title, seq in read_fasta(inf):
            outf.write(">" + title + "\n")
            outf.write(sub("[a-z]", "N", seq) + "\n")


CPU times: user 7min 22s, sys: 31.2 s, total: 7min 53s
Wall time: 10min 38s

In [ ]:
%%capture capt2
%time !/export/scratch/ben/bin/utree-build_gg {base}/rep82_combined.dusted.hard.fna {base}/rep82_combined.fixed.tax {base}/rep82_combined.dusted.ubt 48 16

In [31]:
print(capt2.stdout)


Using up to 48 threads.
Tree initialized.
Setting overlap to 16
Parsed map. 1948713 bytes, 12976 lines.
Done with sequence parse: 1210821923 k-mers made
Refining tree...
File parsed.
Total nodes in tree: 1207684503 [14555 labels]
Tree written.
CPU times: user 24.8 s, sys: 2.79 s, total: 27.5 s
Wall time: 1h 20min 13s


In [32]:
# Compress the UTree DB
!/export/scratch/ben/bin/utree-compress {base}/rep82_combined.dusted.ubt {base}/rep82_combined.dusted.ctr


Nodes in input tree: 1207684503 (PACKSIZE=32, CNTTYPE=NA, IXTYPE=uint16_t, el=10)
Using 32-bit counters
Total nodes in tree: 1207684496 [14538 labels]

In [ ]: