In [1]:
import ipyrad as ip
import ipyparallel as ipp
In [2]:
from ipyrad.assemble.clustmap import *
from ipyrad.assemble.consens_se import *
In [3]:
ipyclient = ipp.Client()
In [4]:
data = ip.Assembly("test")
data.set_params("project_dir", "hotfix")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("assembly_method", "reference")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CCG"))
data.get_params()
In [5]:
data.run("12", force=True)
In [7]:
data.run("3")
In [8]:
data.run('4')
In [9]:
data.run("5")
In [11]:
data.run("6")
In [51]:
from ipyrad.assemble.write_outputs import *
self = Step7(data, True, ipyclient)
self.split_clusters()
jobs = glob.glob(os.path.join(self.data.tmpdir, "chunk-*"))
jobs = sorted(jobs, key=lambda x: int(x.rsplit("-")[-1]))
for jobfile in jobs:
    args = (self.data, self.chunksize, jobfile)
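As a quick sanity check (not from the original session), the numeric sort key above keeps chunk files in locus order rather than lexicographic order:
names = ["chunk-2000", "chunk-0", "chunk-10000"]
print(sorted(names, key=lambda x: int(x.rsplit("-")[-1])))
# -> ['chunk-0', 'chunk-2000', 'chunk-10000']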
In [119]:
def get_edges(self, seqs):
    """
    Trim terminal edges or mask internal edges based on three criteria and
    take the max for each edge.
    1. user entered hard trimming.
    2. removing cutsite overhangs.
    3. trimming singleton-like overhangs from seqs of diff lengths.
    """
    # record whether to filter this locus based on sample coverage
    bad = False

    # 1. hard trim edges
    trim1 = np.array(self.data.paramsdict["trim_loci"])

    # 2. fuzzy match for trimming restriction site where it's expected.
    trim2 = np.array([0, 0, 0, 0])
    overhangs = np.array([
        i.encode() for i in self.data.paramsdict["restriction_overhang"]
    ])
    for pos, overhang in enumerate(overhangs):
        if overhang:
            cutter = np.array(list(overhang))
            trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)

    # 3. find where the edge is not indel marked (really unknown ("N"))
    trim3 = np.array([0, 0, 0, 0])
    try:
        minsamp = min(4, seqs.shape[0])
        # minsamp = max(minsamp, self.data.paramsdict["min_samples_locus"])
        mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
        for pos in range(4):
            trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)
    except ValueError:
        print('error')
        bad = True

    # get max edges
    print(trim1, trim2, trim3)
    trim = np.max([trim1, trim2, trim3], axis=0)

    # return edges as slice indices
    r1left = trim[0]

    # single-end simple:
    if "pair" not in self.data.paramsdict["datatype"]:
        r1right = seqs.shape[1] - trim[1]
        r2left = r2right = r1right
        edges = (r1left, r1right, r2left, r2right)
    else:
        # paired data: this branch was left unfinished in the notebook.
        # A sketch, assuming R1 and R2 are joined by an 'nnnn' spacer
        # (byte 110; if seqs were uppercased upstream use 78 instead):
        spacer = np.where(np.all(seqs == 110, axis=0))[0]
        r1right = spacer.min() - trim[1]
        r2left = spacer.max() + 1 + trim[2]
        r2right = seqs.shape[1] - trim[3]
        edges = (r1left, r1right, r2left, r2right)

    # get filter
    if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):
        bad = True

    # if loc length is too short then filter
    if (r2right - r1left) < self.data.paramsdict["filter_min_trim_len"]:
        bad = True
    return bad, edges
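A small sketch, separate from the notebook code above, of how the three trim vectors combine: np.max is taken element-wise, so the most aggressive trim wins at each of the four edges.
import numpy as np
trim1 = np.array([0, 0, 0, 0])    # hard trims from trim_loci
trim2 = np.array([5, 3, 0, 0])    # cutter-overhang trims
trim3 = np.array([2, 6, 0, 0])    # minsamp edge trims
print(np.max([trim1, trim2, trim3], axis=0))   # [5 6 0 0]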
In [120]:
def check_minsamp(seq, position, minsamp, mincovs):
    "used in Processor.get_edges() for trimming edges of - or N sites."
    if position == 0:
        # find leftmost edge with minsamp coverage
        leftmost = np.where(mincovs >= minsamp)[0]
        if leftmost.size:
            return leftmost.min()
        # No site reaches minsamp coverage even though there are >= minsamp
        # rows of data; this can happen when reads only partially overlap.
        # The locus should be excluded by the minsamp filter.
        else:
            raise ValueError("no sites above minsamp coverage in edge trim")
    elif position == 1:
        maxhit = np.where(mincovs >= minsamp)[0].max()
        return seq.shape[1] - (maxhit + 1)
    ## TODO...
    elif position == 2:
        return 0
    else:
        return 0
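A toy illustration (not from the original run) of the coverage vector these checks operate on: 78 is 'N' and 45 is '-', so mincovs counts informative samples at each column.
import numpy as np
seqs = np.array(
    [list(b"NNAAT-"), list(b"N-AATT"), list(b"NAAATT")], dtype=np.uint8)
mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
print(mincovs)                              # [0 1 3 3 3 2]
print(check_minsamp(seqs, 0, 3, mincovs))   # 2 columns trimmed from the left
print(check_minsamp(seqs, 1, 3, mincovs))   # 1 column trimmed from the right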
In [121]:
# store list of edge trims for VCF building
edgelist = []
# todo: this could be an iterator...
with open(self.chunkfile, 'rb') as infile:
    loci = infile.read().split(b"//\n//\n")

# iterate over loci (skip any empty trailing chunk left by the separator)
for iloc, loc in enumerate(loci):
    if not loc.strip():
        continue

    # load names and seqs
    lines = loc.decode().strip().split("\n")
    names = []
    nidxs = []
    aseqs = []
    useqs = []
    for line in lines:
        if line[0] == ">":
            name, nidx = line[1:].rsplit("_", 1)
            names.append(name)
            nidxs.append(nidx)
        else:
            aseqs.append(list(line))
            useqs.append(list(line.upper()))

    # filter to include only samples in this assembly
    mask = [i in self.data.snames for i in names]
    names = np.array(names)[mask].tolist()
    nidxs = np.array(nidxs)[mask].tolist()
    useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)
    aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)

    # apply filters
    efilter, edges = get_edges(self, useqs)
    print(efilter, edges)
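For reference, a hedged sketch of the record layout this parser expects: fasta-like name/sequence pairs per locus, with loci separated by "//\n//\n" (the names and sequences below are made up).
example_chunk = (
    b">sample-A_0\nTGCAGAATTCCA\n"
    b">sample-B_0\nTGCAGAATTCCT\n"
    b"//\n//\n"
)
for loc in example_chunk.split(b"//\n//\n"):
    if loc.strip():
        print(loc.decode().strip().split("\n"))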
In [54]:
data = self.data
chunksize = self.chunksize
chunkfile = jobs[0]
self = Processor(data, chunksize, chunkfile)
In [23]:
self.remote_process_chunks()
self.collect_stats()
self.store_file_handles()
In [24]:
start = time.time()
printstr = ("building arrays ", "s7")
rasyncs = {}
args0 = (self.data,)
args1 = (self.data, self.ntaxa, self.nbases, self.nloci)
args2 = (self.data, self.ntaxa, self.nsnps)
write_loci_and_alleles(*args0)
In [25]:
fill_seq_array(*args1)
In [49]:
data = self.data
ntaxa = self.ntaxa
nsnps = self.nsnps

# get faidict to convert chroms to ints
if data.isref:
    faidict = chroms2ints(data, True)

# open new database file handle
with h5py.File(data.snps_database, 'w') as io5:

    # Database files for storing arrays on disk.
    # Should optimize for slicing by rows if we run into slow writing, or
    # it uses too much mem. For now let h5py do the auto-chunking.
    io5.create_dataset(
        name="snps",
        shape=(ntaxa, nsnps),
        dtype=np.uint8,
    )
    # store snp locations:
    # (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)
    io5.create_dataset(
        name="snpsmap",
        shape=(nsnps, 5),
        dtype=np.uint32,
    )
    # store pseudo-reference alleles at each SNP
    io5.create_dataset(
        name="pseudoref",
        shape=(nsnps, 4),
        dtype=np.uint8,
    )
    # store genotype calls (0/0, 0/1, 0/2, etc.)
    io5.create_dataset(
        name="genos",
        shape=(nsnps, ntaxa, 2),
        dtype=np.uint8,
    )

    # gather all loci bits
    locibits = glob.glob(os.path.join(data.tmpdir, "*.loci"))
    sortbits = sorted(
        locibits, key=lambda x: int(x.rsplit("-", 1)[-1][:-5]))

    # name order for entry in array
    sidxs = {sample: i for (i, sample) in enumerate(data.snames)}

    # iterate through file
    start = end = 0
    tmploc = {}
    locidx = 1
    snpidx = 1

    # array to store until writing
    tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)
    tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)

    # iterate over chunkfiles
    for bit in sortbits:

        # iterate lines of file until locus endings
        for line in iter(open(bit, 'r')):

            # still filling locus until |\n
            if "|\n" not in line:
                name, seq = line.split()
                tmploc[name] = seq

            # locus is full, dump it
            else:
                # convert seqs to an array
                loc = (
                    np.array([list(i) for i in tmploc.values()])
                    .astype(bytes).view(np.uint8)
                )
                snps, idxs, _ = line[len(data.snppad):].rsplit("|", 2)
                snpsmask = np.array(list(snps)) != " "
                snpsidx = np.where(snpsmask)[0]

                # select only the SNP sites
                snpsites = loc[:, snpsmask]

                # store end position of locus for map
                end = start + snpsites.shape[1]
                print(start, end)
                for idx, name in enumerate(tmploc):
                    tmparr[sidxs[name], start:end] = snpsites[idx, :]

                # store snpsmap data 1-indexed with chroms info
                if data.isref:
                    chrom, pos = idxs.split(",")[0].split(":")
                    chromstart = int(pos.split("-")[0])
                    # chromidx = faidict[chrom]
                    chromidx = int(chrom)
                    for isnp in range(snpsites.shape[1]):
                        isnpx = snpsidx[isnp]
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, isnpx, chromidx, isnpx + chromstart,
                        )
                        snpidx += 1

                # store snpsmap data (snpidx is 1-indexed)
                else:
                    for isnp in range(snpsites.shape[1]):
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, snpsidx[isnp], 0, snpidx,
                        )
                        snpidx += 1
                locidx += 1

                # reset locus
                start = end
                tmploc = {}

    # fill missing with 78 (N)
    tmparr[tmparr == 0] = 78
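A small self-contained check (not part of the notebook) of the SNP-string parsing used above: columns marked with anything other than a space in the .loci SNP line are treated as SNP sites.
import numpy as np
snps = "  *   -  *"
snpsmask = np.array(list(snps)) != " "
print(np.where(snpsmask)[0])   # [2 6 9]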
In [48]:
print(tmparr.shape)
print(start, end)
tmparr[sidxs[name], start:end]
Out[48]:
In [21]:
fill_snp_array(*args2)
In [12]:
data.run("7")
In [5]:
data.run("3")
In [6]:
data.run("456")
In [11]:
data.run("1", force=True)
In [9]:
data = ip.load_json("tortas/5-tortas.json")
In [18]:
d2 = data.branch("d2", subsamples=["AGO09concat"])
In [19]:
d2.run("3", force=True)
In [20]:
d2.run("45", force=True)
In [7]:
d2 = ip.load_json("./tortas/d2.json")
In [9]:
d2.stats
Out[9]:
In [11]:
self = Step5(data, True, ipyclient)
In [12]:
self.remote_calculate_depths()
In [13]:
self.remote_make_chunks()
In [14]:
statsdicts = self.remote_process_chunks()
In [15]:
statsdicts
Out[15]:
In [ ]:
self.remote_concatenate_chunks()
In [ ]:
self.data_store(statsdicts)
In [10]:
data.stats
Out[10]:
In [5]:
dd = data.branch("dd")
dd.run("5", force=True)
In [6]:
dd.run("3", force=True)
Out[6]:
In [4]:
data.run("5", force=True)
In [5]:
step = Step6(data, True, ipyclient)
In [6]:
step.remote_concat_bams()
In [7]:
step.remote_build_ref_regions()
In [8]:
self = step
In [9]:
regs = self.regions[:20]
regs
Out[9]:
In [11]:
# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
snames = sorted([i.name for i in self.samples])
nsamples = len(snames)
outfile.write(
    b" ".join([i.encode() for i in snames]) + b"\n")
# get clusters
lidx = 0
clusts = []
# while 1:
#     try:
#         region = next(self.regions)
#         reads = bamfile.fetch(*region)
#     except StopIteration:
#         break
for region in regs:
    reads = bamfile.fetch(*region)

    # get reference
    print(region)
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"],
        region[0], region[1] + 1, region[2] + 1)

    # build cluster dict for sorting
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

    # build cluster based on map positions (reads are already oriented)
    arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)

    # fill it
    arr[0] = list(refs)
    for idx, key in enumerate(keys):
        # get start and stop from this seq
        sname = key.rsplit("_", 1)[0]
        rstart, rstop = key.rsplit(":", 2)[-1].split("-")
        sidx = snames.index(sname)

        # get start relative to ref
        start = int(rstart) - int(region[1]) - 1
        stop = start + int(rstop) - int(rstart)
        print(sidx + 1, start, stop, arr.shape[1])
        arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
    print("")

    arr[arr == b""] = b"N"
    for line in arr:
        outfile.write(line.tostring() + b"\n")
    outfile.write(b"\n")

outfile.close()
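A hedged arithmetic note on the offset math above (numbers are illustrative, not from the run): rstart/rstop come from the read name, region[1] is the region start, and the -1 presumably converts the 1-based positions in the names to 0-based array columns.
region_start = 1000                  # region[1]
rstart, rstop = 1005, 1085           # parsed from the read name
start = rstart - region_start - 1    # -> column 4 of the local array
stop = start + (rstop - rstart)      # -> column 84
print(start, stop)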
In [50]:
print(start, stop, stop-start, len(rdict[key]), rstart, rstop, int(rstop) - int(rstart))
print(region, region[2] - region[1], len(refs))
In [61]:
key.split("")
In [62]:
snames
Out[62]:
In [141]:
revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
Out[141]:
In [37]:
# get consens seq and variant site index
clust = []
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
for key in keys:
    clust.append("{}\n{}".format(key, rdict[key]))
clust.append("SNPs\n" + snpstring)
clusts.append("\n".join(clust))

# advance locus counter
lidx += 1

# write chunks to file
if not lidx % 1000:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    clusts = []

# write remaining
if clusts:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()
In [16]:
step.remote_build_ref_clusters()
In [4]:
from ipyrad.assemble.clustmap import *
In [6]:
self = Step3(data, 8, True, ipyclient)
In [7]:
self.run()
In [8]:
self.data.run("45")
In [13]:
self.remote_index_refs()
self.remote_run(
function=concat_multiple_edits,
printstr=("concatenating ", "s3"),
args=(),
)
self.remote_run(
function=merge_end_to_end,
printstr=("join unmerged pairs ", "s3"),
args=(False, False,),
)
In [15]:
self.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
In [16]:
self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps ", "s3"),
    args=(),
)
In [ ]:
self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
In [ ]:
sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)
In [15]:
sample = list(data.samples.values())[0]
infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
infile
Out[15]:
In [19]:
strand = "plus"
if data.paramsdict["datatype"] is ('gbs' or '2brad'):
strand = "both"
nthreads=2
cmd = [
ip.bins.vsearch,
"--derep_fulllength", infile,
"--strand", strand,
"--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
"--threads", str(nthreads),
"--fasta_width", str(0),
"--fastq_qmax", "1000",
"--sizeout",
"--relabel_md5",
]
cmd
Out[19]:
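Roughly the shell command the list above assembles (paths are illustrative, not from the run):
print(" ".join(cmd))
# e.g.: vsearch --derep_fulllength <tmpdir>/<sample>_declone.fastq --strand plus \
#       --output <tmpdir>/<sample>_derep.fastq --threads 2 --fasta_width 0 \
#       --fastq_qmax 1000 --sizeout --relabel_md5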
In [20]:
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
    ip.logger.error("error inside dereplicate %s", errmsg)
    raise IPyradWarningExit(errmsg)
In [10]:
s.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(s.nthreads,),
    threaded=True,
)