In [6]:
import ipyrad as ip
import ipyparallel as ipp
In [7]:
ipyclient = ipp.Client()
In [3]:
data = ip.Assembly("1-pairtest")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CGG"))
In [3]:
data = ip.Assembly("2-setest")
data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "rad")
data.set_params("restriction_overhang", ("TGCAG", ""))
In [3]:
data = ip.Assembly("3-refsetest")
data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "rad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "ipsimdata/rad_example_genome.fa")
data.set_params("restriction_overhang", ("TGCAG", ""))
#data.get_params()
In [5]:
data = ip.Assembly("4-refpairtest")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("restriction_overhang", ("TGCAG", "CGG"))
#data.get_params()
In [8]:
data = ip.Assembly("5-tortas")
data.set_params("project_dir", "tortas")
data.set_params("sorted_fastq_path", "/home/deren/Dropbox/Maud/fastq-concats/*.gz")
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "/home/deren/Dropbox/Maud/lgeorge.genome.fa")
data.set_params("restriction_overhang", ("CATG", "AATT"))
data.set_params("filter_adapters", 2)
In [4]:
data.run("12", force=True)
In [11]:
#data.run("12", force=True)
#data = ip.load_json("1-pairtest.json")
#data = ip.load_json("2-setest.json")
#data = ip.load_json("3-refsetest.json")
#data = ip.load_json("4-refpairtest.json")
data = ip.load_json("tortas/5-tortas.json")
In [12]:
Out[12]:
In [9]:
from ipyrad.assemble.cluster_across import *
In [10]:
from ipyrad.assemble.clustmap import *
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]
In [7]:
s3.data.ipcluster["threads"] = 4
s3.run()
In [11]:
s3.data.run("45")
In [16]:
#s3.data.stats
data = s3.data
jobid = 0
samples = list(data.samples.values())[:4]
randomseed = 123
In [ ]:
In [19]:
conshandles = [
sample.files.consens[0] for sample in samples if
sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"
conshandles
Out[19]:
In [20]:
## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
data.dirs.across,
"{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
call = sps.Popen(cmd, stdout=output, close_fds=True)
call.communicate()
## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
"/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
"/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)
In [27]:
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
with open(allhaps, 'w') as output:
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
proc2.communicate()
proc1.stdout.close()
In [30]:
data.dirs.across = os.path.join(data.name + "_across")
if not os.path.exists(data.dirs.across):
os.makedirs(data.dirs.across)
import ipyrad
In [31]:
conshandles = [
sample.files.consens[0] for sample in samples if
sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"
## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
data.dirs.across,
"{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
call = sps.Popen(cmd, stdout=output, close_fds=True)
call.communicate()
## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
"/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
"/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)
## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
with open(allhaps, 'w') as output:
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
proc2.communicate()
proc1.stdout.close()
## now sort the file using vsearch
allsort = groupcons.replace("-catcons.gz", "-catsort.fa")
cmd1 = [ipyrad.bins.vsearch,
"--sortbylength", allhaps,
"--fasta_width", "0",
"--output", allsort]
LOGGER.debug(" ".join(cmd1))
proc1 = sps.Popen(cmd1, close_fds=True)
proc1.communicate()
Out[31]:
In [34]:
from ipyrad.assemble.cluster_across import *
random.seed(randomseed)
## open an iterator to the length-sorted file and grab two lines at a time
allshuf = groupcons.replace("-catcons.gz", "-catshuf.fa")
outdat = open(allshuf, 'wt')
indat = open(allsort, 'r')
idat = izip(iter(indat), iter(indat))
done = 0
chunk = [next(idat)]
while not done:
## grab 2-lines until they become shorter (unless there's only one)
oldlen = len(chunk[-1][-1])
while 1:
try:
dat = next(idat)
except StopIteration:
done = 1
break
if len(dat[-1]) == oldlen:
chunk.append(dat)
else:
## send the last chunk off to be processed
random.shuffle(chunk)
outdat.write("".join(chain(*chunk)))
## start new chunk
chunk = [dat]
break
## do the last chunk
random.shuffle(chunk)
outdat.write("".join(chain(*chunk)))
indat.close()
outdat.close()
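The chunk-and-shuffle loop above can be illustrated on a small in-memory list: records arrive sorted by length, and within each run of equal-length records the order is randomized. This is a toy sketch with made-up records, not the file-based code.
In [ ]:
# toy illustration of the shuffle-within-length-group logic above, using an
# in-memory list of (name, seq) pairs instead of the vsearch-sorted fasta
import random
from itertools import chain
records = [(">a\n", "AAAAAA\n"), (">b\n", "CCCCCC\n"), (">c\n", "GGGGGG\n"),
           (">d\n", "TTTT\n"), (">e\n", "AACC\n")]
random.seed(123)
out, chunk = [], [records[0]]
for dat in records[1:]:
    if len(dat[-1]) == len(chunk[-1][-1]):
        chunk.append(dat)
    else:
        # length changed: shuffle the finished group and start a new one
        random.shuffle(chunk)
        out.append("".join(chain(*chunk)))
        chunk = [dat]
random.shuffle(chunk)
out.append("".join(chain(*chunk)))
print("".join(out))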
In [50]:
def build_clusters_from_cigars(data, sample):
# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
# access reads from bam file using pysam
bamfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
# iterate over all regions
opath = os.path.join(
data.dirs.clusts, "{}.clustS.gz".format(sample.name))
out = gzip.open(opath, 'wt')
idx = 0
clusters = []
for reg in regions:
# uncomment and compare against ref sequence when testing
#ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = bamfile.fetch(*reg)
# store reads in a dict
rdict = {}
# paired-end data cluster building
if "pair" in data.paramsdict["datatype"]:
# match paired reads together in a dictionary
for read in reads:
if read.qname not in rdict:
rdict[read.qname] = [read, None]
else:
rdict[read.qname][1] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
r1, r2 = rdict[key]
if r1 and r2:
#lref = len(ref[1])
lref = reg[2] - reg[1]
arr1 = np.zeros(lref, dtype="U1")
arr2 = np.zeros(lref, dtype="U1")
arr1.fill("-")
arr2.fill("-")
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
seq = cigared(r2.seq, r2.cigar)
start = r2.reference_start - reg[1]
arr2[start:start + len(seq)] = list(seq)
arr3 = join_arrays(arr1, arr2)
pairseq = "".join(arr3)
ori = "+"
if r1.is_reverse:
ori = "-"
derep = r1.qname.split("=")[-1]
rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
clust.append("{}\n{}".format(rname, pairseq))
# single-end data cluster building
else:
for read in reads:
rdict[read.qname] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
r1 = rdict[key]
#aref = np.array(list(ref[1]))
lref = reg[2] - reg[1]
arr1 = np.zeros(lref, dtype="U1")
arr1.fill("-")
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
aseq = "".join(arr1)
ori = "+"
if r1.is_reverse:
ori = "-"
derep = r1.qname.split("=")[-1]
rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
clust.append("{}\n{}".format(rname, aseq))
# store this cluster
clusters.append("\n".join(clust))
idx += 1
        # every 10 regions, write the stored clusters to disk
if not idx % 10:
out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
clusters = []
# write final remaining clusters to disk
out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
out.close()
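A toy example of the array placement used above: a read's cigar-adjusted sequence is dropped into a dash-filled array spanning the bedtools region, offset by its reference_start minus the region start (coordinates and sequence made up).
In [ ]:
# toy example of slotting a read into the region-length array used above
import numpy as np
reg = ("Contig0", 100, 120)            # (chrom, start, end) from bedtools
lref = reg[2] - reg[1]                 # region length = 20
arr = np.zeros(lref, dtype="U1")
arr.fill("-")
seq = "ACGTACGT"                       # read sequence after cigared()
reference_start = 106                  # read's mapping position on the chrom
start = reference_start - reg[1]       # offset within the region = 6
arr[start:start + len(seq)] = list(seq)
print("".join(arr))                    # ------ACGTACGT------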
In [51]:
build_clusters_from_cigars(data, sample)
In [1]:
#maxlens, depths = get_quick_depths(data, sample)
In [2]:
#sample_cleanup(data, sample)
In [18]:
def get_quick_depths(data, sample):
""" iterate over clustS files to get data """
    ## use the existing sample cluster path if it exists, since this func
    ## can also be called in step 4, which may run after assemblies are
    ## merged following step 3; in that case referencing data.dirs.clusts
    ## would give a broken path.
    if not sample.files.clusters:
        ## set cluster file handle
        sample.files.clusters = os.path.join(
            data.dirs.clusts, sample.name + ".clustS.gz")
## get new clustered loci
fclust = data.samples[sample.name].files.clusters
clusters = gzip.open(fclust, 'rt')
pairdealer = izip(*[iter(clusters)] * 2)
## storage
depths = []
maxlen = []
## start with cluster 0
tdepth = 0
tlen = 0
## iterate until empty
while 1:
## grab next
try:
name, seq = next(pairdealer)
except StopIteration:
break
## if not the end of a cluster
#print name.strip(), seq.strip()
if name.strip() == seq.strip():
depths.append(tdepth)
maxlen.append(tlen)
tlen = 0
tdepth = 0
else:
try:
tdepth += int(name.strip().split("=")[-1][:-2])
tlen = len(seq)
except:
print(name)
## return
clusters.close()
return np.array(maxlen), np.array(depths)
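The two-lines-at-a-time parsing can be checked on a tiny fake clustS chunk: the '//' separator appears on both lines of a pair, so name == seq marks the end of a cluster. The record format here is hypothetical (names ending in ';size=N;*') and the built-in zip stands in for izip.
In [ ]:
# toy check of the clustS parsing logic above on a fake chunk
import io
fake = io.StringIO(
    ">loc1_a;size=4;*\n" "ACGTACGT\n"
    ">loc1_b;size=2;+\n" "ACGTACGA\n"
    "//\n" "//\n"
    ">loc2_a;size=3;*\n" "TTTTGGGG\n"
    "//\n" "//\n")
pairdealer = zip(*[iter(fake)] * 2)
depths, maxlens, tdepth, tlen = [], [], 0, 0
for name, seq in pairdealer:
    if name.strip() == seq.strip():     # both lines are '//': end of cluster
        depths.append(tdepth)
        maxlens.append(tlen)
        tdepth = tlen = 0
    else:
        tdepth += int(name.strip().split("=")[-1][:-2])
        tlen = len(seq)
print(depths, maxlens)                  # [6, 3] [9, 9]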
In [14]:
def sample_cleanup(data, sample):
""" stats, cleanup, and link to samples """
## get maxlen and depths array from clusters
maxlens, depths = get_quick_depths(data, sample)
## Test if depths is non-empty, but just full of zeros.
if not depths.max():
print(" no clusters found for {}".format(sample.name))
return
else:
## store which min was used to calculate hidepth here
sample.stats_dfs.s3["hidepth_min"] = (
data.paramsdict["mindepth_majrule"])
        # If the longest sequence exceeds the current max_fragment_length,
        # update it, adding 4 to leave room for the pair separator.
hidepths = depths >= data.paramsdict["mindepth_majrule"]
maxlens = maxlens[hidepths]
## Handle the case where there are no hidepth clusters
if maxlens.any():
maxlen = int(maxlens.mean() + (2. * maxlens.std()))
else:
maxlen = 0
if maxlen > data._hackersonly["max_fragment_length"]:
data._hackersonly["max_fragment_length"] = maxlen + 4
## make sense of stats
keepmj = depths[depths >= data.paramsdict["mindepth_majrule"]]
keepstat = depths[depths >= data.paramsdict["mindepth_statistical"]]
## sample summary stat assignments
sample.stats["state"] = 3
sample.stats["clusters_total"] = depths.shape[0]
sample.stats["clusters_hidepth"] = keepmj.shape[0]
## store depths histogram as a dict. Limit to first 25 bins
bars, bins = np.histogram(depths, bins=range(1, 26))
sample.depths = {int(i): int(v) for i, v in zip(bins, bars) if v}
## sample stat assignments
## Trap numpy warnings ("mean of empty slice") printed by samples
## with few reads.
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
sample.stats_dfs.s3["merged_pairs"] = sample.stats.reads_merged
sample.stats_dfs.s3["clusters_total"] = depths.shape[0]
try:
sample.stats_dfs.s3["clusters_hidepth"] = (
int(sample.stats["clusters_hidepth"]))
except ValueError:
## Handle clusters_hidepth == NaN
sample.stats_dfs.s3["clusters_hidepth"] = 0
sample.stats_dfs.s3["avg_depth_total"] = depths.mean()
sample.stats_dfs.s3["avg_depth_mj"] = keepmj.mean()
sample.stats_dfs.s3["avg_depth_stat"] = keepstat.mean()
sample.stats_dfs.s3["sd_depth_total"] = depths.std()
sample.stats_dfs.s3["sd_depth_mj"] = keepmj.std()
sample.stats_dfs.s3["sd_depth_stat"] = keepstat.std()
## Get some stats from the bam files
    ## This is moderately hackish: samtools flagstat reports the total
    ## number of reads in the bam file as the first field of its first
    ## line, and this code relies on that.
if not data.paramsdict["assembly_method"] == "denovo":
## shorter names
mapf = os.path.join(
data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
umapf = os.path.join(
data.dirs.refmapping, sample.name + "-unmapped.bam")
## get from unmapped
cmd1 = [ip.bins.samtools, "flagstat", umapf]
proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
result1 = proc1.communicate()[0]
## get from mapped
cmd2 = [ip.bins.samtools, "flagstat", mapf]
proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
result2 = proc2.communicate()[0]
## store results
## If PE, samtools reports the _actual_ number of reads mapped, both
## R1 and R2, so here if PE divide the results by 2 to stay consistent
## with how we've been reporting R1 and R2 as one "read pair"
if "pair" in data.paramsdict["datatype"]:
sample.stats["refseq_unmapped_reads"] = int(result1.split()[0]) / 2
sample.stats["refseq_mapped_reads"] = int(result2.split()[0]) / 2
else:
sample.stats["refseq_unmapped_reads"] = int(result1.split()[0])
sample.stats["refseq_mapped_reads"] = int(result2.split()[0])
unmapped = os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")
samplesam = os.path.join(data.dirs.refmapping, sample.name + ".sam")
for rfile in [unmapped, samplesam]:
if os.path.exists(rfile):
os.remove(rfile)
# if loglevel==DEBUG
log_level = ip.logger.getEffectiveLevel()
if not log_level == 10:
## Clean up loose files only if not in DEBUG
##- edits/*derep, utemp, *utemp.sort, *htemp, *clust.gz
derepfile = os.path.join(data.dirs.edits, sample.name + "_derep.fastq")
mergefile = os.path.join(data.dirs.edits, sample.name + "_merged_.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
clusters = os.path.join(data.dirs.clusts, sample.name + ".clust.gz")
for rfile in [derepfile, mergefile, uhandle, usort, hhandle, clusters]:
if os.path.exists(rfile):
os.remove(rfile)
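The flagstat parsing above leans on the first whitespace token of samtools flagstat output being the total read count; a minimal sketch with a made-up line.
In [ ]:
# minimal sketch of the flagstat parsing assumption above (made-up line)
fake_flagstat = "2406 + 0 in total (QC-passed reads + QC-failed reads)\n"
total = int(fake_flagstat.split()[0])
print(total)         # 2406
# for paired data the stats above halve this to count read pairs
print(total // 2)    # 1203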
In [7]:
# optimize speed of this next
build_clusters_from_cigars(data, sample)
In [ ]:
### Write each out to a sam file...
# newconsensus()
#
data._este = data.stats.error_est.mean()
data._esth = data.stats.hetero_est.mean()
# clustS files are gzipped, so open with gzip in text mode
clusters = gzip.open(os.path.join(data.dirs.clusts, "{}.clustS.gz".format(sample.name)), 'rt')
clusters.read()
# plan to fill an h5 for this sample
tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
with h5py.File(tmp5, 'w') as io5:
io5.create_dataset("cats", (optim, maxlen, 4), dtype=np.uint32)
io5.create_dataset("alls", (optim, ), dtype=np.uint8)
io5.create_dataset("chroms", (optim, 3), dtype=np.int64)
## local copies to use to fill the arrays
catarr = io5["cats"][:]
nallel = io5["alls"][:]
refarr = io5["chroms"][:]
In [ ]:
### Step 6 for refmapped pairs:
# 1. convert all sams to bams and make a merged mapped-sorted.bam
# 2. get overlapping regions with bedtools_merge()
# 3. pull consens reads in aligned regions with bamfile.fetch()
# 4. store the consensus sequence in h5.
# 5. store the variants in h5.
# 6. store the depth of variants in h5.
# 7.
In [7]:
# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
# access reads from bam file using pysam
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
In [34]:
reg = next(regions)
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = samfile.fetch(*reg)
In [35]:
# match paired reads together in a dictionary
rdict = {}
for read in reads:
rdict[read.qname] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
r1 = rdict[key]
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
aseq = "".join(arr1)
ori = "+"
if r1.is_reverse:
ori = "-"
derep = r1.qname.split("=")[-1]
rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
clust.append("{}\n{}".format(rname, aseq))
In [36]:
clust
Out[36]:
In [30]:
rdict
Out[30]:
In [16]:
mapping_reads(data, sample, 2)
In [ ]:
In [9]:
s3.data.run("5")
In [11]:
subsamples = list(s3.data.samples.values())
subsamples.sort(key=lambda x: x.stats.clusters_hidepth, reverse=True)
jobs = {}
In [8]:
from ipyrad.assemble.jointestimate import *
optim(data, sample)
In [9]:
sample, _, _, nhidepth, maxlen = recal_hidepth(data, sample)
Out[9]:
In [6]:
data
concat_multiple_edits(data, sample)
merge_pairs_with_vsearch(data, sample, True)
merge_end_to_end(data, sample, True, True)
In [7]:
dereplicate(data, sample, 2)
In [8]:
cluster(data, sample, 2, 1)
In [9]:
build_clusters(data, sample, 5)
In [10]:
muscle_chunker(data, sample)
In [14]:
for idx in range(10):
handle = os.path.join(s3.data.tmpdir,
"{}_chunk_{}.ali".format(sample.name, idx))
align_and_parse(handle, 5, 0)
In [15]:
reconcat(data, sample)
In [ ]:
In [6]:
s3.run()
In [6]:
In [9]:
maxlens, depths = get_quick_depths(data, sample)
maxlens, depths
Out[9]:
In [ ]:
In [7]:
data = s3.data
samples = list(s3.data.samples.values())
sample = samples[1]
In [8]:
sample = samples[1]
In [9]:
concat_multiple_edits(data, sample)
In [10]:
merge_end_to_end(data, sample, False)
In [11]:
dereplicate(data, sample, 2)
In [12]:
split_endtoend_reads(data, sample)
In [13]:
mapping_reads(data, sample, 2)
In [14]:
build_ref_cigars(data, sample)
In [ ]:
#samtools mpileup -uf ref.fa aln1.bam aln2.bam | bcftools view -bvcg - > var.raw.bcf
! samtools mpileup -ur /home/deren/Dropbox/opbox/Maud/lgeorge.genome.fa
In [9]:
#regions = bedtools_merge(data, sample).strip().split("\n")
fullregions = bedtools_merge(data, sample).strip().split("\n")
In [10]:
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
In [ ]:
In [14]:
def get_ref_region(reference, contig, rstart, rend):
"returns the reference sequence over a given region"
cmd = [
ip.bins.samtools, 'faidx',
reference,
"{}:{}-{}".format(contig, rstart + 1, rend),
]
stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate()
name, seq = stdout.decode().split("\n", 1)
listseq = [name, seq.replace("\n", "")]
return listseq
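Note the rstart + 1 above: bedtools/pysam regions are 0-based half-open while samtools faidx expects 1-based inclusive coordinates, so a region tuple maps to a query string like this (made-up region).
In [ ]:
# the 0-based half-open region maps to a 1-based inclusive faidx query
reg = ("Contig0", 99, 110)
print("{}:{}-{}".format(reg[0], reg[1] + 1, reg[2]))   # Contig0:100-110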
In [527]:
def build_ref_cigars(data, sample):
# get all regions with reads. Generator to yield (str, int, int)
#fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
# access reads from bam file using pysam
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
# iterate over all regions
out = open("test.clustS", 'w')
idx = 0
clusters = []
for reg in regions:
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = samfile.fetch(*reg)
# match paired reads together in a dictionary
rdict = {}
for read in reads:
if read.qname not in rdict:
rdict[read.qname] = [read, None]
else:
rdict[read.qname][1] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
r1, r2 = rdict[key]
if r1 and r2:
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")
arr2.fill("-")
try:
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
seq = cigared(r2.seq, r2.cigar)
start = r2.reference_start - reg[1]
arr2[start:start + len(seq)] = list(seq)
arr3 = join_arrays(arr1, arr2)
pairseq = "".join(arr3)
derep = r1.qname.split("=")[-1]
clust.append("{}\n{}".format("{}:{}-{};size={}"
.format(*reg, derep), pairseq))
except ValueError:
print(reg)
        clusters.append("\n".join(clust))
        idx += 1
        if not idx % 1000:
            out.write("\n//\n//\n".join(clusters))
            clusters = []
    # write any remaining clusters
    if clusters:
        out.write("\n//\n//\n".join(clusters))
    out.close()
In [549]:
def build_ref_cigars(data, sample):
# get all regions with reads. Generator to yield (str, int, int)
#fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
# access reads from bam file using pysam
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
# iterate over all regions
opath = os.path.join(
data.dirs.refmapping, "{}.clustS.gz".format(sample.name))
out = gzip.open(opath, 'w')
idx = 0
clusters = []
for reg in regions:
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = samfile.fetch(*reg)
# match paired reads together in a dictionary
rdict = {}
for read in reads:
if read.qname not in rdict:
rdict[read.qname] = [read, None]
else:
rdict[read.qname][1] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
r1, r2 = rdict[key]
if r1 and r2:
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")
arr2.fill("-")
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
seq = cigared(r2.seq, r2.cigar)
start = r2.reference_start - reg[1]
arr2[start:start + len(seq)] = list(seq)
arr3 = join_arrays(arr1, arr2)
pairseq = "".join(arr3)
derep = r1.qname.split("=")[-1]
rname = "{}:{}-{};size={}".format(*reg, derep)
clust.append("{}\n{}".format(rname, pairseq))
        clusters.append("\n".join(clust))
        idx += 1
        if not idx % 100:
            out.write("\n//\n//\n".join(clusters).encode())
            clusters = []
    # write any remaining clusters
    if clusters:
        out.write("\n//\n//\n".join(clusters).encode())
    out.close()
In [550]:
c = build_ref_cigars(data, sample)
In [547]:
"\n//\n//\n".join(c).encode()
Out[547]:
In [397]:
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
build_ref_cigars(data, sample)
In [438]:
reg = ('Contig0', 41920, 42479)
reg = ('Contig0', 7219468, 7220326)
reg = ('Contig0', 207998, 208219)
reg = ('Contig0', 16738, 16933)
reg = ('Contig0', 49781, 50005)
reg = ('Contig0', 76480, 76711)
reg = ('Contig0', 24391, 24908)
reg = ('Contig0', 7193, 7952) # has merge
#reg = ('Contig0', 13327, 13845) # has indels
reg = ('Contig0', 76480, 77015) # has indels
reg = ('Contig0', 346131, 346193) # valueerror
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = list(samfile.fetch(*reg))
In [519]:
def cigared(sequence, cigartups):
    "rebuild an aligned sequence from a read and its pysam CIGAR tuples"
    start = 0
    seq = ""
    for flag, add in cigartups:
        if flag == 0:
            # match: keep these read bases
            seq += sequence[start:start + add]
        elif flag == 2:
            # deletion: pad with dashes without consuming read bases
            seq += "-" * add
            start -= add
        # flag 1 (insertion) and flag 4 (soft clip) just skip read bases
        start += add
    return seq
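A couple of usage examples for cigared() with pysam-style CIGAR tuples (0=M, 1=I, 2=D, 4=S), using made-up reads.
In [ ]:
# made-up reads: soft-clips and insertions are skipped, deletions are padded
print(cigared("AAAACCGGTT", [(4, 2), (0, 4), (1, 2), (0, 2)]))   # AACCTT
print(cigared("ACGTACGT", [(0, 4), (2, 2), (0, 4)]))             # ACGT--ACGT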
In [456]:
# match paired reads together in a dictionary
rdict = {}
for read in reads:
if read.qname not in rdict:
rdict[read.qname] = [read]
else:
rdict[read.qname].append(read)
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
for key in keys:
r1, r2 = rdict[key]
if r1 and r2:
print(r1.seq, r1.cigar)# r1.pos, r1.aend, r1.rlen, r1.aligned_pairs)
print(cigared(r1.seq, r1.cigar))
In [310]:
out = open("test2.txt", 'w')
# build the cluster based on map positions, orientation, cigar
for key in keys:
r1, r2 = rdict[key]
if r1 and r2:
# empty arrays
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")
arr2.fill("-")
# how far ahead of the start does this read begin
start = r1.reference_start - reg[1]
seq = cigared(r1.seq, r1.cigar)
arr1[start:start+len(seq)] = list(seq)
seq = cigared(r2.seq, r2.cigar)
start = r2.reference_start - reg[1]
arr2[start:start+len(seq)] = list(seq)
#print("".join(arr1), file=out)
#print("".join(arr2), file=out)
arr3 = join_arrays(arr1, arr2)
print("".join(arr3), file=out)
out.close()
In [307]:
import numba
#numba.jit(nopython=True)
def join_arrays(arr1, arr2):
arr3 = np.zeros(arr1.size, dtype="U1")
for i in range(arr1.size):
if arr1[i] == arr2[i]:
arr3[i] = arr1[i]
elif arr1[i] == "N":
if arr2[i] == "-":
arr3[i] = "N"
else:
arr3[i] = arr2[i]
elif arr2[i] == "N":
if arr1[i] == "-":
arr3[i] = "N"
else:
arr3[i] = arr1[i]
elif arr1[i] == "-":
if arr2[i] == "N":
arr3[i] = "N"
else:
arr3[i] = arr2[i]
elif arr2[i] == "-":
if arr1[i] == "N":
arr3[i] = "N"
else:
arr3[i] = arr1[i]
else:
arr3[i] = "N"
return arr3
a1 = np.array(list("AGAGAG-NN----"))
a2 = np.array(list("----ACTTNNTTT"))
de = np.array(list("AGAGANTTNNTTT"))
a1 = np.array(list("AGAGAG-NN-------"))
a2 = np.array(list("----ACTTNNTTT---"))
de = np.array(list("AGAGANTTNNTTT---"))
In [308]:
join_arrays(a1, a2)
Out[308]:
In [233]:
# how far ahead of the start does this read begin
start = r1.reference_start - reg[1]
Out[233]:
In [228]:
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")
In [194]:
r1.reference_start, r1.pos, r1.reference_end, r1.cigar
Out[194]:
In [202]:
cigared(r1.seq, r1.cigar)
Out[202]:
In [199]:
# which is forward and reverse
if r1.blocks[0][0] > r2.blocks[0][0]:
rx = r2
r2 = r1
r1 = rx
# get arrs
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
# fill arr1 with the cigar-adjusted read at its mapped offset
seq1 = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq1)] = list(seq1)
Out[199]:
In [197]:
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
def cigar(r1, r2, reg):
# which is forward and reverse
if r1.blocks[0][0] > r2.blocks[0][0]:
rx = r2
r2 = r1
r1 = rx
# get arrs
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
# fill arr1
# do they overlap?
overlap = False
if max(r1.blocks[0]) > min(r2.blocks[0]):
overlap = True
#osegment = r1.blocks[0][1] - r2.blocks[0][0]
#oseqs = r1.seq[-osegment:], r2.seq[:osegment]
#print(oseqs)
# modify for cigar
for edit in r1.cigar:
r1.seq
before = "-" * (r1.pos - reg[1])
if r1.is_reverse:
read = before + revcomp(r1.seq)
else:
read = before + r1.seq
midns = "-" * (r2.pos - r1.aend)
read += midns
if r2.is_reverse:
read += r2.seq
else:
read += revcomp(r2.seq)
after = "-" * (reg[2] - r2.aend)
read += after
return read
In [ ]:
In [350]:
def get_ref_region(reference, contig, rstart, rend):
cmd = [
ip.bins.samtools, 'faidx',
reference,
"{}:{}-{}".format(contig, rstart+1, rend)]
stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate()
name, seq = stdout.decode().split("\n", 1)
listseq = [name, seq.replace("\n", "")]
return listseq
In [250]:
ireg = samfile.fetch(*reg)
rdict = {}
for read in ireg:
if read.qname not in rdict:
rdict[read.qname] = [read]
else:
rdict[read.qname].append(read)
clust = dict_to_clust(rdict)
In [251]:
clust
Out[251]:
In [252]:
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
ref
Out[252]:
In [255]:
print(ref[1][:80])
print(r1.seq)
In [272]:
# how far ahead of the reference start does r1 begin
ahead = -1 * (reg[1] - (r1.pos - 1))
In [274]:
print(ref[1][:90])
print("-"*ahead, r1.seq)
In [344]:
print('r', ref[1][:90])
for key in rdict:
r1, r2 = rdict[key]
ahead = -1 * (reg[1] - (r1.pos))
print('h', ("-"*ahead + r1.seq)[:90])
#print(r1.cigar)
print('r', ref[1][-90:])
for key in rdict:
r1, r2 = rdict[key]
ahead = (reg[2] - (r2.pos))
print(reg[2], r2.pos, ahead)
print('h', ("-"*ahead))
#print(r1.cigar)
#print(r2.seq)
#print(r2.get_blocks())
#print(r1.seq)
#print(r1.get_blocks()[0])
#print(r1.aend, r2.get_blocks()[0][0])
#print(r1.cigar)
print(r2.cigarstring)
In [66]:
for read in clust:
print(read)
In [30]:
# access reads from bam file using pysam
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
# iterate over all mapped regions
clusters = []
for reg in regions:
ireg = samfile.fetch(*reg)
# match paired reads by read names for all reads in each cluster
rdict = {}
for read in ireg:
if read.qname not in rdict:
rdict[read.qname] = [read]
else:
rdict[read.qname].append(read)
clust = dict_to_clust(rdict)
In [31]:
rdict
Out[31]:
In [29]:
from ipyrad.assemble.util import revcomp
In [106]:
def build_ref_clusters_from_cigars(data, sample):
# get regions from bedtools overlaps
regions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in regions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
# access reads from bam file using pysam
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
# iterate over all mapped regions
clusters = []
for reg in regions:
ireg = samfile.fetch(*reg)
# match paired reads by read names for all reads in each cluster
rdict = {}
for read in ireg:
if read.qname not in rdict:
rdict[read.qname] = [read]
else:
rdict[read.qname].append(read)
return dict_to_clust(rdict)
In [109]:
seq = build_ref_clusters_from_cigars(data, sample)
seq
Out[109]:
In [27]:
def dict_to_clust(rdict):
clust = []
for pair in rdict.values():
r1, r2 = pair
poss = r1.get_reference_positions() + r2.get_reference_positions()
seedstart, seedend = min(poss), max(poss)
reads_overlap = False
#print(r1.cigartuples, r1.cigarstring, r1.cigar)
if r1.is_reverse:
if r2.aend > r1.get_blocks()[0][0]:
reads_overlap = True
seq = r2.seq + 'nnnn' + revcomp(r1.seq)
else:
                seq = r2.seq + 'nnnn' + r1.seq
else:
if r1.aend > r2.get_blocks()[0][0]:
reads_overlap = True
seq = r1.seq + 'nnnn' + revcomp(r2.seq)
else:
seq = r1.seq + 'nnnn' + r2.seq
clust.append(seq)
return clust
In [8]:
# build clusters for aligning with muscle from the sorted bam file
samfile = AlignmentFile(
os.path.join(data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
In [24]:
chrom, rstart, rend = regions[0].split()
reg = samfile.fetch(chrom, int(rstart), int(rend))
for read in reg:
print(rstart)
print(read.seq)
In [ ]:
In [9]:
nonmerged1 = os.path.join(
data.tmpdir,
"{}_nonmerged_R1_.fastq".format(sample.name))
nonmerged2 = os.path.join(
data.tmpdir,
"{}_nonmerged_R2_.fastq".format(sample.name))
edits1 = os.path.join(
data.dirs.edits,
"{}.trimmed_R1_.fq.gz".format(sample.name))
edits2 = os.path.join(
data.dirs.edits,
"{}.trimmed_R2_.fq.gz".format(sample.name))
# file precedence
nonm1 = [i for i in (edits1, nonmerged1) if os.path.exists(i)][-1]
nonm2 = [i for i in (edits2, nonmerged2) if os.path.exists(i)][-1]
In [10]:
edits1
Out[10]:
In [11]:
infiles = [
os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
In [15]:
split_endtoend_reads(data, sample)
In [11]:
with open(inp, 'r') as infile:
duo = izip(*[iter(infile)] * 2)
idx = 0
while 1:
try:
itera = next(duo)
except StopIteration:
break
r1, r2 = itera[1].split("nnnn")
In [14]:
def split_endtoend_reads(data, sample):
"""
    Takes R1nnnnR2 derep reads from paired data and splits them back into
separate R1 and R2 parts for read mapping.
"""
inp = os.path.join(data.tmpdir, "{}_derep.fastq".format(sample.name))
out1 = os.path.join(data.tmpdir, "{}_R1-tmp.fastq".format(sample.name))
out2 = os.path.join(data.tmpdir, "{}_R2-tmp.fastq".format(sample.name))
splitderep1 = open(out1, 'w')
splitderep2 = open(out2, 'w')
with open(inp, 'r') as infile:
# Read in the infile two lines at a time: (seqname, sequence)
duo = izip(*[iter(infile)] * 2)
## lists for storing results until ready to write
split1s = []
split2s = []
## iterate over input splitting, saving, and writing.
idx = 0
while 1:
try:
itera = next(duo)
except StopIteration:
break
## split the duo into separate parts and inc counter
part1, part2 = itera[1].split("nnnn")
idx += 1
## R1 needs a newline, but R2 inherits it from the original file
## store parts in lists until ready to write
split1s.append("{}{}\n".format(itera[0], part1))
split2s.append("{}{}".format(itera[0], part2))
## if large enough then write to file
if not idx % 10000:
splitderep1.write("".join(split1s))
splitderep2.write("".join(split2s))
split1s = []
split2s = []
## write final chunk if there is any
if any(split1s):
splitderep1.write("".join(split1s))
splitderep2.write("".join(split2s))
## close handles
splitderep1.close()
splitderep2.close()
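A one-record demonstration of the nnnn split performed above; note that R1 needs a newline appended while R2 keeps the one from the original line (record is made up).
In [ ]:
# made-up derep record showing the nnnn split above
itera = (">seed;size=4;*\n", "ACGTACGTnnnnTTTTGGGG\n")
part1, part2 = itera[1].split("nnnn")
r1_record = "{}{}\n".format(itera[0], part1)   # R1 gets its newline added
r2_record = "{}{}".format(itera[0], part2)     # R2 inherits the original one
print(r1_record + r2_record)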
In [10]:
merge_pairs(data, sample, 1, 1)
In [ ]:
In [8]:
s3.data.samples["1A_0"].concatfiles
Out[8]:
In [6]:
s3.run()
In [28]:
def bedtools_merge(data, sample):
"""
Get all contiguous genomic regions with one or more overlapping
reads. This is the shell command we'll eventually run
bedtools bamtobed -i 1A_0.sorted.bam | bedtools merge [-d 100]
-i <input_bam> : specifies the input file to bed'ize
-d <int> : For PE set max distance between reads
"""
LOGGER.info("Entering bedtools_merge: %s", sample.name)
mappedreads = os.path.join(data.dirs.refmapping,
sample.name + "-mapped-sorted.bam")
## command to call `bedtools bamtobed`, and pipe output to stdout
## Usage: bedtools bamtobed [OPTIONS] -i <bam>
## Usage: bedtools merge [OPTIONS] -i <bam>
cmd1 = [ip.bins.bedtools, "bamtobed", "-i", mappedreads]
cmd2 = [ip.bins.bedtools, "merge", "-i", "-"]
    ## For PE data, use the -d flag to tell bedtools how far apart mate pairs may be.
## If SE the -d flag is negative, specifying that SE reads need to
## overlap by at least a specific number of bp. This prevents the
## stairstep syndrome when a + and - read are both extending from
## the same cutsite. Passing a negative number to `merge -d` gets this done.
if 'pair' in data.paramsdict["datatype"]:
check_insert_size(data, sample)
#cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"]))
cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"]))
cmd2.insert(2, "-d")
else:
cmd2.insert(2, str(-1 * data._hackersonly["min_SE_refmap_overlap"]))
cmd2.insert(2, "-d")
## pipe output from bamtobed into merge
LOGGER.info("stdv: bedtools merge cmds: %s %s", cmd1, cmd2)
proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
result = proc2.communicate()[0].decode()
proc1.stdout.close()
## check for errors and do cleanup
if proc2.returncode:
raise IPyradWarningExit("error in %s: %s", cmd2, result)
## Write the bedfile out, because it's useful sometimes.
if os.path.exists(ip.__debugflag__):
with open(os.path.join(data.dirs.refmapping, sample.name + ".bed"), 'w') as outfile:
outfile.write(result)
## Report the number of regions we're returning
nregions = len(result.strip().split("\n"))
LOGGER.info("bedtools_merge: Got # regions: %s", nregions)
return result
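The text returned by bedtools_merge() is later parsed into (chrom, start, end) tuples; a quick sketch with made-up output.
In [ ]:
# made-up bedtools merge output parsed the same way as the cells above
result = "Contig0\t100\t350\nContig0\t900\t1200\n"
regions = [(i, int(j), int(k)) for (i, j, k) in
           (line.split("\t") for line in result.strip().split("\n"))]
print(regions)    # [('Contig0', 100, 350), ('Contig0', 900, 1200)]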
In [27]:
bedtools_merge
Out[27]:
In [29]:
def check_insert_size(data, sample):
"""
check mean insert size for this sample and update
hackersonly.max_inner_mate_distance if need be. This value controls how
far apart mate pairs can be to still be considered for bedtools merging
downstream.
"""
## pipe stats output to grep
cmd1 = [
ip.bins.samtools,
"stats",
os.path.join(
data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)),
]
cmd2 = ["grep", "SN"]
proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
res = proc2.communicate()[0].decode()
if proc2.returncode:
raise IPyradWarningExit("error in %s: %s", cmd2, res)
## starting vals
avg_insert = 0
stdv_insert = 0
avg_len = 0
## iterate over results
for line in res.split("\n"):
if "insert size average" in line:
avg_insert = float(line.split(":")[-1].strip())
elif "insert size standard deviation" in line:
## hack to fix sim data when stdv is 0.0. Shouldn't
## impact real data bcz stdv gets rounded up below
stdv_insert = float(line.split(":")[-1].strip()) + 0.1
elif "average length" in line:
avg_len = float(line.split(":")[-1].strip())
LOGGER.debug("avg {} stdv {} avg_len {}"
.format(avg_insert, stdv_insert, avg_len))
## If all values return successfully set the max inner mate distance.
## This is tricky. avg_insert is the average length of R1+R2+inner mate
## distance. avg_len is the average length of a read. If there are lots
## of reads that overlap then avg_insert will be close to but bigger than
## avg_len. We are looking for the right value for `bedtools merge -d`
## which wants to know the max distance between reads.
if all([avg_insert, stdv_insert, avg_len]):
## If 2 * the average length of a read is less than the average
## insert size then most reads DO NOT overlap
if stdv_insert < 5:
stdv_insert = 5.
if (2 * avg_len) < avg_insert:
hack = avg_insert + (3 * np.math.ceil(stdv_insert)) - (2 * avg_len)
## If it is > than the average insert size then most reads DO
## overlap, so we have to calculate inner mate distance a little
## differently.
else:
hack = (avg_insert - avg_len) + (3 * np.math.ceil(stdv_insert))
## set the hackerdict value
LOGGER.info("stdv: hacked insert size is %s", hack)
data._hackersonly["max_inner_mate_distance"] = int(np.math.ceil(hack))
else:
        ## If something failed, fall back to a relatively conservative distance
data._hackersonly["max_inner_mate_distance"] = 300
LOGGER.debug("inner mate distance for {} - {}".format(sample.name,\
data._hackersonly["max_inner_mate_distance"]))
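Worked numbers for the two branches of the insert-size calculation above (all values made up).
In [ ]:
# made-up samtools stats values to trace the two branches above
import numpy as np
avg_insert, stdv_insert, avg_len = 500.0, 20.1, 150.0
# 2 * 150 < 500: most read pairs do NOT overlap
hack = avg_insert + (3 * np.math.ceil(stdv_insert)) - (2 * avg_len)
print(int(np.math.ceil(hack)))    # 263
avg_insert, stdv_insert, avg_len = 250.0, 20.1, 150.0
# 2 * 150 >= 250: most read pairs DO overlap
hack = (avg_insert - avg_len) + (3 * np.math.ceil(stdv_insert))
print(int(np.math.ceil(hack)))    # 163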
In [33]:
#from ipyrad.assemble.refmap import *
data = s3.data
samples = list(data.samples.values())
sample = samples[0]
regions = bedtools_merge(data, sample).strip().split("\n")
In [34]:
nregions = len(regions)
chunksize = (nregions / 10) + (nregions % 10)
In [43]:
from pysam import AlignmentFile
sample.files.mapped_reads = os.path.join(
data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
samfile = AlignmentFile(sample.files.mapped_reads, 'rb')
#"./tortas_refmapping/PZ70-mapped-sorted.bam", "rb")
In [97]:
regions[1]
Out[97]:
In [170]:
samfile = pysam.AlignmentFile("ex1.sam", "r")
In [262]:
a = pysam.AlignmentFile("./3-refsetest_refmapping/1A_0.sam")
In [287]:
nthreads = 2
cmd1 = [
ip.bins.bwa, "mem",
"-t", str(max(1, nthreads)),
"-M",
data.paramsdict['reference_sequence'],
sample.concatfiles[0][0],
sample.concatfiles[0][1] or "",
]
cmd1 = [i for i in cmd1 if i]
# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
cmd1.insert(2, arg)
with open(sample.files.sam, 'wb') as outfile:
proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile)
error1 = proc1.communicate()[0]
if proc1.returncode:
raise IPyradError(error1)
In [288]:
def bwa_map(data, sample, nthreads, force):
"""
Map reads to reference sequence. This reads in the fasta files
(samples.files.edits), and maps each read to the reference. Unmapped reads
are dropped right back in the de novo pipeline.
Mapped reads end up in a sam file.
"""
sample.files.sam = os.path.join(
data.dirs.refmapping,
"{}.sam".format(sample.name))
sample.files.mapped_reads = os.path.join(
data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name))
sample.files.unmapped_bam = os.path.join(
data.dirs.refmapping,
"{}-unmapped.bam".format(sample.name))
sample.files.unmapped_reads = os.path.join(
data.dirs.refmapping,
"{}-unmapped.fastq".format(sample.name))
# (cmd1) bwa mem [OPTIONS] <index_name> <file_name_A> [<file_name_B>]
# -t # : Number of threads
# -M : Mark split alignments as secondary.
# (cmd2) samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]
# -b = write to .bam
# -q = Only keep reads with mapq score >= 30 (seems to be pretty standard)
# -F = Select all reads that DON'T have these flags.
# 0x4 (segment unmapped)
# 0x100 (Secondary alignment)
# 0x800 (supplementary alignment)
# -U = Write out all reads that don't pass the -F filter
# (all unmapped reads go to this file).
# (cmd3) samtools sort [options...] [in.bam]
# -T = Temporary file name, this is required by samtools, ignore it
# Here we hack it to be samhandle.tmp cuz samtools cleans it up
# -O = Output file format, in this case bam
# -o = Output file name
# (cmd5) samtools bam2fq -v 45 [in.bam]
    # -v 45 sets the default qscore arbitrarily high
#
cmd1 = [
ip.bins.bwa, "mem",
"-t", str(max(1, nthreads)),
"-M",
data.paramsdict['reference_sequence'],
sample.concatfiles[0][0],
sample.concatfiles[0][1] or "",
]
cmd1 = [i for i in cmd1 if i]
# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
cmd1.insert(2, arg)
with open(sample.files.sam, 'wb') as outfile:
proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile)
error1 = proc1.communicate()[0]
if proc1.returncode:
raise IPyradError(error1)
# sends unmapped reads to a files and will PIPE mapped reads to cmd3
cmd2 = [
ip.bins.samtools, "view",
"-b",
"-F", "0x904",
"-U", sample.files.unmapped_bam,
sample.files.sam,
]
# this is gonna catch mapped bam output from cmd2 and write to file
cmd3 = [
ip.bins.samtools, "sort",
"-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
"-O", "bam",
"-o", sample.files.mapped_reads]
# Later we're gonna use samtools to grab out regions using 'view'
    cmd4 = [ip.bins.samtools, "index", sample.files.mapped_reads]
# convert unmapped reads to fastq
cmd5 = [
ip.bins.samtools, "bam2fq",
"-v 45",
sample.files.unmapped_bam,
]
# Insert additional arguments for paired data to the commands.
# We assume Illumina paired end reads for the orientation
# of mate pairs (orientation: ---> <----).
if 'pair' in data.paramsdict["datatype"]:
# add samtools filter for only keep if both pairs hit
# 0x1 - Read is paired
# 0x2 - Each read properly aligned
cmd2.insert(2, "0x3")
cmd2.insert(2, "-f")
# tell bam2fq that there are output files for each read pair
cmd5.insert(2, os.path.join(
data.dirs.edits, sample.name + "-tmp-umap1.fastq"))
cmd5.insert(2, "-1")
cmd5.insert(2, os.path.join(
data.dirs.edits, sample.name + "-tmp-umap2.fastq"))
cmd5.insert(2, "-2")
else:
cmd5.insert(2, sample.files.unmapped_reads)
cmd5.insert(2, "-0")
# cmd2 writes to sname.unmapped.bam and fills pipe with mapped BAM data
LOGGER.debug(" ".join(cmd2))
proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
# cmd3 pulls mapped BAM from pipe and writes to sname.mapped-sorted.bam
LOGGER.debug(" ".join(cmd3))
proc3 = sps.Popen(cmd3,
stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
error3 = proc3.communicate()[0]
if proc3.returncode:
raise IPyradWarningExit(error3)
proc2.stdout.close()
# cmd4 indexes the bam file
LOGGER.debug(" ".join(cmd4))
proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
error4 = proc4.communicate()[0]
if proc4.returncode:
raise IPyradWarningExit(error4)
# Running cmd5 writes to either edits/sname-refmap_derep.fastq for SE
# or it makes edits/sname-tmp-umap{12}.fastq for paired data, which
# will then need to be merged.
LOGGER.debug(" ".join(cmd5))
proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
error5 = proc5.communicate()[0]
if proc5.returncode:
raise IPyradWarningExit(error5)
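The -F 0x904 filter used above is simply the union of the three excluded SAM flags.
In [ ]:
# the samtools -F mask above: unmapped | secondary | supplementary
print(hex(0x4 | 0x100 | 0x800))    # 0x904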
In [289]:
bwa_map(data, sample, 2, 1)
In [ ]:
In [205]:
cmd1 = [
ip.bins.bwa, "mem",
"-t", str(max(1, 2)),
"-M",
data.paramsdict['reference_sequence'],
sample.concatfiles[0][0],
sample.concatfiles[0][1] or "",
]
proc1 = sps.Popen(cmd1)#, stderr=cmd1_stderr, stdout=cmd1_stdout)
proc1.communicate()
Out[205]:
In [ ]:
pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam")
In [182]:
iterreg = samfile.fetch("MT", 5009, 5100)
iterreg = samfile.fetch("MT", 10109, 10200)
iterreg = samfile.fetch("MT", 285510, 285600)
In [181]:
for read in iterreg:
print(read.seq)
print(read.compare(read.get_reference_sequence()))
In [104]:
it = samfile.pileup("MT", 5009, 5100)
#pysam.Pileup
Out[104]:
In [102]:
for read in iterreg:
print(read.aligned_pairs)
In [87]:
rdict = {}
for read in iterreg:
if read.qname not in rdict:
rdict[read.qname] = read
In [80]:
read.seq
Out[80]:
In [84]:
read.cigar
Out[84]:
In [77]:
sorted(
    rdict.keys(),
    key=lambda x: int(x.split("=")[-1]), reverse=True)
Out[77]:
In [65]:
it = samfile.pileup('MT', 5009, 5100)
In [71]:
clusts = []
nclusts = 0
for region in regions:
chrom, pos1, pos2 = region.split()
clust = fetch_cluster_se(data, samfile, chrom, int(pos1), int(pos2))
In [ ]:
def fetch_cluster_se(data, samfile, chrom, rstart, rend):
"""
Builds a single end cluster from the refmapped data.
"""
## If SE then we enforce the minimum overlap distance to avoid the
## staircase syndrome of multiple reads overlapping just a little.
overlap_buffer = data._hackersonly["min_SE_refmap_overlap"]
## the *_buff variables here are because we have to play patty
## cake here with the rstart/rend vals because we want pysam to
## enforce the buffer for SE, but we want the reference sequence
## start and end positions to print correctly for downstream.
rstart_buff = rstart + overlap_buffer
rend_buff = rend - overlap_buffer
    ## Reads that map to only very short segments of the reference
## sequence will return buffer end values that are before the
## start values causing pysam to complain. Very short mappings.
if rstart_buff > rend_buff:
tmp = rstart_buff
rstart_buff = rend_buff
rend_buff = tmp
## Buffering can't make start and end equal or pysam returns nothing.
if rstart_buff == rend_buff:
rend_buff += 1
## store pairs
rdict = {}
clust = []
iterreg = []
iterreg = samfile.fetch(chrom, rstart_buff, rend_buff)
## use dict to match up read pairs
for read in iterreg:
if read.qname not in rdict:
rdict[read.qname] = read
## sort dict keys so highest derep is first ('seed')
sfunc = lambda x: int(x.split(";size=")[1].split(";")[0])
rkeys = sorted(rdict.keys(), key=sfunc, reverse=True)
## get blocks from the seed for filtering, bail out if seed is not paired
try:
read1 = rdict[rkeys[0]]
except ValueError:
LOGGER.error("Found bad cluster, skipping - key:{} rdict:{}".format(rkeys[0], rdict))
return ""
## the starting blocks for the seed
poss = read1.get_reference_positions(full_length=True)
seed_r1start = min(poss)
seed_r1end = max(poss)
## store the seed -------------------------------------------
if read1.is_reverse:
seq = revcomp(read1.seq)
else:
seq = read1.seq
## store, could write orient but just + for now.
size = sfunc(rkeys[0])
clust.append(">{}:{}:{};size={};*\n{}"\
.format(chrom, seed_r1start, seed_r1end, size, seq))
## If there's only one hit in this region then rkeys will only have
## one element and the call to `rkeys[1:]` will raise. Test for this.
if len(rkeys) > 1:
## store the hits to the seed -------------------------------
for key in rkeys[1:]:
skip = False
try:
read1 = rdict[key]
except ValueError:
## enter values that will make this read get skipped
read1 = rdict[key][0]
skip = True
## orient reads only if not skipping
if not skip:
poss = read1.get_reference_positions(full_length=True)
minpos = min(poss)
maxpos = max(poss)
## store the seq
if read1.is_reverse:
seq = revcomp(read1.seq)
else:
seq = read1.seq
## store, could write orient but just + for now.
size = sfunc(key)
clust.append(">{}:{}:{};size={};+\n{}"\
.format(chrom, minpos, maxpos, size, seq))
else:
## seq is excluded, though, we could save it and return
## it as a separate cluster that will be aligned separately.
pass
return clust
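Toy numbers for the SE buffering above, showing how a region shorter than twice the overlap buffer gets its start and end swapped (values made up).
In [ ]:
# made-up values tracing the SE buffer adjustment above
overlap_buffer = 30                       # min_SE_refmap_overlap
rstart, rend = 1000, 1040                 # a short mapped region
rstart_buff = rstart + overlap_buffer     # 1030
rend_buff = rend - overlap_buffer         # 1010 -> start is now past end
if rstart_buff > rend_buff:
    rstart_buff, rend_buff = rend_buff, rstart_buff
if rstart_buff == rend_buff:
    rend_buff += 1
print(rstart_buff, rend_buff)             # 1010 1030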
In [62]:
cmd1 = [
ip.bins.bwa, "mem",
"-t", str(max(1, nthreads)),
"-M",
data.paramsdict['reference_sequence'],
]
cmd1 += [i for i in sample.concatfiles[0] if i]
# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
cmd1.insert(2, arg)
cmd1_stdout_handle = os.path.join(
data.dirs.refmapping, sample.name + ".sam")
cmd1_stdout = open(cmd1_stdout_handle, 'w')
cmd1_stderr = None
proc1 = sps.Popen(cmd1, stderr=cmd1_stderr, stdout=cmd1_stdout)
In [63]:
error1 = proc1.communicate()[0]
In [65]:
cmd2 = [
ip.bins.samtools, "view",
"-b",
#"-q", "30",
"-F", "0x904",
"-U", os.path.join(
data.dirs.refmapping, sample.name + "-unmapped.bam"),
os.path.join(data.dirs.refmapping, sample.name + ".sam")]
In [69]:
proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
#proc2.communicate()
In [74]:
proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
proc3.communicate()
Out[74]:
In [77]:
proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
proc4.communicate()
Out[77]:
In [81]:
sample.files.unmapped_reads = os.path.join(
data.dirs.refmapping,
"{}-unmapped.fastq".format(sample.name))
cmd4 = [ip.bins.samtools, "index", sample.files.mapped_reads]
# this is gonna read in the unmapped files, args are added below,
# and it will output fastq formatted unmapped reads for merging.
# -v 45 sets the default qscore arbitrarily high
cmd5 = [
ip.bins.samtools, "bam2fq",
"-v 45",
os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")]
# Insert additional arguments for paired data to the commands.
# We assume Illumina paired end reads for the orientation
# of mate pairs (orientation: ---> <----).
if 'pair' in data.paramsdict["datatype"]:
# add samtools filter for only keep if both pairs hit
# 0x1 - Read is paired
# 0x2 - Each read properly aligned
cmd2.insert(2, "0x3")
cmd2.insert(2, "-f")
# tell bam2fq that there are output files for each read pair
cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap1.fastq"))
cmd5.insert(2, "-1")
cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap2.fastq"))
cmd5.insert(2, "-2")
else:
cmd5.insert(2, sample.files.unmapped_reads)
cmd5.insert(2, "-0")
In [82]:
cmd5
Out[82]:
In [83]:
proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
error5 = proc5.communicate()[0]
In [73]:
sample.files.mapped_reads = os.path.join(
data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name))
# this is gonna catch mapped bam output from cmd2 and write to file
cmd3 = [
ip.bins.samtools, "sort",
"-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
"-O", "bam",
"-o", sample.files.mapped_reads]
cmd3
Out[73]:
In [53]:
data.dirs.refmapping
Out[53]:
In [17]:
nthreads = 2
cmd1 = [
ip.bins.bwa, "mem",
"-t", str(max(1, nthreads)),
"-M",
data.paramsdict['reference_sequence']
]
cmd1
#cmd1 += sample.files.dereps
Out[17]:
In [ ]:
In [4]:
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
sample = s3.samples[0]
s3.run()
In [7]:
#derep_sort_map(s3.data, sample, s3.force, s3.nthreads)
sample.concatfiles = concat_multiple_edits(s3.data, sample)
In [13]:
sample.mergedfile = merge_pairs(s3.data, sample, 1, 1)
sample.mergedfile
Out[13]:
In [12]:
new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ "_derep.fastq"), s3.nthreads)
In [ ]:
strand = "plus"
if s3.data.paramsdict["datatype"] in ('gbs', '2brad'):
strand = "both"
cmd = [
ip.bins.vsearch,
"--derep_fulllength", sample.mergedfile,
"--strand", strand,
"--output", outfile,
"--threads", str(nthreads),
"--fasta_width", str(0),
"--fastq_qmax", "1000",
"--sizeout",
"--relabel_md5",
]
In [14]:
s3.data.run("4")
In [14]:
s3.remote_run_dereps()
#s3.remote_run_cluster_map_build()
In [15]:
for sample in s3.samples:
cluster(s3.data, sample, s3.nthreads, s3.force)
In [8]:
for sample in s3.samples:
build_clusters(s3.data, sample, s3.maxindels)
In [26]:
for sample in s3.samples:
muscle_chunker(s3.data, sample)
In [33]:
aasyncs = {}
for sample in s3.samples:
aasyncs[sample.name] = []
for idx in range(10):
handle = os.path.join(s3.data.tmpdir,
"{}_chunk_{}.ali".format(sample.name, idx))
align_and_parse(handle, s3.maxindels, s3.gbs)
In [32]:
def _get_derep_num(read):
"return the number of replicates in a derep read"
return int(read.split("=")[-1].split("\n")[0][:-1])
def _aligned_indel_filter(clust, max_internal_indels):
""" checks for too many internal indels in muscle aligned clusters """
## make into list
lclust = clust.split()
## paired or not
try:
seq1 = [i.split("nnnn")[0] for i in lclust[1::2]]
seq2 = [i.split("nnnn")[1] for i in lclust[1::2]]
intindels1 = [i.rstrip("-").lstrip("-").count("-") for i in seq1]
intindels2 = [i.rstrip("-").lstrip("-").count("-") for i in seq2]
intindels = intindels1 + intindels2
if max(intindels) > max_internal_indels:
return 1
except IndexError:
seq1 = lclust[1::2]
intindels = [i.rstrip("-").lstrip("-").count("-") for i in seq1]
if max(intindels) > max_internal_indels:
return 1
return 0
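A quick usage check of _aligned_indel_filter on tiny made-up paired clusters: internal (non-terminal) dashes above the threshold flag the cluster.
In [ ]:
# made-up paired clusters: 0 = keep, 1 = too many internal indels
good = ">a;size=3;*\nAAA--TTTnnnnGGGCCC\n>b;size=1;+\nAAATTTTTnnnnGGGCCC"
bad = ">a;size=3;*\nA--A--T--TnnnnGGGCCC\n>b;size=1;+\nAAATTTTTTTnnnnGGGCCC"
print(_aligned_indel_filter(good, 5))    # 0
print(_aligned_indel_filter(bad, 5))     # 1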
In [34]:
for sample in s3.samples:
reconcat(s3.data, sample)
In [27]:
def align_and_parse(handle, max_internal_indels=5, is_gbs=False):
""" much faster implementation for aligning chunks """
## data are already chunked, read in the whole thing. bail if no data.
try:
with open(handle, 'rb') as infile:
clusts = infile.read().decode().split("//\n//\n")
# remove any empty spots
clusts = [i for i in clusts if i]
# Skip entirely empty chunks
if not clusts:
raise IPyradError("no clusters in file: {}".format(handle))
except (IOError, IPyradError):
LOGGER.debug("skipping empty chunk - {}".format(handle))
return 0
## count discarded clusters for printing to stats later
highindels = 0
## iterate over clusters sending each to muscle, splits and aligns pairs
aligned = _persistent_popen_align3(clusts, 200, is_gbs)
## store good alignments to be written to file
refined = []
## filter and trim alignments
for clust in aligned:
# check for too many internal indels
if not _aligned_indel_filter(clust, max_internal_indels):
refined.append(clust)
else:
highindels += 1
## write to file after
if refined:
outhandle = handle.rsplit(".", 1)[0] + ".aligned"
with open(outhandle, 'wb') as outfile:
outfile.write(str.encode("\n//\n//\n".join(refined) + "\n"))
## remove the old tmp file
if not LOGGER.getEffectiveLevel() == 10:
os.remove(handle)
return highindels
In [28]:
def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False):
""" keeps a persistent bash shell open and feeds it muscle alignments """
## create a separate shell for running muscle in, this is much faster
## than spawning a separate subprocess for each muscle call
proc = sps.Popen(
["bash"],
stdin=sps.PIPE,
stdout=sps.PIPE,
bufsize=0,
)
## iterate over clusters in this file until finished
aligned = []
for clust in clusts:
## new alignment string for read1s and read2s
align1 = []
align2 = []
## don't bother aligning if only one seq
if clust.count(">") == 1:
aligned.append(clust.replace(">", "").strip())
else:
# do we need to split the alignment? (is there a PE insert?)
try:
# make into list (only read maxseqs lines, 2X cuz names)
lclust = clust.split()[:maxseqs * 2]
# try to split cluster list at nnnn separator for each read
lclust1 = list(chain(*zip(
lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
lclust2 = list(chain(*zip(
lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))
# put back into strings
clust1 = "\n".join(lclust1)
clust2 = "\n".join(lclust2)
# Align the first reads.
# The muscle command with alignment as stdin and // as split
cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(clust1, ip.bins.muscle, "//\n"))
# send cmd1 to the bash shell
proc.stdin.write(cmd1.encode())
# read the stdout by line until splitter is reached
# meaning that the alignment is finished.
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
# Align the second reads.
# The muscle command with alignment as stdin and // as split
cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(clust2, ip.bins.muscle, "//\n"))
# send cmd2 to the bash shell
proc.stdin.write(cmd2.encode())
# read the stdout by line until splitter is reached
# meaning that the alignment is finished.
for line in iter(proc.stdout.readline, b'//\n'):
align2.append(line.decode())
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])
# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
keys, key=_get_derep_num, reverse=True)
# combine in order
alignpe = []
for key in order:
alignpe.append("\n".join([
key,
dalign1[key].replace("\n", "") + "nnnn" + \
dalign2[key].replace("\n", "")]))
## append aligned cluster string
aligned.append("\n".join(alignpe).strip())
# Malformed clust. Dictionary creation with only 1 element
except ValueError as inst:
ip.logger.debug(
"Bad PE cluster - {}\nla1 - {}\nla2 - {}"
.format(clust, lines1, lines2))
## Either reads are SE, or at least some pairs are merged.
except IndexError:
# limit the number of input seqs
# use lclust already built before checking pairs
lclust = "\n".join(clust.split()[:maxseqs * 2])
# the muscle command with alignment as stdin and // as splitter
cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(lclust, ip.bins.muscle, "//\n"))
## send cmd to the bash shell (TODO: PIPE could overflow here!)
proc.stdin.write(cmd.encode())
## read the stdout by line until // is reached. This BLOCKS.
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
## remove '>' from names, and '\n' from inside long seqs
lines = "".join(align1)[1:].split("\n>")
## find seed of the cluster and put it on top.
seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
lines.pop(lines.index(seed))
lines = [seed] + sorted(
lines, key=_get_derep_num, reverse=True)
## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
for j in i[1:]]) for i in aa]
# trim edges in sloppy gbs/ezrad data.
# Maybe relevant to other types too...
if is_gbs:
align1 = _gbs_trim(align1)
## append to aligned
aligned.append("\n".join(align1))
# cleanup
proc.stdout.close()
if proc.stderr:
proc.stderr.close()
proc.stdin.close()
proc.wait()
## return the aligned clusters
return aligned
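The function above leans on one trick: a single long-lived bash process is fed "command ; echo //" strings and its stdout is read until the "//" sentinel, so muscle is not re-spawned for every cluster. A standalone sketch of just that pattern, with cat standing in for muscle so it runs without ipyrad or muscle installed:
In [ ]:
import subprocess as sps

# one persistent bash shell; each job is a command followed by an echoed sentinel
proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)
for fasta in (">a\nACGT", ">b\nTTTT"):
    # cat stands in for "muscle -quiet -in -" here
    cmd = "echo -e '{}' | cat - ; echo {}".format(fasta, "//\n")
    proc.stdin.write(cmd.encode())
    out = []
    # readline returns bytes, so the sentinel must be bytes too
    for line in iter(proc.stdout.readline, b'//\n'):
        out.append(line.decode())
    print("".join(out).strip())
proc.stdin.close()
proc.stdout.close()
proc.wait()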
In [35]:
for sample in s3.samples:
ip.assemble.clustmap._sample_cleanup(s3.data, sample)
In [36]:
ip.assemble.clustmap._get_quick_depths(s3.data, sample)
In [37]:
if sample.files.get('clusters'):
pass
In [43]:
fclust = data.samples[sample.name].files.clusters
clusters = gzip.open(fclust, 'rt')
pairdealer = izip(*[iter(clusters)] * 2)
## storage
depths = []
maxlen = []
## start with cluster 0
tdepth = 0
tlen = 0
## iterate until empty
while 1:
## grab next
try:
name, seq = next(pairdealer)
except StopIteration:
break
## matching "//" separator lines mark the end of a cluster
#print name.strip(), seq.strip()
#print(name)
if name.strip() == seq.strip():
depths.append(tdepth)
maxlen.append(tlen)
tlen = 0
tdepth = 0
else:
tdepth += int(name.strip().split("=")[-1][:-1])
tlen = len(seq)
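The depth tally above pulls the cluster depth out of the vsearch --sizeout annotation on each name line; a toy parse of that format (hypothetical name string, with the trailing seed/orientation mark stripped by the [:-1]):
In [ ]:
# hypothetical clust-file name line: md5 label, size tag, trailing "*" (seed)
name = ">a1b2c3d4;size=12*"
print(int(name.strip().split("=")[-1][:-1]))   # 12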
In [12]:
sample.name
Out[12]:
In [ ]:
In [6]:
s3.remote_run_dereps()
In [7]:
s3.remote_run_cluster_map_build()
In [8]:
s3.remote_run_align_cleanup()
In [90]:
handle = "pairtest-tmpalign/1A_0_chunk_0.ali"
with open(handle, 'rb') as infile:
clusts = infile.read().decode().split("//\n//\n")
# remove any empty spots
clusts = [i for i in clusts if i]
# Skip entirely empty chunks
if not clusts:
raise IPyradError("no clusters in file: {}".format(handle))
In [91]:
maxseqs = 200
is_gbs = False
In [ ]:
proc = sps.Popen(
["bash"],
stdin=sps.PIPE,
stdout=sps.PIPE,
bufsize=0,
)
## iterate over clusters in this file until finished
aligned = []
for clust in clusts:
## new alignment string for read1s and read2s
align1 = []
align2 = []
## don't bother aligning if only one seq
if clust.count(">") == 1:
aligned.append(clust.replace(">", "").strip())
else:
# do we need to split the alignment? (is there a PE insert?)
try:
# make into list (only read maxseqs lines, 2X cuz names)
lclust = clust.split()[:maxseqs * 2]
# try to split cluster list at nnnn separator for each read
lclust1 = list(chain(*zip(
lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
lclust2 = list(chain(*zip(
lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))
# put back into strings
clust1 = "\n".join(lclust1)
clust2 = "\n".join(lclust2)
# Align the first reads.
# The muscle command with alignment as stdin and // as split
cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(clust1, ip.bins.muscle, "//\n"))
# send cmd1 to the bash shell
proc.stdin.write(cmd1.encode())
# read the stdout by line until splitter is reached
# meaning that the alignment is finished.
# readline returns bytes, so the sentinel must be bytes and each line decoded
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
# Align the second reads.
# The muscle command with alignment as stdin and // as split
cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(clust2, ip.bins.muscle, "//\n"))
# send cmd2 to the bash shell
proc.stdin.write(cmd2.encode())
# read the stdout by line until splitter is reached
# meaning that the alignment is finished.
for line in iter(proc.stdout.readline, b'//\n'):
align2.append(line.decode())
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])
# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
keys, key=_get_derep_num, reverse=True)
# combine in order
alignpe = []
for key in order:
alignpe.append("\n".join([
key,
dalign1[key].replace("\n", "") + "nnnn" + \
dalign2[key].replace("\n", "")]))
## append aligned cluster string
aligned.append("\n".join(alignpe).strip())
# Malformed clust. Dictionary creation with only 1 element
except ValueError as inst:
ip.logger.debug(
"Bad PE cluster - {}\nla1 - {}\nla2 - {}"
.format(clust, lines1, lines2))
## Either reads are SE, or at least some pairs are merged.
except IndexError:
# limit the number of input seqs
# use lclust already built before checking pairs
lclust = "\n".join(clust.split()[:maxseqs * 2])
# the muscle command with alignment as stdin and // as splitter
cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(lclust, ip.bins.muscle, "//\n"))
## send cmd to the bash shell (TODO: PIPE could overflow here!)
proc.stdin.write(cmd.encode())
## read the stdout by line until // is reached. This BLOCKS.
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
## remove '>' from names, and '\n' from inside long seqs
lines = "".join(align1)[1:].split("\n>")
## find seed of the cluster and put it on top.
seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
lines.pop(lines.index(seed))
lines = [seed] + sorted(
lines, key=_get_derep_num, reverse=True)
## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
for j in i[1:]]) for i in aa]
# trim edges in sloppy gbs/ezrad data.
# Maybe relevant to other types too...
if is_gbs:
align1 = _gbs_trim(align1)
## append to aligned
aligned.append("\n".join(align1))
# cleanup
proc.stdout.close()
if proc.stderr:
proc.stderr.close()
proc.stdin.close()
proc.wait()
## return the aligned clusters
#return aligned
In [ ]:
In [37]:
from ipyrad.assemble.clustmap import _get_derep_num
In [80]:
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])
# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
keys, key=_get_derep_num, reverse=True)
In [82]:
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])
# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
keys, key=_get_derep_num, reverse=True)
# combine in order
alignpe = []
for key in order:
alignpe.append("\n".join([
key,
dalign1[key].replace("\n", "") + "nnnn" + \
dalign2[key].replace("\n", "")]))
## append aligned cluster string
aligned.append("\n".join(alignpe).strip())
In [83]:
aligned
Out[83]:
In [57]:
## remove '>' from names, and '\n' from inside long seqs
lines1 = "".join(align1)[1:].split("\n>")
seed = [i for i in lines1 if i.split(";")[-1][0] == "*"][0]
lines1.pop(lines1.index(seed))
lines1 = [seed] + sorted(
lines1, key=_get_derep_num, reverse=True)
dalign1 = dict([i.split("\n", 1) for i in lines1])
lines2 = "".join(align2)[1:].split("\n>")
dalign2 = dict([i.split("\n", 1) for i in lines2])
#seed = [i for i in lines2 if i.split(";")[-1][0] == "*"][0]
#lines2.pop(lines2.index(seed))
In [24]:
## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
for j in i[1:]]) for i in aa]
In [25]:
align1
Out[25]:
In [16]:
derephandle = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
temphandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
In [19]:
derepfile = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
sample.files.clusters = os.path.join(
data.dirs.clusts, sample.name + ".clust.gz")
clustsout = gzip.open(sample.files.clusters, 'wt')
In [20]:
cmd = ["sort", "-k", "2", uhandle, "-o", usort]
proc = sps.Popen(cmd, close_fds=True)
proc.communicate()[0]
In [21]:
alldereps = {}
with open(derepfile, 'rt') as ioderep:
dereps = izip(*[iter(ioderep)] * 2)
for namestr, seq in dereps:
nnn, sss = [i.strip() for i in (namestr, seq)]
alldereps[nnn[1:]] = sss
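The izip(*[iter(ioderep)] * 2) idiom above (also used for the depth loop and the .htemp reader) walks a file two lines at a time so each name line is paired with its sequence line; a small standalone illustration with a toy list:
In [ ]:
# pairing consecutive items from one iterator: [a, b, c, d] -> (a, b), (c, d)
lines = [">seq1;size=3", "ACGT", ">seq2;size=1", "TTTT"]
it = iter(lines)
for name, seq in zip(it, it):      # equivalent to izip(*[iter(lines)] * 2)
    print(name[1:], seq)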
In [25]:
seedsseen = set()
maxindels = 8
## Iterate through the usort file grabbing matches to build clusters
with open(usort, 'rt') as insort:
## iterator, seed null, seqlist null
isort = iter(insort)
lastseed = 0
fseqs = []
seqlist = []
seqsize = 0
while 1:
## grab the next line
try:
hit, seed, _, ind, ori, _ = next(isort).strip().split()
except StopIteration:
break
## new seed: finish the previous cluster before starting this one
if seed != lastseed:
seedsseen.add(seed)
## store the last cluster (fseq), count it, and clear fseq
if fseqs:
## sort fseqs by derep after pulling out the seed
fseqs = [fseqs[0]] + sorted(fseqs[1:],
key=lambda x:
int(x.split(";size=")[1].split(";")[0]), reverse=True)
seqlist.append("\n".join(fseqs))
seqsize += 1
fseqs = []
# occasionally write/dump stored clusters to file and clear mem
if not seqsize % 10000:
if seqlist:
clustsout.write(
"\n//\n//\n".join(seqlist) + "\n//\n//\n")
## reset list and counter
seqlist = []
## store the new seed on top of fseq list
fseqs.append(">{}*\n{}".format(seed, alldereps[seed]))
lastseed = seed
## add match to the seed
## revcomp if orientation is reversed (comp preserves nnnn)
if ori == "-":
seq = comp(alldereps[hit])[::-1]
else:
seq = alldereps[hit]
## only save if not too many indels
if int(ind) <= maxindels:
fseqs.append(">{}{}\n{}".format(hit, ori, seq))
else:
ip.logger.info("filtered by maxindels: %s %s", ind, seq)
## write whatever is left over to the clusts file
if fseqs:
seqlist.append("\n".join(fseqs))
if seqlist:
clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")
## now write the seeds that had no hits. Make dict from htemp
with open(hhandle, 'rt') as iotemp:
nohits = izip(*[iter(iotemp)] * 2)
seqlist = []
seqsize = 0
while 1:
try:
nnn, _ = [i.strip() for i in next(nohits)]
except StopIteration:
break
## occasionally write to file
if not seqsize % 10000:
if seqlist:
clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")
## reset list and counter
seqlist = []
## append to list if new seed
if nnn[1:] not in seedsseen:
seqlist.append("{}*\n{}".format(nnn, alldereps[nnn[1:]]))
seqsize += 1
## write whatever is left over to the clusts file
if seqlist:
clustsout.write("\n//\n//\n".join(seqlist))
## close the file handle
clustsout.close()
del alldereps
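For reference, the unpack at the top of the loop above expects six whitespace-separated columns per .utemp line; from the way they are used here they are hit name, seed name, an unused field, indel count, orientation, and one more unused field. A toy parse under that assumption (the line itself is made up):
In [ ]:
# hypothetical .utemp line: hit, seed, <unused>, indels, orientation, <unused>
line = "a1b2;size=2 c3d4;size=9 99.0 1 + 0"
hit, seed, _, ind, ori, _ = line.strip().split()
keep = int(ind) <= 8        # maxindels filter from above
revcomp = (ori == "-")      # hit is stored reverse-complemented if "-"
print(hit, seed, keep, revcomp)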
In [28]:
seedsseen
Out[28]:
In [6]:
sample.concatfiles = concat_multiple_edits(data, sample)
In [7]:
sample.mergedfile = merge_pairs(data, sample, 1, 1)
In [10]:
new_derep_and_sort(
data,
sample.mergedfile,
os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
2)
In [6]:
data = data
sample = sample
revcomp = 1
vsearch_merge = 1
sample.concatfiles = concat_multiple_edits(data, sample)
sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")
In [8]:
sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")
if 'pair' in data.paramsdict['datatype']:
if "reference" not in data.paramsdict["assembly_method"]:
nmerged = ip.assemble.clustmap._merge_pairs(data, sample, 1, 1)
else:
nmerged = 0 # _merge_pairs(data, sample, 0, 0)
sample.stats.reads_merged = nmerged
In [ ]:
new_derep_and_sort(data, sample.mergedfile, os.path.join(data.tmpdir, sample.name + "_derep.fastq"), 2)
In [10]:
def new_derep_and_sort(data, infile, outfile, nthreads):
"""
Dereplicates reads and sorts them so that highly replicated reads are at
the top and singletons at the bottom, then writes the output to the derep
file. Paired reads are dereplicated as one concatenated read and split
again later. Takes infile and outfile args to support the double
dereplication needed for 3rad (5/29/15 iao).
"""
## datatypes options
strand = "plus"
if data.paramsdict["datatype"] is ('gbs' or '2brad'):
strand = "both"
## do dereplication with vsearch
cmd = [
ip.bins.vsearch,
"--derep_fulllength", infile,
"--strand", strand,
"--output", outfile,
"--threads", str(nthreads),
"--fasta_width", str(0),
"--fastq_qmax", "1000",
"--sizeout",
"--relabel_md5",
]
ip.logger.info("derep cmd %s", cmd)
## build PIPEd job
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
ip.logger.error("error inside derep_and_sort %s", errmsg)
raise IPyradWarningExit(errmsg)
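For orientation, with hypothetical paths and thread count the argument list above amounts to a shell command along these lines (just a join of the same list, not captured output):
In [ ]:
# hypothetical paths; ip.bins.vsearch shown here simply as "vsearch"
print(" ".join([
    "vsearch",
    "--derep_fulllength", "tmpdir/1A_0_merged_.fastq",
    "--strand", "plus",
    "--output", "tmpdir/1A_0_derep.fastq",
    "--threads", "2",
    "--fasta_width", "0",
    "--fastq_qmax", "1000",
    "--sizeout",
    "--relabel_md5",
]))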
In [ ]:
new_derep_and_sort(data, sample.mergedfile, os.path.join(data.tmpdir, sample.name + "_derep.fastq"), 2)
In [43]:
## CONCAT FILES FOR MERGED ASSEMBLIES
mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")
sample.files.edits = concat_multiple_edits(data, sample)
In [41]:
sample.files.edits = [(mergefile, )]
nmerged = ip.assemble.clustmap._merge_pairs(
data, sample.files.edits, mergefile, 1, 1)
sample.stats.reads_merged = nmerged
In [33]:
mergefile
Out[33]:
In [44]:
sample.files.edits
Out[44]:
In [5]:
s3.run()
In [5]:
# the muscle-align jobs return counts of clusters filtered for bad alignments
ip.assemble.cluster_within.sample_cleanup(data, samples[0])
In [6]:
data._build_stat("s3")
Out[6]:
In [7]:
raise IPyradError("hi")
In [4]:
self = data
derep_sort_map(s3.data, samples[0], s3.force)
In [11]:
ra.exception()
Out[11]:
In [ ]:
data.get_params()