In [1]:
import ipyrad as ip
from ipyrad.assemble.clustmap import *
from ipyrad.assemble.cluster_across import *
import ipyparallel as ipp
In [2]:
data = ip.Assembly("5-cyatho")
data.set_params("project_dir", "5-cyatho")
data.set_params("sorted_fastq_path", "./example_empirical_rad/*.gz")
data.set_params("filter_adapters", 2)
data.set_params("clust_threshold", .90)
In [17]:
data.run("12")
In [2]:
from ipyrad.assemble.clustmap import *
ipyclient = ipp.Client()
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]
In [19]:
s3.run()
In [21]:
data.run("45")
In [3]:
data = ip.load_json("5-cyatho/5-cyatho.json")
samples = list(data.samples.values())
ipyclient = ipp.Client()
In [4]:
from ipyrad.assemble.cluster_across import *
popmins = {'o': 0, 'c': 3, 'r': 3}
popdict = {
'o': ['33588_przewalskii', '32082_przewalskii'],
'c': ['29154_superba', '30686_cyathophylla', '41478_cyathophylloides', '41954_cyathophylloides'],
'r': [ '35236_rex', '35855_rex', '38362_rex', '39618_rex', '40578_rex', '30556_thamno', '33413_thamno'],
}
data._link_populations(popdict, popmins)
data.populations
Out[4]:
In [5]:
s6 = Step6(data, samples, ipyclient, 0, 0)
In [7]:
s6.remote_build_concats_tier1()
In [8]:
s6.remote_cluster1()
In [9]:
s6.remote_build_concats_tier2()
In [10]:
s6.remote_cluster2()
In [17]:
s6.remote_build_denovo_clusters()
In [13]:
s6.remote_align_denovo_clusters()
In [30]:
chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760"
align_to_array(s6.data, s6.samples, chunk)
In [29]:
def align_to_array(data, samples, chunk):
# data are already chunked, read in the whole thing
with open(chunk, 'rt') as infile:
clusts = infile.read().split("//\n//\n")[:-1]
# snames to ensure sorted order
samples.sort(key=lambda x: x.name)
snames = [sample.name for sample in samples]
# make a tmparr to store metadata (this can get huge, consider using h5)
maxvar = 25
maxlen = data._hackersonly["max_fragment_length"] + 20
maxinds = data.paramsdict["max_Indels_locus"]
ifilter = np.zeros(len(clusts), dtype=np.bool_)
dfilter = np.zeros(len(clusts), dtype=np.bool_)
varpos = np.zeros((len(clusts), maxvar), dtype='u1')
indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
consens = np.zeros((len(clusts), maxlen), dtype="S1")
# create a persistent shell for running muscle in.
proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)
# iterate over clusters until finished
allstack = []
for ldx in range(len(clusts)):
istack = []
lines = clusts[ldx].strip().split("\n")
names = lines[::2]
seqs = lines[1::2]
seqs = [i[:90] for i in seqs]
align1 = []
# find duplicates and skip aligning but keep it for downstream.
unames = set([i.rsplit("_", 1)[0] for i in names])
if len(unames) < len(names):
dfilter[ldx] = True
istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
else:
# append counter to names because muscle doesn't retain order
nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]
# make back into strings
cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])
# store allele (lowercase) info
amask, abool = store_alleles(seqs)
# send align1 to the bash shell (TODO: check for pipe-overflow)
cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(cl1, ipyrad.bins.muscle, "//\n"))
proc.stdin.write(cmd1.encode())
# read the stdout by line until splitter is reached
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
dalign1.keys(),
key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
(len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
dtype='S1',
)
for kidx, key in enumerate(keys):
concatseq = dalign1[key].replace("\n", "")
seqarr[kidx] = list(concatseq)
# fill in edges for each seq
edg = (
len(concatseq) - len(concatseq.lstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
)
sidx = snames.index(key.rsplit('_', 1)[0])
edges[ldx, sidx] = edg
# get ref/variant counts in an array
varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
constmp = varmat[:, 0].view('S1')
consens[ldx, :constmp.size] = constmp
varstmp = np.nonzero(varmat[:, 1])[0]
varpos[ldx, :varstmp.size] = varstmp
            # impute allele (lowercase) info back into the reads, since the muscle alignment does not carry it through
wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])
# sort in sname (alphanumeric) order for indels storage
for idx in wkeys:
# seqarr and amask are in input order here
args = (seqarr, idx, amask)
seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
if indidx.size:
indsize = min(indidx.size, sum(maxinds))
indels[ldx, sidx, :indsize] = indidx[:indsize]
wname = names[idx]
# store (only for visually checking alignments)
istack.append(
"{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))
# indel filter
if indels.sum(axis=1).max() >= sum(maxinds):
ifilter[ldx] = True
# store the stack (only for visually checking alignments)
if istack:
allstack.append("\n".join(istack))
# cleanup
proc.stdout.close()
if proc.stderr:
proc.stderr.close()
proc.stdin.close()
proc.wait()
# write to file after (only for visually checking alignments)
odx = chunk.rsplit("_")[-1]
alignfile = os.path.join(data.tmpdir, "aligned_{}.fa".format(odx))
with open(alignfile, 'wt') as outfile:
outfile.write("\n//\n//\n".join(allstack) + "\n")
#os.remove(chunk)
# save indels array to tmp dir
np.save(
os.path.join(data.tmpdir, "ifilter_{}.tmp.npy".format(odx)),
ifilter)
np.save(
os.path.join(data.tmpdir, "dfilter_{}.tmp.npy".format(odx)),
dfilter)
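For reference while checking the parser above, a minimal sketch of the chunk format align_to_array expects (hypothetical names and sequences): clusters are separated by "//\n//\n", with alternating name and sequence lines inside each cluster.
In [ ]:
# hypothetical two-cluster chunk matching the parsing logic in align_to_array
toy_chunk = (
    ">1A_0_5\nACCTGA\n"
    ">1B_0_12\nACCTGA\n"
    "//\n//\n"
    ">1A_0_9\nTTGACA\n"
    ">1C_0_3\nTTGACA\n"
    "//\n//\n"
)
for clust in toy_chunk.split("//\n//\n")[:-1]:
    lines = clust.strip().split("\n")
    print(lines[::2], lines[1::2])   # names, seqs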
In [13]:
data = ip.load_json("4-refpairtest.json")
samples = list(data.samples.values())
sample = samples[0]
force = True
ipyclient = ipp.Client()
In [3]:
data.samples
Out[3]:
In [4]:
popmins = {'1': 3, '2': 3, '3': 3}
popdict = {
'1': ['1A_0', '1B_0', '1C_0', '1D_0'],
'2': ['2E_0', '2F_0', '2G_0', '2H_0'],
'3': ['3I_0', '3J_0', '3K_0', '3L_0'],
}
data._link_populations(popdict, popmins)
data.populations
Out[4]:
In [5]:
s = Step6(data, samples, ipyclient, 123, True)
s.samples
Out[5]:
In [6]:
s.remote_build_concats_tier1()
In [7]:
s.remote_cluster1()
In [8]:
s.remote_build_concats_tier2()
In [9]:
s.remote_cluster2()
In [10]:
s.remote_build_denovo_clusters()
In [11]:
chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')
for chunk in chunks:
align_to_array(s.data, s.samples, chunk)
In [317]:
chunk = "4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70"
samples = s.samples
data = s.data
# data are already chunked, read in the whole thing
#with open(chunk, 'rt') as infile:
# clusts = infile.read().split("//\n//\n")[:-1]
In [389]:
nnames
Out[389]:
In [451]:
def align_to_array(data, samples, chunk):
# data are already chunked, read in the whole thing
with open(chunk, 'rt') as infile:
clusts = infile.read().split("//\n//\n")[:-1]
# snames to ensure sorted order
samples.sort(key=lambda x: x.name)
snames = [sample.name for sample in samples]
# make a tmparr to store metadata (this can get huge, consider using h5)
maxvar = 25
maxlen = data._hackersonly["max_fragment_length"] + 20
maxinds = data.paramsdict["max_Indels_locus"]
ifilter = np.zeros(len(clusts), dtype=np.bool_)
dfilter = np.zeros(len(clusts), dtype=np.bool_)
varpos = np.zeros((len(clusts), maxvar), dtype='u1')
indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
consens = np.zeros((len(clusts), maxlen), dtype="S1")
# create a persistent shell for running muscle in.
proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)
# iterate over clusters until finished
allstack = []
for ldx in range(len(clusts)):
istack = []
lines = clusts[ldx].strip().split("\n")
names = lines[::2]
seqs = lines[1::2]
align1 = []
# find duplicates and skip aligning/ordering since it will be filtered.
unames = set([i.rsplit("_", 1)[0] for i in names])
if len(unames) < len(names):
dfilter[ldx] = True
istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
else:
# append counter to names because muscle doesn't retain order
nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]
# make back into strings
cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])
# store allele (lowercase) info on seqs
amask, abool = store_alleles(seqs)
# send align1 to the bash shell (TODO: check for pipe-overflow)
cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(cl1, ipyrad.bins.muscle, "//\n"))
proc.stdin.write(cmd1.encode())
# read the stdout by line until splitter is reached
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
# reorder into original order and arrange into a dictionary and arr
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
dalign1.keys(),
key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
(len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
dtype='S1',
)
for kidx, key in enumerate(keys):
concatseq = dalign1[key].replace("\n", "")
seqarr[kidx] = list(concatseq)
# fill in edges for each seq
edg = (
len(concatseq) - len(concatseq.lstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
)
sidx = snames.index(key.rsplit('_', 1)[0])
edges[ldx, sidx] = edg
# get ref/variant counts in an array
varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
constmp = varmat[:, 0].view("S1")
consens[ldx, :constmp.size] = constmp
varstmp = np.nonzero(varmat[:, 1])[0]
varpos[ldx, :varstmp.size] = varstmp
            # impute allele (lowercase) info back into the reads, since the muscle alignment does not carry it through
wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])
# sort in sname (alphanumeric) order for indels storage
for idx in wkeys:
# seqarr and amask are in input order here
args = (seqarr, idx, amask)
seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    indsize = min(indidx.size, sum(maxinds))
                    indels[ldx, sidx, :indsize] = indidx[:indsize]
wname = names[idx]
istack.append(
"{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))
# store the stack
if istack:
allstack.append("\n".join(istack))
# cleanup
proc.stdout.close()
if proc.stderr:
proc.stderr.close()
proc.stdin.close()
proc.wait()
# write to file after
odx = chunk.rsplit("_")[-1]
alignfile = os.path.join(data.tmpdir, "align_{}.fa".format(odx))
with open(alignfile, 'wt') as outfile:
outfile.write("\n//\n//\n".join(allstack) + "\n")
#os.remove(chunk)
# save indels array to tmp dir
ifile = os.path.join(data.tmpdir, "indels_{}.tmp.npy".format(odx))
np.save(ifile, ifilter)
dfile = os.path.join(data.tmpdir, "duples_{}.tmp.npy".format(odx))
np.save(dfile, dfilter)
In [456]:
def store_alleles(seqs):
shape = (len(seqs), max([len(i) for i in seqs]))
arrseqs = np.zeros(shape, dtype=np.bytes_)
for row in range(arrseqs.shape[0]):
seqsrow = seqs[row]
arrseqs[row, :len(seqsrow)] = list(seqsrow)
amask = np.char.islower(arrseqs)
if np.any(amask):
return amask, True
else:
return amask, False
def retrieve_indels_and_alleles(seqarr, idx, amask):
concatarr = seqarr[idx]
newmask = np.zeros(len(concatarr), dtype=np.bool_)
# check for indels and impute to amask
indidx = np.where(concatarr == b"-")[0]
if np.sum(amask):
print('yes')
if indidx.size:
# impute for position of variants
allrows = np.arange(amask.shape[1])
mask = np.ones(allrows.shape[0], dtype=np.bool_)
        for ind in indidx:
            if ind < mask.shape[0]:
                mask[ind] = False
not_idx = allrows[mask == 1]
# fill in new data into all other spots
newmask[not_idx] = amask[idx, :not_idx.shape[0]]
else:
newmask = amask[idx]
# lower the alleles
concatarr[newmask] = np.char.lower(concatarr[newmask])
return concatarr, indidx
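A quick check of the two helpers above on toy data (hypothetical reads; lowercase bases mark the allele positions that store_alleles records and retrieve_indels_and_alleles later restores):
In [ ]:
# hypothetical reads; lowercase marks allele info to carry through alignment
toy_seqs = ["ACGTa", "AcGTA"]
amask, has_alleles = store_alleles(toy_seqs)
print(amask)          # True at the lowercase positions
print(has_alleles)    # True, since at least one lowercase base is present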
In [462]:
np.load("4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy")
Out[462]:
In [457]:
align_to_array(data, samples, chunk)
In [459]:
chunks = glob.glob("4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*")
for chunk in chunks:
align_to_array(data, samples, chunk)
In [263]:
@numba.jit(nopython=True)
def refbuild(iseq, consdict):
""" Returns the most common base at each site in order. """
altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
for col in range(iseq.shape[1]):
        ## expand columns with ambigs and remove N and - counts
fcounts = np.zeros(111, dtype=np.int64)
counts = np.bincount(iseq[:, col])#, minlength=90)
fcounts[:counts.shape[0]] = counts
## set N and - to zero, wish numba supported minlen arg
fcounts[78] = 0
fcounts[45] = 0
## add ambig counts to true bases
for aidx in range(consdict.shape[0]):
nbases = fcounts[consdict[aidx, 0]]
for _ in range(nbases):
fcounts[consdict[aidx, 1]] += 1
fcounts[consdict[aidx, 2]] += 1
fcounts[consdict[aidx, 0]] = 0
## now get counts from the modified counts arr
who = np.argmax(fcounts)
altrefs[col, 0] = who
fcounts[who] = 0
## if an alt allele fill over the "." placeholder
who = np.argmax(fcounts)
if who:
altrefs[col, 1] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 2] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 3] = who
return altrefs
PSEUDO_REF = np.array([
[82, 71, 65],
[75, 71, 84],
[83, 71, 67],
[89, 84, 67],
[87, 84, 65],
[77, 67, 65],
[78, 78, 78],
[45, 45, 45],
], dtype=np.uint8)
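Each PSEUDO_REF row holds the ASCII codes for an ambiguity base and the two bases it expands to; decoding the rows makes the mapping easier to read:
In [ ]:
# decode the uint8 codes in PSEUDO_REF to characters
[tuple(map(chr, row)) for row in PSEUDO_REF]
# [('R', 'G', 'A'), ('K', 'G', 'T'), ('S', 'G', 'C'), ('Y', 'T', 'C'),
#  ('W', 'T', 'A'), ('M', 'C', 'A'), ('N', 'N', 'N'), ('-', '-', '-')]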
In [274]:
seqarr.view("u1")
Out[274]:
In [132]:
aseqs = np.array([
list('ATG--GGA'),
list('A--GGGGA'),
list('---GGGGA'),
list('--------'),
])
np.where(aseqs == "-")
Out[132]:
In [156]:
PSEUDO_REF = np.array([
[82, 71, 65],
[75, 71, 84],
[83, 71, 67],
[89, 84, 67],
[87, 84, 65],
[77, 67, 65],
[78, 78, 78],
[45, 45, 45],
], dtype=np.uint8)
In [164]:
@numba.jit(nopython=True)
def reftrick(iseq, consdict):
""" Returns the most common base at each site in order. """
altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
#altrefs[:, 1] = 46
for col in range(iseq.shape[1]):
        ## expand columns with ambigs and remove N and - counts
fcounts = np.zeros(111, dtype=np.int64)
counts = np.bincount(iseq[:, col])#, minlength=90)
fcounts[:counts.shape[0]] = counts
## set N and - to zero, wish numba supported minlen arg
fcounts[78] = 0
fcounts[45] = 0
## add ambig counts to true bases
for aidx in range(consdict.shape[0]):
nbases = fcounts[consdict[aidx, 0]]
for _ in range(nbases):
fcounts[consdict[aidx, 1]] += 1
fcounts[consdict[aidx, 2]] += 1
fcounts[consdict[aidx, 0]] = 0
## now get counts from the modified counts arr
who = np.argmax(fcounts)
altrefs[col, 0] = who
fcounts[who] = 0
## if an alt allele fill over the "." placeholder
who = np.argmax(fcounts)
if who:
altrefs[col, 1] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 2] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 3] = who
return altrefs
In [195]:
sss = np.array([
list("---AAAA----"),
list("---TTAA----"),
list("AAAAAAAAAAA"),
list("AATAAAATAAA"),
], dtype="S1")
sss
Out[195]:
In [196]:
varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)
varmat
Out[196]:
In [226]:
names
Out[226]:
In [202]:
%%timeit
store_alleles(seqs)
In [219]:
[b"".join(i) for i in arrseqs ]
Out[219]:
In [229]:
snames
Out[229]:
In [237]:
widx = np.argsort([i.rsplit(";", 1)[0] for i in keys])
for i in widx:
print(names[i], b"".join(seqarr[i]))
In [184]:
consensus = varmat[:, 0]
variants = np.nonzero(varmat[:, 1])[0]
variants
Out[184]:
In [148]:
seqarr == seqarr[0]
Out[148]:
In [145]:
for kidx in range(seqarr.shape[0]):
pass
In [ ]:
In [133]:
# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
dalign1.keys(),
key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
(len(nnames), len(dalign1[keys[0]])),
dtype='U1',
)
for kidx, key in enumerate(keys):
concatseq = dalign1[key].replace("\n", '')
seqarr[kidx] = list(dalign1[key].replace("\n", ""))
seqarr
Out[133]:
In [101]:
# fill in edges for each seq
edg = (
len(concatseq) - len(concatseq.lstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
)
edges[ldx, kidx] = edg
edges[ldx, kidx]
concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]
Out[101]:
In [22]:
seeds = [
os.path.join(
data.dirs.across,
"{}-{}.htemp".format(data.name, jobid)) for jobid in jobids
]
allseeds = os.path.join(data.dirs.across, "{}-x.fa".format(data.name))
cmd1 = ['cat'] + seeds
cmd2 = [ipyrad.bins.vsearch, '--sortbylength', '-', '--output', allseeds]
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)
out = proc2.communicate()
proc1.stdout.close()
In [23]:
out
Out[23]:
In [12]:
data = s.data
jobid = 2
nthreads = 4
In [14]:
# get files for this jobid
catshuf = os.path.join(
data.dirs.across,
"{}-{}-catshuf.fa".format(data.name, jobid))
uhaplos = os.path.join(
data.dirs.across,
"{}-{}.utemp".format(data.name, jobid))
hhaplos = os.path.join(
data.dirs.across,
"{}-{}.htemp".format(data.name, jobid))
#logfile = os.path.join(data.dirs.across, "s6_cluster_stats.txt")
## parameters that vary by datatype
## (cov values that are too low yield too many poor alignments)
strand = "plus"
cov = 0.75 # 0.90
if data.paramsdict["datatype"] in ["gbs", "2brad"]:
strand = "both"
cov = 0.60
elif data.paramsdict["datatype"] == "pairgbs":
strand = "both"
cov = 0.75 # 0.90
## nthreads is calculated in 'call_cluster()'
cmd = [ipyrad.bins.vsearch,
"-cluster_smallmem", catshuf,
"-strand", strand,
"-query_cov", str(cov),
"-minsl", str(0.5),
"-id", str(data.paramsdict["clust_threshold"]),
"-userout", uhaplos,
"-notmatched", hhaplos,
"-userfields", "query+target+qstrand",
"-maxaccepts", "1",
"-maxrejects", "0",
"-fasta_width", "0",
"-threads", str(nthreads), # "0",
"-fulldp",
"-usersort",
]
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
raise IPyradError(out)
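As a quick sanity check (a sketch, assuming -userout writes the three -userfields as tab-separated columns), the cluster hits can be loaded with pandas:
In [ ]:
# minimal sketch: load the vsearch -userout hits written above
import pandas as pd
hits = pd.read_csv(uhaplos, sep="\t", header=None,
                   names=["query", "target", "qstrand"])
hits.head()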
In [20]:
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
raise IPyradError(out)
In [22]:
build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)
In [23]:
data = s.data
jobid = 1
samples = [s.data.samples[i] for i in s.cgroups[1]]
randomseed = 123
In [24]:
conshandles = [
sample.files.consens[0] for sample in samples if
sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"
In [26]:
## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
data.dirs.across,
"{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
call = sps.Popen(cmd, stdout=output, close_fds=True)
call.communicate()
## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
"/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
"/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)
## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.fa")
with open(allhaps, 'w') as output:
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
proc2.communicate()
proc1.stdout.close()
## now sort the file using vsearch
allsort = groupcons.replace("-catcons.gz", "-catsort.fa")
cmd1 = [ipyrad.bins.vsearch,
"--sortbylength", allhaps,
"--fasta_width", "0",
"--output", allsort]
LOGGER.debug(" ".join(cmd1))
proc1 = sps.Popen(cmd1, close_fds=True)
proc1.communicate()
Out[26]:
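A quick hedged check of the hetero-site substitutions defined above, run on one hypothetical consensus record; the />/! address skips header lines so taxon names are untouched:
In [ ]:
# hypothetical record; on sequence lines W/R/M -> A, K -> T, S/Y -> C
import subprocess as sps
rec = ">toy_sample_0\nAWGRKSYCA\n"
demo = sps.run(["sed", subs], input=rec, capture_output=True, text=True)
print(demo.stdout)   # '>toy_sample_0' unchanged, sequence becomes 'AAGATCCCA'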
In [28]:
data.dirs.across
Out[28]:
In [8]:
# calculate the threading of cluster1 jobs:
ncpus = len(self.ipyclient.ids)
njobs = len(self.cgroups)
nnodes = len(self.hostd)
minthreads = 2
maxthreads = 10
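One plausible way (an assumption, not necessarily the rule Step6 actually uses) to turn these values into a per-job thread count is to divide the engines among jobs and clamp to the min/max bounds:
In [ ]:
# hypothetical per-job thread calculation; the real logic lives in Step6
threads_per_job = max(minthreads, min(maxthreads, ncpus // max(1, njobs)))
threads_per_job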
In [9]:
# product
targets = [0, 1]
self.thview = self.ipyclient.load_balanced_view(targets=targets)
In [10]:
s.prepare_concats()
In [31]:
cluster1(data, 2, 4)
In [30]:
def cluster1(data, jobid, nthreads):
# get files for this jobid
catshuf = os.path.join(
data.dirs.across,
"{}-{}-catshuf.fa".format(data.name, jobid))
uhaplos = os.path.join(
data.dirs.across,
"{}-{}.utemp".format(data.name, jobid))
hhaplos = os.path.join(
data.dirs.across,
"{}-{}.htemp".format(data.name, jobid))
## parameters that vary by datatype
    ## (cov values that are too low yield too many poor alignments)
strand = "plus"
cov = 0.75 # 0.90
if data.paramsdict["datatype"] in ["gbs", "2brad"]:
strand = "both"
cov = 0.60
elif data.paramsdict["datatype"] == "pairgbs":
strand = "both"
cov = 0.75 # 0.90
## nthreads is calculated in 'call_cluster()'
cmd = [ipyrad.bins.vsearch,
"-cluster_smallmem", catshuf,
"-strand", strand,
"-query_cov", str(cov),
"-minsl", str(0.5),
"-id", str(data.paramsdict["clust_threshold"]),
"-userout", uhaplos,
"-notmatched", hhaplos,
"-userfields", "query+target+qstrand",
"-maxaccepts", "1",
"-maxrejects", "0",
"-fasta_width", "0",
"-threads", str(nthreads), # "0",
"-fulldp",
"-usersort",
]
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
raise IPyradError(out)
In [12]:
if len(self.samples) < 50:
nclusters = 1
elif len(self.samples) < 100:
nclusters = 2
elif len(self.samples) < 200:
nclusters = 4
elif len(self.samples) > 500:
nclusters = 10
else:
nclusters = int(len(self.samples) / 10)
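A quick check of the grouping heuristic above on hypothetical sample counts; note that counts between 200 and 500 fall through to the final branch and yield more groups than the >500 case, which may be worth revisiting:
In [ ]:
# mirror of the branch logic above, applied to hypothetical sample counts
def ngroups(nsamples):
    if nsamples < 50:
        return 1
    elif nsamples < 100:
        return 2
    elif nsamples < 200:
        return 4
    elif nsamples > 500:
        return 10
    return int(nsamples / 10)

[(n, ngroups(n)) for n in (20, 80, 150, 300, 600)]
# [(20, 1), (80, 2), (150, 4), (300, 30), (600, 10)]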
In [ ]:
In [10]:
self.cgroups
Out[10]:
In [48]:
hosts
Out[48]:
In [ ]: