In [1]:
import ipyrad as ip
import ipyparallel as ipp
In [2]:
from ipyrad.assemble.clustmap import *
from ipyrad.assemble.consens_se import *
In [3]:
ipyclient = ipp.Client()
In [4]:
data = ip.Assembly("test")
data.set_params("project_dir", "hotfix")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("assembly_method", "reference")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CCG"))
data.get_params()
In [5]:
data.run("12", force=True)
In [7]:
data.run("3")
In [8]:
data.run('4')
In [9]:
data.run("5")
In [11]:
data.run("6")
In [51]:
from ipyrad.assemble.write_outputs import *
self = Step7(data, True, ipyclient)
self.split_clusters()
jobs = glob.glob(os.path.join(self.data.tmpdir, "chunk-*"))
jobs = sorted(jobs, key=lambda x: int(x.rsplit("-")[-1]))
for jobfile in jobs:
    args = (self.data, self.chunksize, jobfile)
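As a quick sanity check (not from the original session), the numeric sort key above keeps chunk files in locus order rather than lexicographic order:
names = ["chunk-2000", "chunk-0", "chunk-10000"]
print(sorted(names, key=lambda x: int(x.rsplit("-")[-1])))
# -> ['chunk-0', 'chunk-2000', 'chunk-10000']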
In [119]:
def get_edges(self, seqs):
    """
    Trim terminal edges or mask internal edges based on three criteria and
    take the max for each edge.
    1. user entered hard trimming.
    2. removing cutsite overhangs.
    3. trimming singleton-like overhangs from seqs of diff lengths.
    """
    # record whether to filter this locus based on sample coverage
    bad = False

    # 1. hard trim edges
    trim1 = np.array(self.data.paramsdict["trim_loci"])

    # 2. fuzzy match for trimming restriction site where it's expected.
    trim2 = np.array([0, 0, 0, 0])
    overhangs = np.array([
        i.encode() for i in self.data.paramsdict["restriction_overhang"]
    ])
    for pos, overhang in enumerate(overhangs):
        if overhang:
            cutter = np.array(list(overhang))
            trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)

    # 3. find where the edge is not indel marked (really unknown ("N"))
    trim3 = np.array([0, 0, 0, 0])
    try:
        minsamp = min(4, seqs.shape[0])
        # minsamp = max(minsamp, self.data.paramsdict["min_samples_locus"])
        mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
        for pos in range(4):
            trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)
    except ValueError:
        print('error')
        bad = True

    # get max edges
    print(trim1, trim2, trim3)
    trim = np.max([trim1, trim2, trim3], axis=0)

    # return edges as slice indices
    r1left = trim[0]

    # single-end simple:
    if "pair" not in self.data.paramsdict["datatype"]:
        r1right = seqs.shape[1] - trim[1]
        r2left = r2right = r1right
        edges = (r1left, r1right, r2left, r2right)
    else:
        # paired data: this branch was left unfinished in the notebook.
        # A sketch, assuming R1 and R2 are joined by an 'nnnn' spacer
        # (byte 110; if seqs were uppercased upstream use 78 instead):
        spacer = np.where(np.all(seqs == 110, axis=0))[0]
        r1right = spacer.min() - trim[1]
        r2left = spacer.max() + 1 + trim[2]
        r2right = seqs.shape[1] - trim[3]
        edges = (r1left, r1right, r2left, r2right)

    # get filter
    if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):
        bad = True

    # if loc length is too short then filter
    if (r2right - r1left) < self.data.paramsdict["filter_min_trim_len"]:
        bad = True
    return bad, edges
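A small sketch, separate from the notebook code above, of how the three trim vectors combine: np.max is taken element-wise, so the most aggressive trim wins at each of the four edges.
import numpy as np
trim1 = np.array([0, 0, 0, 0])    # hard trims from trim_loci
trim2 = np.array([5, 3, 0, 0])    # cutter-overhang trims
trim3 = np.array([2, 6, 0, 0])    # minsamp edge trims
print(np.max([trim1, trim2, trim3], axis=0))   # [5 6 0 0]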
In [120]:
def check_minsamp(seq, position, minsamp, mincovs):
    "used in Processor.get_edges() for trimming edges of - or N sites."
    if position == 0:
        # find leftmost edge with minsamp coverage
        leftmost = np.where(mincovs >= minsamp)[0]
        if leftmost.size:
            return leftmost.min()
        # No site reaches minsamp coverage even though there are >= minsamp
        # rows of data; this can happen when reads only partially overlap.
        # The locus should be excluded by the minsamp filter.
        else:
            raise ValueError("no sites above minsamp coverage in edge trim")
    elif position == 1:
        maxhit = np.where(mincovs >= minsamp)[0].max()
        return seq.shape[1] - (maxhit + 1)
    ## TODO...
    elif position == 2:
        return 0
    else:
        return 0
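A toy illustration (not from the original run) of the coverage vector these checks operate on: 78 is 'N' and 45 is '-', so mincovs counts informative samples at each column.
import numpy as np
seqs = np.array(
    [list(b"NNAAT-"), list(b"N-AATT"), list(b"NAAATT")], dtype=np.uint8)
mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
print(mincovs)                              # [0 1 3 3 3 2]
print(check_minsamp(seqs, 0, 3, mincovs))   # 2 columns trimmed from the left
print(check_minsamp(seqs, 1, 3, mincovs))   # 1 column trimmed from the right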
In [121]:
# store list of edge trims for VCF building
edgelist = []
# todo: this could be an iterator...
with open(self.chunkfile, 'rb') as infile:
    loci = infile.read().split(b"//\n//\n")

# iterate over loci (skip any empty trailing chunk left by the separator)
for iloc, loc in enumerate(loci):
    if not loc.strip():
        continue

    # load names and seqs
    lines = loc.decode().strip().split("\n")
    names = []
    nidxs = []
    aseqs = []
    useqs = []
    for line in lines:
        if line[0] == ">":
            name, nidx = line[1:].rsplit("_", 1)
            names.append(name)
            nidxs.append(nidx)
        else:
            aseqs.append(list(line))
            useqs.append(list(line.upper()))

    # filter to include only samples in this assembly
    mask = [i in self.data.snames for i in names]
    names = np.array(names)[mask].tolist()
    nidxs = np.array(nidxs)[mask].tolist()
    useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)
    aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)

    # apply filters
    efilter, edges = get_edges(self, useqs)
    print(efilter, edges)
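For reference, a hedged sketch of the record layout this parser expects: fasta-like name/sequence pairs per locus, with loci separated by "//\n//\n" (the names and sequences below are made up).
example_chunk = (
    b">sample-A_0\nTGCAGAATTCCA\n"
    b">sample-B_0\nTGCAGAATTCCT\n"
    b"//\n//\n"
)
for loc in example_chunk.split(b"//\n//\n"):
    if loc.strip():
        print(loc.decode().strip().split("\n"))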
In [54]:
data = self.data
chunksize = self.chunksize
chunkfile = jobs[0]
self = Processor(data, chunksize, chunkfile)
In [23]:
self.remote_process_chunks()
self.collect_stats()
self.store_file_handles()
In [24]:
start = time.time()
printstr = ("building arrays ", "s7")
rasyncs = {}
args0 = (self.data,)
args1 = (self.data, self.ntaxa, self.nbases, self.nloci)
args2 = (self.data, self.ntaxa, self.nsnps)
write_loci_and_alleles(*args0)
In [25]:
fill_seq_array(*args1)
In [49]:
data = self.data
ntaxa = self.ntaxa
nsnps = self.nsnps

# get faidict to convert chroms to ints
if data.isref:
    faidict = chroms2ints(data, True)

# open new database file handle
with h5py.File(data.snps_database, 'w') as io5:

    # Database files for storing arrays on disk.
    # Should optimize for slicing by rows if we run into slow writing, or
    # it uses too much mem. For now let h5py do the auto-chunking.
    io5.create_dataset(
        name="snps",
        shape=(ntaxa, nsnps),
        dtype=np.uint8,
    )
    # store snp locations:
    # (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)
    io5.create_dataset(
        name="snpsmap",
        shape=(nsnps, 5),
        dtype=np.uint32,
    )
    # store pseudo-reference alleles at each SNP
    io5.create_dataset(
        name="pseudoref",
        shape=(nsnps, 4),
        dtype=np.uint8,
    )
    # store genotype calls (0/0, 0/1, 0/2, etc.)
    io5.create_dataset(
        name="genos",
        shape=(nsnps, ntaxa, 2),
        dtype=np.uint8,
    )

    # gather all loci bits
    locibits = glob.glob(os.path.join(data.tmpdir, "*.loci"))
    sortbits = sorted(
        locibits, key=lambda x: int(x.rsplit("-", 1)[-1][:-5]))

    # name order for entry in array
    sidxs = {sample: i for (i, sample) in enumerate(data.snames)}

    # iterate through file
    start = end = 0
    tmploc = {}
    locidx = 1
    snpidx = 1

    # array to store until writing
    tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)
    tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)

    # iterate over chunkfiles
    for bit in sortbits:

        # iterate lines of file until locus endings
        for line in iter(open(bit, 'r')):

            # still filling locus until |\n
            if "|\n" not in line:
                name, seq = line.split()
                tmploc[name] = seq

            # locus is full, dump it
            else:
                # convert seqs to an array
                loc = (
                    np.array([list(i) for i in tmploc.values()])
                    .astype(bytes).view(np.uint8)
                )
                snps, idxs, _ = line[len(data.snppad):].rsplit("|", 2)
                snpsmask = np.array(list(snps)) != " "
                snpsidx = np.where(snpsmask)[0]

                # select only the SNP sites
                snpsites = loc[:, snpsmask]

                # store end position of locus for map
                end = start + snpsites.shape[1]
                print(start, end)
                for idx, name in enumerate(tmploc):
                    tmparr[sidxs[name], start:end] = snpsites[idx, :]

                # store snpsmap data 1-indexed with chroms info
                if data.isref:
                    chrom, pos = idxs.split(",")[0].split(":")
                    chromstart = int(pos.split("-")[0])
                    # chromidx = faidict[chrom]
                    chromidx = int(chrom)
                    for isnp in range(snpsites.shape[1]):
                        isnpx = snpsidx[isnp]
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, isnpx, chromidx, isnpx + chromstart,
                        )
                        snpidx += 1

                # store snpsmap data (snpidx is 1-indexed)
                else:
                    for isnp in range(snpsites.shape[1]):
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, snpsidx[isnp], 0, snpidx,
                        )
                        snpidx += 1
                locidx += 1

                # reset locus
                start = end
                tmploc = {}

    # fill missing with 78 (N)
    tmparr[tmparr == 0] = 78
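A small self-contained check (not part of the notebook) of the SNP-string parsing used above: columns marked with anything other than a space in the .loci SNP line are treated as SNP sites.
import numpy as np
snps = "  *   -  *"
snpsmask = np.array(list(snps)) != " "
print(np.where(snpsmask)[0])   # [2 6 9]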
In [48]:
print(tmparr.shape)
print(start, end)
tmparr[sidxs[name], start:end]
Out[48]:
In [21]:
fill_snp_array(*args2)
In [12]:
data.run("7")
In [5]:
data.run("3")
In [6]:
data.run("456")
In [11]:
data.run("1", force=True)
In [9]:
data = ip.load_json("tortas/5-tortas.json")
In [18]:
d2 = data.branch("d2", subsamples=["AGO09concat"])
In [19]:
d2.run("3", force=True)
In [20]:
d2.run("45", force=True)
In [7]:
d2 = ip.load_json("./tortas/d2.json")
In [9]:
d2.stats
Out[9]:
In [11]:
self = Step5(data, True, ipyclient)
In [12]:
self.remote_calculate_depths()
In [13]:
self.remote_make_chunks()
In [14]:
statsdicts = self.remote_process_chunks()
In [15]:
statsdicts
Out[15]:
In [ ]:
self.remote_concatenate_chunks()
In [ ]:
self.data_store(statsdicts)
In [10]:
data.stats
Out[10]:
In [5]:
dd = data.branch("dd")
dd.run("5", force=True)
In [6]:
dd.run("3", force=True)
Out[6]:
In [4]:
data.run("5", force=True)
In [5]:
step = Step6(data, True, ipyclient)
In [6]:
step.remote_concat_bams()
In [7]:
step.remote_build_ref_regions()
In [8]:
self = step
In [9]:
regs = self.regions[:20]
regs
Out[9]:
In [11]:
# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
snames = sorted([i.name for i in self.samples])
nsamples = len(snames)
outfile.write(
    b" ".join([i.encode() for i in snames]) + b"\n")
# get clusters
lidx = 0
clusts = []
# while 1:
#     try:
#         region = next(self.regions)
#         reads = bamfile.fetch(*region)
#     except StopIteration:
#         break
for region in regs:
    reads = bamfile.fetch(*region)

    # get reference
    print(region)
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"],
        region[0], region[1] + 1, region[2] + 1)

    # build cluster dict for sorting
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

    # build cluster based on map positions (reads are already oriented)
    arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)

    # fill it
    arr[0] = list(refs)
    for idx, key in enumerate(keys):
        # get start and stop from this seq
        sname = key.rsplit("_", 1)[0]
        rstart, rstop = key.rsplit(":", 2)[-1].split("-")
        sidx = snames.index(sname)

        # get start relative to ref
        start = int(rstart) - int(region[1]) - 1
        stop = start + int(rstop) - int(rstart)
        print(sidx + 1, start, stop, arr.shape[1])
        arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
    print("")

    arr[arr == b""] = b"N"
    for line in arr:
        outfile.write(line.tostring() + b"\n")
    outfile.write(b"\n")

outfile.close()
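A hedged arithmetic note on the offset math above (numbers are illustrative, not from the run): rstart/rstop come from the read name, region[1] is the region start, and the -1 presumably converts the 1-based positions in the names to 0-based array columns.
region_start = 1000                  # region[1]
rstart, rstop = 1005, 1085           # parsed from the read name
start = rstart - region_start - 1    # -> column 4 of the local array
stop = start + (rstop - rstart)      # -> column 84
print(start, stop)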
In [50]:
print(start, stop, stop-start, len(rdict[key]), rstart, rstop, int(rstop) - int(rstart))
print(region, region[2] - region[1], len(refs))
In [61]:
key.split("")
In [62]:
snames
Out[62]:
In [141]:
revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
Out[141]:
In [37]:
# get consens seq and variant site index
clust = []
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
for key in keys:
    clust.append("{}\n{}".format(key, rdict[key]))
clust.append("SNPs\n" + snpstring)
clusts.append("\n".join(clust))

# advance locus counter
lidx += 1

# write chunks to file
if not lidx % 1000:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    clusts = []

# write remaining
if clusts:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()
In [16]:
step.remote_build_ref_clusters()
In [4]:
from ipyrad.assemble.clustmap import *
In [6]:
self = Step3(data, 8, True, ipyclient)
In [7]:
self.run()
In [8]:
self.data.run("45")
In [13]:
self.remote_index_refs()
self.remote_run(
function=concat_multiple_edits,
printstr=("concatenating ", "s3"),
args=(),
)
self.remote_run(
function=merge_end_to_end,
printstr=("join unmerged pairs ", "s3"),
args=(False, False,),
)
In [15]:
self.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
In [16]:
self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps ", "s3"),
    args=(),
)
In [ ]:
self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
In [ ]:
sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)
In [15]:
sample = list(data.samples.values())[0]
infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
infile
Out[15]:
In [19]:
strand = "plus"
if data.paramsdict["datatype"] is ('gbs' or '2brad'):
strand = "both"
nthreads=2
cmd = [
ip.bins.vsearch,
"--derep_fulllength", infile,
"--strand", strand,
"--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
"--threads", str(nthreads),
"--fasta_width", str(0),
"--fastq_qmax", "1000",
"--sizeout",
"--relabel_md5",
]
cmd
Out[19]:
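Roughly the shell command the list above assembles (paths are illustrative, not from the run):
print(" ".join(cmd))
# e.g.: vsearch --derep_fulllength <tmpdir>/<sample>_declone.fastq --strand plus \
#       --output <tmpdir>/<sample>_derep.fastq --threads 2 --fasta_width 0 \
#       --fastq_qmax 1000 --sizeout --relabel_md5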
In [20]:
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
    ip.logger.error("error inside dereplicate %s", errmsg)
    raise IPyradWarningExit(errmsg)
In [10]:
s.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(s.nthreads,),
    threaded=True,
)