ipyrad testing for pairddrad data


In [1]:
import ipyrad as ip      ## for RADseq assembly
print ip.__version__     ## print version


0.1.37

In [2]:
## clear existing test_dir/
## wipe any existing test output directory so each run starts clean
import os
import shutil

if os.path.exists("./test_pairddrad/"):
    shutil.rmtree("./test_pairddrad/")

In [7]:
data1 = ip.Assembly('data1')


  New Assembly: data1

In [8]:
data1.get_params()


  0   assembly_name               data1                                        
  1   project_dir                 .                                            
  2   raw_fastq_path              ./*.fastq                                    
  3   barcodes_path               ./*.barcodes.txt                             
  4   sorted_fastq_path                                                        
  5   assembly_method             denovo                                       
  6   reference_sequence                                                       
  7   datatype                    rad                                          
  8   restriction_overhang        ('TGCAG', '')                                
  9   max_low_qual_bases          5                                            
  10  phred_Qscore_offset         33                                           
  11  mindepth_statistical        6                                            
  12  mindepth_majrule            6                                            
  13  maxdepth                    1000                                         
  14  clust_threshold             0.85                                         
  15  max_barcode_mismatch        1                                            
  16  filter_adapters             0                                            
  17  filter_min_trim_len         35                                           
  18  max_alleles_consens         2                                            
  19  max_Ns_consens              (5, 5)                                       
  20  max_Hs_consens              (8, 8)                                       
  21  min_samples_locus           4                                            
  22  max_SNPs_locus              (100, 100)                                   
  23  max_Indels_locus            (5, 99)                                      
  24  max_shared_Hs_locus         0.25                                         
  25  edit_cutsites               (0, 0)                                       
  26  trim_overhang               (1, 2, 2, 1)                                 
  27  output_formats              *                                            
  28  pop_assign_file                                                          
  29  excludes                                                                 
  30  outgroups                                                                

In [11]:
## point the assembly at the simulated pairddrad data; restriction_overhang
## takes BOTH cut-site overhangs for a double-digest library, and datatype
## must be 'pairddrad' so paired reads are handled downstream
data1.set_params('project_dir', "./test_pairddrad")
data1.set_params('raw_fastq_path', "./data/sim_pairddrad_*.gz")
data1.set_params('barcodes_path', "./data/sim_pairddrad_barcodes.txt")
data1.set_params('restriction_overhang', ("TGCAG", "AATT"))
data1.set_params('datatype', 'pairddrad')
#data1.set_params(17, 1)

## echo the parameter table to confirm the settings took
data1.get_params()


  0   assembly_name               data1                                        
  1   project_dir                 ./test_pairddrad                             
  2   raw_fastq_path              ./data/sim_pairddrad_*.gz                    
  3   barcodes_path               ./data/sim_pairddrad_barcodes.txt            
  4   sorted_fastq_path                                                        
  5   assembly_method             denovo                                       
  6   reference_sequence                                                       
  7   datatype                    pairddrad                                    
  8   restriction_overhang        ('TGCAG', 'AATT')                            
  9   max_low_qual_bases          5                                            
  10  phred_Qscore_offset         33                                           
  11  mindepth_statistical        6                                            
  12  mindepth_majrule            6                                            
  13  maxdepth                    1000                                         
  14  clust_threshold             0.85                                         
  15  max_barcode_mismatch        1                                            
  16  filter_adapters             0                                            
  17  filter_min_trim_len         35                                           
  18  max_alleles_consens         2                                            
  19  max_Ns_consens              (5, 5)                                       
  20  max_Hs_consens              (8, 8)                                       
  21  min_samples_locus           4                                            
  22  max_SNPs_locus              (100, 100)                                   
  23  max_Indels_locus            (5, 99)                                      
  24  max_shared_Hs_locus         0.25                                         
  25  edit_cutsites               (0, 0)                                       
  26  trim_overhang               (1, 2, 2, 1)                                 
  27  output_formats              *                                            
  28  pop_assign_file                                                          
  29  excludes                                                                 
  30  outgroups                                                                

In [ ]:
#data1.link_fastqs(path="test_pairddrad/test_pairddrad_fastqs/", append=True)

In [12]:
data1.step1()
print data1.stats


  Saving current assembly.
     state  reads_raw
1A0      1      20000
1B0      1      20000
1C0      1      20000
1D0      1      20000
2E0      1      20000
2F0      1      20000
2G0      1      20000
2H0      1      20000
3I0      1      20000
3J0      1      20000
3K0      1      20000
3L0      1      20000

In [13]:
data1.step2()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats


  Saving current assembly.
     state  reads_raw  reads_filtered
1A0      2      20000           20000
1B0      2      20000           20000
1C0      2      20000           20000
1D0      2      20000           20000
2E0      2      20000           20000
2F0      2      20000           20000
2G0      2      20000           20000
2H0      2      20000           20000
3I0      2      20000           20000
3J0      2      20000           20000
3K0      2      20000           20000
3L0      2      20000           20000

In [14]:
# data1.step3()                        ## do all samples
# data1.step3("1A0")                   ## do one sample
# data1.step3(["1A0", "1B0", "1C0"])   ## do list of samples
## step 3: cluster reads within samples at clust_threshold
data1.step3()#["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats


  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth
1A0      3      20000           20000            1000              1000
1B0      3      20000           20000            1000              1000
1C0      3      20000           20000            1000              1000
1D0      3      20000           20000            1000              1000
2E0      3      20000           20000            1000              1000
2F0      3      20000           20000            1000              1000
2G0      3      20000           20000            1000              1000
2H0      3      20000           20000            1000              1000
3I0      3      20000           20000            1000              1000
3J0      3      20000           20000            1000              1000
3K0      3      20000           20000            1000              1000
3L0      3      20000           20000            1000              1000

In [16]:
%%time
data1.step4()#["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats


    skipping 1B0; already estimated. Use force=True to overwrite.
    skipping 2H0; already estimated. Use force=True to overwrite.
    skipping 3J0; already estimated. Use force=True to overwrite.
    skipping 3K0; already estimated. Use force=True to overwrite.
  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth  \
1A0      4      20000           20000            1000              1000   
1B0      4      20000           20000            1000              1000   
1C0      4      20000           20000            1000              1000   
1D0      4      20000           20000            1000              1000   
2E0      4      20000           20000            1000              1000   
2F0      4      20000           20000            1000              1000   
2G0      4      20000           20000            1000              1000   
2H0      4      20000           20000            1000              1000   
3I0      4      20000           20000            1000              1000   
3J0      4      20000           20000            1000              1000   
3K0      4      20000           20000            1000              1000   
3L0      4      20000           20000            1000              1000   

     hetero_est  error_est  
1A0    0.001308   0.000488  
1B0    0.001422   0.000494  
1C0    0.001427   0.000482  
1D0    0.001286   0.000497  
2E0    0.001226   0.000491  
2F0    0.001406   0.000479  
2G0    0.001497   0.000506  
2H0    0.001460   0.000502  
3I0    0.001432   0.000504  
3J0    0.001464   0.000483  
3K0    0.001412   0.000502  
3L0    0.001282   0.000470  
CPU times: user 331 ms, sys: 4.28 ms, total: 336 ms
Wall time: 7.07 s

In [1]:
#data1.save("upto2")
## still figuring out how best to save...
import ipyrad as ip
data1 = ip.load_assembly("test_pairddrad/test_pairddrad.assembly")


DEBUG:ipyrad:H4CKERZ-mode: __loglevel__ = DEBUG
INFO:ipyrad.core.parallel:Local connection to 4 engines [ipyrad-2480]
Loading Assembly: test_pairddrad  [test_pairddrad/test_pairddrad.assembly]
ipyparallel setup: Local connection to 4 engines


In [17]:
data1.step5()#["1B0"], force=True) 
print data1.stats


  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth  \
1A0      5      20000           20000            1000              1000   
1B0      5      20000           20000            1000              1000   
1C0      5      20000           20000            1000              1000   
1D0      5      20000           20000            1000              1000   
2E0      5      20000           20000            1000              1000   
2F0      5      20000           20000            1000              1000   
2G0      5      20000           20000            1000              1000   
2H0      5      20000           20000            1000              1000   
3I0      5      20000           20000            1000              1000   
3J0      5      20000           20000            1000              1000   
3K0      5      20000           20000            1000              1000   
3L0      5      20000           20000            1000              1000   

     hetero_est  error_est  reads_consens  
1A0    0.001308   0.000488           1000  
1B0    0.001422   0.000494           1000  
1C0    0.001427   0.000482           1000  
1D0    0.001286   0.000497           1000  
2E0    0.001226   0.000491           1000  
2F0    0.001406   0.000479           1000  
2G0    0.001497   0.000506           1000  
2H0    0.001460   0.000502           1000  
3I0    0.001432   0.000504           1000  
3J0    0.001464   0.000483           1000  
3K0    0.001412   0.000502           1000  
3L0    0.001282   0.000470           1000  

In [ ]:
data1.step5(["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats

In [18]:
data1.step6()#(["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats


  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth  \
1A0      6      20000           20000            1000              1000   
1B0      6      20000           20000            1000              1000   
1C0      6      20000           20000            1000              1000   
1D0      6      20000           20000            1000              1000   
2E0      6      20000           20000            1000              1000   
2F0      6      20000           20000            1000              1000   
2G0      6      20000           20000            1000              1000   
2H0      6      20000           20000            1000              1000   
3I0      6      20000           20000            1000              1000   
3J0      6      20000           20000            1000              1000   
3K0      6      20000           20000            1000              1000   
3L0      6      20000           20000            1000              1000   

     hetero_est  error_est  reads_consens  
1A0    0.001308   0.000488           1000  
1B0    0.001422   0.000494           1000  
1C0    0.001427   0.000482           1000  
1D0    0.001286   0.000497           1000  
2E0    0.001226   0.000491           1000  
2F0    0.001406   0.000479           1000  
2G0    0.001497   0.000506           1000  
2H0    0.001460   0.000502           1000  
3I0    0.001432   0.000504           1000  
3J0    0.001464   0.000483           1000  
3K0    0.001412   0.000502           1000  
3L0    0.001282   0.000470           1000  

In [19]:
data1.step7()


inc ['1B0', '2G0', '1C0', '1A0', '2H0', '2E0', '3L0', '3I0', '1D0', '2F0', '3J0', '3K0']
sidx:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
finished filtering
  Saving current assembly.

In [21]:
data1.outfiles


Out[21]:
{'loci': '/home/deren/Documents/ipyrad/tests/test_pairddrad/data1_outfiles/data1.loci'}

In [28]:
less $data1.outfiles.loci

In [ ]:
import pandas as pd
print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0)

In [ ]:
## how to merge Assembly objects
data1.files.edits['1B0']
data1.samples['1B0'].files

In [ ]:
print data1.stats

In [ ]:
for i in data1.log:
    print i

In [ ]:
import numpy as np

In [ ]:
## 10x4 block of int16 ones; the trailing .shape echoes the dimensions
adds = np.ones((10, 4), dtype='int16')
adds.shape

In [ ]:
np.array.([30, 1], dtype='int16')

In [ ]:
longest_reads = 190  ## NOTE(review): defined but never used below -- confirm intent
nreads = int(1e5)

## preallocate an uninitialized (nreads, 200, 4) int16 array and copy the
## small `adds` block (from an earlier cell) into the head of row 0
arr = np.empty([nreads,200, 4], dtype='int16')
arr[0][0:adds.shape[0]] = adds
arr[0]

In [ ]:
import glob
import os
import numpy

In [ ]:
catg = numpy.load("test_pairddrad/test_pairddrad_consens/1B0.catg")

In [ ]:
catg[0]

In [ ]:
cats1 = glob.glob(os.path.join(
                      data1.dirs.consens,
                      data1.samples['1B0'].name+"_tmpcats.*"))

In [ ]:
cats1.sort(key=lambda x: int(x.split(".")[-1]))

In [ ]:
catg = numpy.load(cats1[0])
lastg = numpy.load(cats1[-1])

In [ ]:
catg[0]

In [ ]:
lastg.shape

Making alignment faster


In [ ]:
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle """
    ## fasta-format the cluster: ">name\nseq" records joined by newlines
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    ## echo the fasta through a shell pipeline into muscle reading stdin ("-in -")
    ## NOTE(review): builds a shell command from sequence data -- a single quote
    ## in a name/seq would break the command; fine for simulated data, but
    ## confirm inputs are sanitized before wider use
    cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
    piped = subprocess.Popen(cmd, shell=True, 
                       stdin=subprocess.PIPE,
                       stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       close_fds=True)
    ## the stdin pipe is unused (input arrives via echo); read all output
    _, fout = piped.stdin, piped.stdout
    return fout.read()

In [ ]:
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle

    Feeds the fasta-formatted names+seqs to muscle's stdin and returns
    muscle's stdout. The previous version shlex.split() a shell pipeline
    ("echo ... | muscle ...") WITHOUT shell=True, so the "|" and muscle
    arguments were handed to /bin/echo as literal arguments and muscle
    never ran. Piping stdin directly also avoids building a shell string
    from sequence data.
    """
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    piped = subprocess.Popen([data.muscle, "-quiet", "-in", "-"],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    return piped.communicate(inputstring)[0]

In [ ]:
%%timeit
out = alignfast(data1, names, seqs)

In [ ]:
print out

In [ ]:
def newmuscle(data, names, seqs):
    """ pipe fasta-formatted names+seqs into muscle and return its stdout """
    fasta = "\n".join(">{}\n{}".format(name, seq)
                      for name, seq in zip(names, seqs))
    proc = subprocess.Popen(data.muscle,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    aligned, _ = proc.communicate(fasta)
    return aligned

In [ ]:
%%timeit 
newmuscle(data1, names, seqs)

In [ ]:
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle """
    ## fasta-format the cluster and echo it through a shell pipe into muscle
    fasta = "\n".join(">" + name + "\n" + seq
                      for name, seq in zip(names, seqs))
    cmd = "/bin/echo '{}' | {} -quiet -in -".format(fasta, data.muscle)
    return subprocess.check_output(cmd, shell=True)

In [ ]:
out = alignfast(data1, names, seqs)
print out

In [ ]:
%%timeit
out = alignfast(data1, names, seqs)

In [ ]:
inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
print inputstring

In [ ]:
pipe = subprocess.Popen(shlex.split(cmd), 
                        stdin=subprocess.PIPE, 
                        stderr=subprocess.PIPE, stdout=subprocess.PIPE)
pipe.stdin.write("muscle -h")
get = pipe.communicate(input=pipe)
get

In [ ]:
def alignfast2(data, names, seqs):
    """ makes subprocess call to muscle

    Returns muscle's (stdout, stderr) pair. Note: in this variant the
    names are expected to already carry their ">" prefix.
    """
    inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))

    ## the old version shlex.split() a shell pipeline without shell=True
    ## (the "|" became a literal /bin/echo argument), then passed the
    ## command string itself as stdin via communicate(input=cmd), and
    ## mislabeled the (stdout, stderr) result as (stdin, stdout).
    ## Feed the fasta input to muscle's stdin directly instead.
    piped = subprocess.Popen([data.muscle, "-quiet", "-in", "-"],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    stdout, stderr = piped.communicate(input=inputstring)
    return stdout, stderr

In [ ]:
def alignfast3(data, names, seqs):
    """ makes subprocess call to muscle """
    ## names here are already ">"-prefixed, so join without adding one
    fasta = "\n".join(name + "\n" + seq for name, seq in zip(names, seqs))
    cmd = "/bin/echo '" + fasta + "' | " + data.muscle + " -quiet -in -"
    return subprocess.check_output(cmd, shell=True)

In [ ]:
def sortalign(stringnames):
    """ parses muscle output from a string to two lists

    Returns [names_tuple, seqs_tuple] with records sorted by name.
    """
    ## split into one chunk per fasta record
    objs = stringnames.split("\n>")
    ## rejoin wrapped sequence lines into "name\nACGT..." strings
    seqs = [i.split("\n")[0].replace(">", "")+"\n"+\
              "".join(i.split('\n')[1:]) for i in objs]

    aligned = [i.split("\n") for i in seqs]
    newnames = [">"+i[0] for i in aligned]
    seqs = [i[1] for i in aligned]
    ## transpose the name-sorted (name, seq) pairs into (names, seqs);
    ## the old "[(i, j) for i, j in zip(*...)]" form raised ValueError
    ## whenever a cluster had a number of sequences other than 2
    return list(zip(*sorted(zip(newnames, seqs),
                            key=lambda pair: pair[0])))

get clusters


In [ ]:
import gzip
import subprocess
import shlex
## read the first 10 clusters; records are delimited by "//\n//\n"
## NOTE(review): the gzip handle is never closed -- fine interactively
infile = gzip.open("test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz")
clusts = infile.read().split("//\n//\n")[:10]

In [ ]:
## NOTE: only the LAST cluster's names/seqs survive this loop -- it is just
## a quick way to grab one example cluster for the alignment benchmarks
for clust in clusts:
    lines = clust.split("\n")
    names = lines[::2]
    seqs = lines[1::2]
    
## inject a 2-base insertion ("AA") at position 7 of the first sequence so
## muscle has a gap to resolve
seqs[0] = list(seqs[0])
seqs[0].insert(7, "AA")
seqs[0] = "".join(seqs[0])

In [ ]:
%%timeit
alignfast(data1, names, seqs)

In [ ]:
%%timeit 
alignfast3(data1, names, seqs)

In [ ]:
out = alignfast3(data1, names, seqs)
#sorts = sortalign(out)
print out

In [ ]:
out

In [ ]:
def sortalign(stringnames):
    """ parses muscle output from a string to two lists

    Returns [names_tuple, seqs_tuple] with records sorted by name.
    """
    ## drop the leading ">" then split into one chunk per record
    objs = stringnames[1:].split("\n>")
    ## rejoin wrapped sequence lines into "name\nACGT..." strings
    seqs = [i.split("\n")[0].replace(">", "")+"\n"+\
              "".join(i.split('\n')[1:]) for i in objs]

    aligned = [i.split("\n") for i in seqs]
    newnames = [">"+i[0] for i in aligned]
    seqs = [i[1] for i in aligned]

    ## transpose the name-sorted (name, seq) pairs into (names, seqs);
    ## the old "[(i, j) for i, j in zip(*...)]" form raised ValueError
    ## whenever a cluster had a number of sequences other than 2
    return list(zip(*sorted(zip(newnames, seqs),
                            key=lambda pair: pair[0])))

In [ ]:
def parsemuscle(out):
    """ parse muscle string output into two sorted lists

    Splits the fasta string into names and joined sequences, sorts records
    by the integer in the last ";"-field of each name (skipping its one-char
    prefix), and returns (names, seqs) tuples. The previous version computed
    anames/aseqs but fell off the end without returning them.
    """
    lines = out[1:].split("\n>")
    names = [line.split("\n", 1)[0] for line in lines]
    ## everything after the first newline is sequence; strip line wraps
    seqs = [line.split("\n", 1)[1].replace("\n", "") for line in lines]
    tups = zip(names, seqs)
    anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(";")[-1][1:])))
    return anames, aseqs

In [ ]: