In [1]:
import ipyrad as ip ## for RADseq assembly
print ip.__version__ ## print version
In [2]:
## clear existing test_dir/
import shutil
import os
if os.path.exists("./test_pairddrad/"):
shutil.rmtree("./test_pairddrad/")
In [7]:
data1 = ip.Assembly('data1')
In [8]:
data1.get_params()
In [11]:
data1.set_params('project_dir', "./test_pairddrad")
data1.set_params('raw_fastq_path', "./data/sim_pairddrad_*.gz")
data1.set_params('barcodes_path', "./data/sim_pairddrad_barcodes.txt")
data1.set_params('restriction_overhang', ("TGCAG", "AATT"))
data1.set_params('datatype', 'pairddrad')
#data1.set_params(17, 1)
data1.get_params()
In [ ]:
#data1.link_fastqs(path="test_pairddrad/test_pairddrad_fastqs/", append=True)
In [12]:
data1.step1()
print data1.stats
In [13]:
data1.step2()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
In [14]:
# data1.step3() ## do all samples
# data1.step3("1A0") ## do one sample
# data1.step3(["1A0", "1B0", "1C0"]) ## do list of samples
data1.step3()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
In [16]:
%%time
data1.step4()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
In [1]:
#data1.save("upto2")
## still figuring out how best to save...
import ipyrad as ip
data1 = ip.load_assembly("test_pairddrad/test_pairddrad.assembly")
In [17]:
data1.step5()#["1B0"], force=True)
print data1.stats
In [ ]:
data1.step5(["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
In [18]:
data1.step6()#(["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
In [19]:
data1.step7()
In [21]:
data1.outfiles
Out[21]:
In [28]:
less $data1.outfiles.loci
In [ ]:
import pandas as pd
print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0)
In [ ]:
## how to merge Assembly objects
data1.files.edits['1B0']
data1.samples['1B0'].files
In [ ]:
print data1.stats
In [ ]:
for i in data1.log:
print i
In [ ]:
import numpy as np
In [ ]:
adds = np.ones([10, 4], dtype='int16')
adds.shape
In [ ]:
np.array.([30, 1], dtype='int16')
In [ ]:
longest_reads = 190
nreads = int(1e5)
arr = np.empty([nreads,200, 4], dtype='int16')
arr[0][0:adds.shape[0]] = adds
arr[0]
In [ ]:
import glob
import os
import numpy
In [ ]:
catg = numpy.load("test_pairddrad/test_pairddrad_consens/1B0.catg")
In [ ]:
catg[0]
In [ ]:
cats1 = glob.glob(os.path.join(
data1.dirs.consens,
data1.samples['1B0'].name+"_tmpcats.*"))
In [ ]:
cats1.sort(key=lambda x: int(x.split(".")[-1]))
In [ ]:
catg = numpy.load(cats1[0])
lastg = numpy.load(cats1[-1])
In [ ]:
catg[0]
In [ ]:
lastg.shape
In [ ]:
def alignfast(data, names, seqs):
""" makes subprocess call to muscle """
inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
piped = subprocess.Popen(cmd, shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
close_fds=True)
_, fout = piped.stdin, piped.stdout
return fout.read()
In [ ]:
def alignfast(data, names, seqs):
""" makes subprocess call to muscle """
inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
piped = subprocess.Popen(shlex.split(cmd),
stdout=subprocess.PIPE)
return piped.stdout.read()
In [ ]:
%%timeit
out = alignfast(data1, names, seqs)
In [ ]:
print out
In [ ]:
def newmuscle(data, names, seqs):
inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
return subprocess.Popen(data.muscle,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)\
.communicate(inputstring)[0]
In [ ]:
%%timeit
newmuscle(data1, names, seqs)
In [ ]:
def alignfast(data, names, seqs):
""" makes subprocess call to muscle """
inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
piped = subprocess.check_output(cmd, shell=True)
return piped
In [ ]:
out = alignfast(data1, names, seqs)
print out
In [ ]:
%%timeit
out = alignfast(data1, names, seqs)
In [ ]:
inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
print inputstring
In [ ]:
pipe = subprocess.Popen(shlex.split(cmd),
stdin=subprocess.PIPE,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
pipe.stdin.write("muscle -h")
get = pipe.communicate(input=pipe)
get
In [ ]:
def alignfast2(data, names, seqs):
""" makes subprocess call to muscle """
inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
piped = subprocess.Popen(shlex.split(cmd),
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdin, stdout = piped.communicate(input=cmd)
return stdin, stdout
In [ ]:
def alignfast3(data, names, seqs):
""" makes subprocess call to muscle """
inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
piped = subprocess.check_output(cmd,
shell=True)
return piped
In [ ]:
def sortalign(stringnames):
""" parses muscle output from a string to two lists """
objs = stringnames.split("\n>")
seqs = [i.split("\n")[0].replace(">", "")+"\n"+\
"".join(i.split('\n')[1:]) for i in objs]
aligned = [i.split("\n") for i in seqs]
newnames = [">"+i[0] for i in aligned]
seqs = [i[1] for i in aligned]
## return in sorted order by names
sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs),
key=lambda pair: pair[0]))]
return sortedtups
In [ ]:
import gzip
import subprocess
import shlex
infile = gzip.open("test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz")
clusts = infile.read().split("//\n//\n")[:10]
In [ ]:
for clust in clusts:
lines = clust.split("\n")
names = lines[::2]
seqs = lines[1::2]
seqs[0] = list(seqs[0])
seqs[0].insert(7, "AA")
seqs[0] = "".join(seqs[0])
In [ ]:
%%timeit
alignfast(data1, names, seqs)
In [ ]:
%%timeit
alignfast3(data1, names, seqs)
In [ ]:
out = alignfast3(data1, names, seqs)
#sorts = sortalign(out)
print out
In [ ]:
out
In [ ]:
def sortalign(stringnames):
""" parses muscle output from a string to two lists """
objs = stringnames[1:].split("\n>")
seqs = [i.split("\n")[0].replace(">", "")+"\n"+\
"".join(i.split('\n')[1:]) for i in objs]
aligned = [i.split("\n") for i in seqs]
newnames = [">"+i[0] for i in aligned]
seqs = [i[1] for i in aligned]
## return in sorted order by names
sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs),
key=lambda pair: pair[0]))]
return sortedtups
In [ ]:
def parsemuscle(out):
""" parse muscle string output into two sorted lists """
lines = out[1:].split("\n>")
names = [line.split("\n", 1)[0] for line in lines]
seqs = [line.split("\n", 1)[1].replace("\n", "") for line in lines]
tups = zip(names, seqs)
anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(";")[-1][1:])))
In [ ]: