The simulations software simrrls is available at github.com/dereneaton/simrrls.
In [1]:
import os
import glob
import gzip
import itertools
import numpy as np
import simrrls
import ipyrad as ip
print "simrrls v.{}".format(simrrls.__version__)
print "ipyrad v.{}".format(ip.__version__)
In [31]:
## the dir to write data to
DIR = "./ipsimdata/"
## if it doesn't exist make it
if not os.path.exists(DIR):
os.makedirs(DIR)
## if theres anything in it, remove it
for oldfile in glob.glob(os.path.join(DIR, "*")):
os.remove(oldfile)
In [32]:
%%bash -s $DIR
## simulate rad_example (includes indels)
simrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000
## simulate larger rad data set
#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000
## simulate pairddrad_example
simrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000
## simulate pairddrad_example
simrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000
## simulate pairgbs_example
simrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000
## simulate pairddrad_wmerge_example
simrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \
-i1 -50 -i2 100 -L 1000
## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)
simrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \
-i1 -50 -i2 100 -L 1000
In [33]:
import itertools
import gzip
import numpy as np
In [34]:
## Utility function
def revcomp(sequence):
"returns reverse complement of a string"
sequence = sequence[::-1].strip()\
.replace("A", "t")\
.replace("T", "a")\
.replace("C", "g")\
.replace("G", "c").upper()
return sequence
In [35]:
def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr):
"""
Create a simulated genome file with RAD data interspersed.
"""
## start genome fasta sequence with 5000 random bp
genome = np.random.choice(list("ATGC"), 5000)
## read in the RAD data files
dat1 = gzip.open(R1s, 'r')
qiter1 = itertools.izip(*[iter(dat1)]*4)
if R2s:
dat2 = gzip.open(R2s, 'r')
qiter2 = itertools.izip(*[iter(dat2)]*4)
else:
qiter2 = itertools.izip(*[iter(str, 1)]*4)
## sample unique reads from rads
uniqs = []
locid = 0
while len(uniqs) < n_inserts:
## grab a read and get locus id
qrt1 = qiter1.next()
qrt2 = qiter2.next()
iloc = []
ilocid = int(qrt1[0].split("_")[1][5:])
## go until end of locus copies
while ilocid == locid:
iloc.append([qrt1[1].strip(), qrt2[1].strip()])
qrt1 = qiter1.next()
qrt2 = qiter2.next()
ilocid = int(qrt1[0].split("_")[1][5:])
## sample one read
uniqs.append(iloc[0])
locid += 1
## insert RADs into genome
for ins in range(n_inserts):
## get read, we leave the barcode on cuz it won't hurt
r1 = np.array(list(uniqs[ins][0]))
r2 = np.array(list(revcomp(uniqs[ins][1])))
## add R1 to genome
genome = np.concatenate((genome, r1), axis=0)
if qrt2[0]:
## add insert size
howlong = np.random.uniform(insert_low, insert_high)
chunk = np.random.choice(list("ATGC"), int(howlong))
#print howlong
genome = np.concatenate((genome, chunk), axis=0)
## add read2
genome = np.concatenate((genome, r2), axis=0)
## add spacer between loci
genome = np.concatenate((genome, np.random.choice(list("ATGC"), 5000)), axis=0)
print("input genome is {} bp".format(genome.shape[0]))
print("inserted {} loci into genome".format(n_inserts))
with open(out_chr, "w") as fasta:
header = ">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\n"\
.format(genome.shape[0])
fasta.write(header)
outseq = list(genome)
for i in range(0, len(outseq), 70):
fasta.write("{}\n".format("".join(outseq[i:i+70])))
In [36]:
## Meta args
N_INSERTS = 500
INSERT_LOW = 250
INSERT_HIGH = 300
In [37]:
## SE RAD data
DATA_R1 = os.path.join(DIR, "rad_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "rad_example_genome.fa")
big = RAD_to_genome(DATA_R1, 0, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
In [38]:
## SE GBS data
DATA_R1 = os.path.join(DIR, "gbs_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "gbs_example_genome.fa")
RAD_to_genome(DATA_R1, 0, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
In [39]:
## PAIR ddRAD data
DATA_R1 = os.path.join(DIR, "pairddrad_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
In [40]:
## PAIR ddRAD data
DATA_R1 = os.path.join(DIR, "pairddrad_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
In [41]:
## PAIR GBS data
DATA_R1 = os.path.join(DIR, "pairgbs_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairgbs_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairgbs_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS,
INSERT_LOW, INSERT_HIGH,
OUTPUT_CHR)
In [43]:
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests1")
data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz'))
data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt'))
data1.set_params("max_alleles_consens", 4) ## raise this for now
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa'))
## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
In [45]:
NLOCI = 1000
## check results
assert all(data1.stats.clusters_hidepth == NLOCI)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == NLOCI)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS
In [38]:
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests2")
data1.set_params(2, os.path.join(DIR, "gbs_example_R1_.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "gbs_example_barcodes.txt"))
data1.set_params("max_alleles_consens", 4) ## raise this for now
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "gbs_example_genome.fa"))
## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
In [39]:
## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS
In [52]:
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests3")
data1.set_params(2, os.path.join(DIR, "pairddrad_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairddrad_example_barcodes.txt"))
data1.set_params(7, "pairddrad")
data1.set_params(8, ("TGCAG", "CCGG"))
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "pairddrad_example_genome.fa"))
## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
In [54]:
## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS
In [62]:
data2.stats_dfs
Out[62]:
In [ ]:
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests4")
data1.set_params(2, os.path.join(DIR, "pairgbs_wmerge_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairgbs_wmerge_example_barcodes.txt"))
data1.set_params(7, "pairgbs")
data1.set_params(8, ("TGCAG", "TGCAG"))
##...
## branch into an assembly for denovo+reference
data3 = data1.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')
## branch into an assembly for reference
data4 = data1.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')
## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)
In [ ]:
## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS
In [63]:
%%bash -s $DIR
## compressed dir/ w/ all data files
tar -zcvf ipsimdata.tar.gz $1