The simulations software simrrls is available at github.com/dereneaton/simrrls. First we create a directory called ipsimdata/
and then simulate data and put it in this directory. Simrrls has a few dependencies that are required.
In [52]:
## name for our sim data directory
DIR = "./ipsimdata"
## A mouse MT genome used to stick our data into.
INPUT_CHR = "/home/deren/Downloads/MusMT.fa"
## number of RAD loci to simulate
NLOCI = 1000
## number of inserts to reference genome and insert size
N_INSERTS = 100
INSERT_SIZE = 50
In [53]:
import os
import shutil
## rm sim dir if it exists, else create it.
while 1:
if os.path.exists(DIR):
shutil.rmtree(DIR)
else:
os.mkdir(DIR)
## make sure dir is finished removing
if os.path.exists(DIR):
break
## rm testdirs if they exist
TESTDIRS = ["./testref1", "./testref2", "./testref3", "./testref4"]
for testdir in TESTDIRS:
if os.path.exists(testdir):
shutil.rmtree(testdir)
In [54]:
import simrrls
print 'simrrls', simrrls.__version__
In [55]:
import subprocess
## this the bash command-line call to simrrls
cmd = """\
simrrls -o {odir}/{oname} -f {form} -dm 10 -ds 2
-I 0.01 -L {nloci} -i1 {imin} -i2 {imax}
"""
## simulate rad_example (includes indels)
call = cmd.format(odir=DIR, oname="rad_example", form="rad",
imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())
## simulate gbs_example (includes indels)
call = cmd.format(odir=DIR, oname="gbs_example", form="gbs",
imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())
## simulate pairddrad_example (includes indels)
call = cmd.format(odir=DIR, oname="pairddrad_example", form="pairddrad",
imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())
## simulate gbs_example (includes indels)
call = cmd.format(odir=DIR, oname="pairgbs_example", form="pairgbs",
imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())
## simulate pairddrad_example (includes indels and merged reads)
call = cmd.format(odir=DIR, oname="pairddrad_wmerge_example", form="pairddrad",
imin=-50, imax=50, nloci=NLOCI)
print call
subprocess.check_output(call.split())
## simulate gbs_example (includes indels and merged reads)
call = cmd.format(odir=DIR, oname="pairgbs_wmerge_example", form="pairgbs",
imin=-50, imax=50, nloci=NLOCI)
print call
subprocess.check_output(call.split())
Out[55]:
These functions take simulated rad-data and a "large" input genome (really it could just be a randomly generated fasta), and randomly inserts a handful of simulated rad tags into the genome. This guarantees that reference mapping will actually do something. For PE simulated data R2 reads are reversed before they're inserted, because smalt is using the -l pe
flag, which looks for reads in this orientation --> <--
. Also, for PE inner mate distance is fixed at 50. If you wanna get ambitious you could draw this value from a distribution, but seems like more effort than it's worth. This wants to be run from ipsimdata/
, but you can run it from anywhere if you update the paths.
In [56]:
import itertools
import gzip
import random
from Bio import SeqIO
In [57]:
## Utility function
def revcomp(sequence):
"returns reverse complement of a string"
sequence = sequence[::-1].strip()\
.replace("A", "t")\
.replace("T", "a")\
.replace("C", "g")\
.replace("G", "c").upper()
return sequence
In [58]:
def RAD_to_genome(R1s, R2s, n_inserts, insert_sz, input_chr, out_chr):
"""
Writes simulated rad data into a genome fasta file.
Assumes RAD data file has names formatted like the
output from simrrls.
"""
## read in the full genome file
record = SeqIO.read(input_chr, "fasta")
lenchr = len(record.seq)
## read in the RAD data files
dat1 = gzip.open(R1s, 'r')
qiter1 = itertools.izip(*[iter(dat1)]*4)
if R2s:
dat2 = gzip.open(R2s, 'r')
qiter2 = itertools.izip(*[iter(dat2)]*4)
else:
qiter2 = itertools.izip(*[iter(str, 1)]*4)
## sample unique reads from rads
uniqs = []
locid = 0
while len(uniqs) < n_inserts:
## grab a read and get locus id
qrt1 = qiter1.next()
qrt2 = qiter2.next()
iloc = []
ilocid = int(qrt1[0].split("_")[1][5:])
## go until end of locus copies
while ilocid == locid:
iloc.append([qrt1[1].strip(), qrt2[1].strip()])
qrt1 = qiter1.next()
qrt2 = qiter2.next()
ilocid = int(qrt1[0].split("_")[1][5:])
## sample one read
uniqs.append(random.sample(iloc, 1)[0])
locid += 1
## insert RADs into genome
sloc = 100
for ins in range(n_inserts):
## get read, we leave the barcode on cuz it won't hurt
r1 = uniqs[ins][0]
r2 = uniqs[ins][1]
if not r2:
record.seq = record.seq[:sloc]+r1+\
record.seq[sloc:]
else:
record.seq = record.seq[:sloc]+r1+\
record.seq[sloc:sloc+insert_sz]+\
revcomp(r2)+\
record.seq[sloc+insert_sz:]
sloc += 300
## write to file
rlen = len(qrt1[1].strip())
if r2:
rlen *= 2
print("input genome is {} bp".format(lenchr))
print('imputed {} loci {} bp in len'.format(n_inserts, rlen))
print("new pseudo-genome is {} bp".format(len(record.seq)))
output_handle = open(out_chr, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()
In [59]:
## SE RAD data
DATA_R1 = DIR+"/rad_example_R1_.fastq.gz"
OUTPUT_CHR = DIR+"/rad_example_genome.fa"
RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)
In [60]:
## SE GBS data
DATA_R1 = DIR+"/gbs_example_R1_.fastq.gz"
OUTPUT_CHR = DIR+"/gbs_example_genome.fa"
RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)
In [61]:
## PAIR ddRAD data
DATA_R1 = DIR+"/pairddrad_wmerge_example_R1_.fastq.gz"
DATA_R2 = DIR+"/pairddrad_wmerge_example_R2_.fastq.gz"
OUTPUT_CHR = DIR+"/pairddrad_wmerge_example_genome.fa"
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)
In [62]:
## PAIR GBS data
DATA_R1 = DIR+"/pairgbs_wmerge_example_R1_.fastq.gz"
DATA_R2 = DIR+"/pairgbs_wmerge_example_R2_.fastq.gz"
OUTPUT_CHR = DIR+"/pairgbs_wmerge_example_genome.fa"
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)
In [63]:
import ipyrad as ip
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref1")
data1.set_params(2, DIR+'/rad_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/rad_example_barcodes.txt')
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/rad_example_genome.fa')
## assemble both
data1.run(force=True)
data2.run(force=True)
## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
In [64]:
import ipyrad as ip
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref2")
data1.set_params(2, DIR+'/gbs_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/gbs_example_barcodes.txt')
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/gbs_example_genome.fa')
## assemble both
data1.run(force=True)
data2.run(force=True)
## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
In [65]:
import ipyrad as ip
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref3")
data1.set_params(2, DIR+'/pairddrad_wmerge_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/pairddrad_wmerge_example_barcodes.txt')
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/pairddrad_wmerge_example_genome.fa')
## assemble both
data1.run(force=True)
data2.run(force=True)
## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
In [66]:
import ipyrad as ip
## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref4")
data1.set_params(2, DIR+'/pairgbs_wmerge_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/pairgbs_wmerge_example_barcodes.txt')
## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/pairgbs_wmerge_example_genome.fa')
## assemble both
data1.run(force=True)
data2.run(force=True)
## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS
In [67]:
import glob
import os
## rm dir if it exists, else create it.
for tdir in glob.glob("testref[1-9]"):
shutil.rmtree(tdir)
## rm reference index
for iref in glob.glob(DIR+"/*_genome.fa.*"):
os.remove(iref)
In [68]:
%%bash
## compressed dir/ w/ all data files
tar -zcvf ipsimdata.tar.gz ipsimdata/*