These functions take simulated rad-data and a "large" input genome (really it could just be a randomly generated fasta), and randomly inserts a handful of simulated rad tags into the genome. This guarantees that reference mapping will actually do something.
For PE simulated data R2 reads are reversed before they're inserted, because smalt is using the -l pe
flag, which looks for reads in this orientation --> <--.
Also, for PE inner mate distance is fixed at 50. If you wanna get ambitious you could draw this value from a distribution, but seems like more effort than it's worth.
This wants to be run from ipyrad/tests/data, but you can run it from anywhere if you update the paths.
In [1]:
## Utility functions.
def revcomp(sequence):
"returns reverse complement of a string"
sequence = sequence[::-1].strip()\
.replace("A", "t")\
.replace("T", "a")\
.replace("C", "g")\
.replace("G", "c").upper()
return sequence
def comp(seq):
""" returns a seq with small complement"""
return seq.replace("A", 't')\
.replace('T', 'a')\
.replace('C', 'g')\
.replace('G', 'c')\
.replace('n', 'Z')\
.upper()\
.replace("Z", "n")\
.replace("S", "s")
In [44]:
import itertools
import gzip
import random
from Bio import SeqIO
RAD_DATA = "./sim_rad_test_R1_.fastq.gz"
RAD_DATA_R1 = "./sim_pairddradmerge_R1_.fastq.gz"
RAD_DATA_R2 = "./sim_pairddradmerge_R2_.fastq.gz"
INPUT_CHR = "/Volumes/WorkDrive/ipyrad/refhacking/MusMT.fa"
OUTPUT_CHR = "./sim_mt_genome.fa"
N_INSERTS = 25
INSERT_SIZE = 50
## Get the SE reads
with gzip.open( RAD_DATA, 'rb' ) as infile:
seqs = []
for i, quatro in enumerate( itertools.izip_longest(*[iter(infile)]*4)):
seqs.append(quatro[1])
SE_samp = random.sample(seqs, N_INSERTS)
## Get the PE reads
with gzip.open( RAD_DATA_R1, 'rb' ) as in_R1, gzip.open( RAD_DATA_R2, 'rb' ) as in_R2:
## create iterators to sample 4 lines at a time
quart1 = itertools.izip(*[iter(in_R1)]*4)
## pair second read files, quarts samples both
quart2 = itertools.izip(*[iter(in_R2)]*4)
quarts = itertools.izip(quart1, quart2)
seqs = []
while True:
try:
quart = quarts.next()
except StopIteration:
break
seqs.append( [quart[0][1].strip(), quart[1][1].strip()] )
PE_samp = random.sample(seqs, N_INSERTS)
## Get the input fasta file (Mouse mtdna)
record = SeqIO.read(INPUT_CHR, "fasta")
record.seq = record.seq + record.seq
lenchr = len(record)
print(lenchr)
## Get the positions to insert the hits at, they will be non-overlapping
## divide the total length of the chromosome by the length of the total of
## all the reads to insert # of SE reads + PE reads 2*Readlength + innermate dist
# "working" locs = [x*(lenchr/(N_INSERTS+(N_INSERTS*2+INSERT_SIZE))) for x in range(N_INSERTS*2)]
locs = [x*((lenchr-(INSERT_SIZE*N_INSERTS))/(N_INSERTS+(N_INSERTS*2))) for x in range(N_INSERTS*2)]
## Insert SE
for i in range(N_INSERTS):
loc = locs[i]
record.seq = record.seq[:loc]+SE_samp[i].strip("\n")+record.seq[loc:]
print(locs)
print(len(record.seq))
samp = PE_samp
# Insert PE
for i in range(N_INSERTS):
loc = locs[i+N_INSERTS]
# print("do loc - {}".format(loc))
# For each insertion point get all sequence before it, then insert the first read,
# then insert INSERT_SIZE more bases from the original sequence, then paste the second
# read, then the rest of the sequence.
# For Read 2 [::-1] gives the reverse
record.seq = record.seq[:loc]+PE_samp[i][0]+record.seq[loc:loc+INSERT_SIZE]+\
revcomp(PE_samp[i][1])+record.seq[loc+INSERT_SIZE:] # revcomp
## Methods for generating other types of reads besides illumina pe
#PE_samp[i][1][::1]+record.seq[loc+INSERT_SIZE:] # rev
#comp(PE_samp[i][1])+record.seq[loc+INSERT_SIZE:] # comp
print(len(record.seq))
output_handle = open(OUTPUT_CHR, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()
In [13]:
## Make SE psuedo-genome
import itertools
import gzip
import random
from Bio import SeqIO
RAD_DATA = "./sim_rad_test_R1_.fastq.gz"
INPUT_CHR = "/Volumes/WorkDrive/ipyrad/refhacking/MusMT.fa"
OUTPUT_CHR = "./sim_mt_genome.fa"
N_INSERTS = 25
with gzip.open( RAD_DATA, 'rb' ) as infile:
seqs = []
for i, quatro in enumerate( itertools.izip_longest(*[iter(infile)]*4)):
seqs.append(quatro[1])
samp = random.sample(seqs, N_INSERTS)
record = SeqIO.read(INPUT_CHR, "fasta")
lenchr = len(record)
print(lenchr)
## Evenly space the hits so they don't overlap
locs = [x*(lenchr/N_INSERTS) for x in range(N_INSERTS)]
print(locs)
## The old way where they could sometimes overlap
#locs = random.sample(range(lenchr), N_INSERTS)
for i, loc in enumerate(locs):
record.seq = record.seq[:loc]+samp[i].strip("\n")+record.seq[loc:]
print(len(record.seq))
output_handle = open(OUTPUT_CHR, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()
In [1]:
import itertools
import gzip
import random
from Bio import SeqIO
RAD_DATA_R1 = "./sim_pairddradmerge_R1_.fastq.gz"
RAD_DATA_R2 = "./sim_pairddradmerge_R2_.fastq.gz"
INPUT_CHR = "./sim_mt_genome.fa"
OUTPUT_CHR = "./sim_mt_genome_SE+PE.fa"
N_INSERTS = 25
INSERT_SIZE = 50
with gzip.open( RAD_DATA_R1, 'rb' ) as in_R1, gzip.open( RAD_DATA_R2, 'rb' ) as in_R2:
## create iterators to sample 4 lines at a time
quart1 = itertools.izip(*[iter(in_R1)]*4)
## pair second read files, quarts samples both
quart2 = itertools.izip(*[iter(in_R2)]*4)
quarts = itertools.izip(quart1, quart2)
seqs = []
while True:
try:
quart = quarts.next()
except StopIteration:
break
seqs.append( [quart[0][1].strip(), quart[1][1].strip()] )
samp = random.sample(seqs, N_INSERTS)
record = SeqIO.read(INPUT_CHR, "fasta")
lenchr = len(record)
print(lenchr)
locs = random.sample(range(lenchr), N_INSERTS)
locs.sort()
# Get a random sample of base positions to insert at from across the sim genome
for i, loc in enumerate(locs):
print("do loc - {}".format(loc))
print(samp[i][0], revcomp(samp[i][1]))
# For each insertion point get all sequence before it, then insert the first read,
# then insert INSERT_SIZE more bases from the original sequence, then paste the second
# read, then the rest of the sequence.
# For Read 2 [::-1] gives the reverse
record.seq = record.seq[:loc]+samp[i][0]+record.seq[loc:loc+INSERT_SIZE]+\
revcomp(samp[i][1])+record.seq[loc+INSERT_SIZE:]
#samp[i][1].strip("\n")[::-1]+record.seq[loc+INSERT_SIZE:]
print(len(record.seq))
output_handle = open(OUTPUT_CHR, "w")
SeqIO.write(record, output_handle, "fasta")
output_handle.close()
In [7]:
import gzip
import shutil
outfiles = ["/private/tmp/ipyrad-test-pair/test-refseq-pair_edits/3L0_R1_.fastq.gz", "/private/tmp/ipyrad-test-pair/test-refseq-pair_edits/3L0_R2_.fastq.gz"]
for f in outfiles:
print(f)
with open(f, 'r') as f_in, gzip.open(f, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
In [17]:
sq = "AAATGCGCCGGGCGCGAA"
print("AAATGCGCCGGGCGCGAA"[::-1])
print(revcomp(sq))
sq[::-1].strip().replace("T","a")
Out[17]:
In [25]:
print((PE_samp[1][1])[::-1])
print(PE_samp[1][1])
In [43]:
record = SeqIO.read(INPUT_CHR, "fasta")
record.seq = record.seq + record.seq
print record.seq