In [1]:
import Bio
import Bio.Restriction
import Bio.SeqIO
import Bio.SearchIO
import random
In [ ]:
ref_genome = "/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa"
In [10]:
with open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU") as ref_handle:
for ref_seq in Bio.SeqIO.parse(ref_handle, "fasta"): # iterate through the all (in this case one) seqs in the fasta file
ref_frag_seqs = Bio.Restriction.SwaI.catalyze(ref_seq.seq) # split that seqence into a tuple of sequences
print(ref_frag_seqs[:2]) # print the first two fragments
In [11]:
random.seed(8675309) # initialize this to a "random" number so this notebook is reproducible
def noise(lst, small_cutoff, stddev):
return [int(random.gauss(0, stddev)) + s for s in lst if s >= small_cutoff]
ref_frag_lengths = [len(ref_frag) for ref_frag in ref_frag_seqs]
noisy_frags = noise(ref_frag_lengths, small_cutoff=700, stddev=150)
print(noisy_frags)
In [12]:
def write_frags(fname, frags):
with open(fname, "w") as om:
for frag in frags:
# SOMA format uses frag sizes in kb, we don't per frag stddev, so use 0.0
line = str(frag/1000.0) + "\t" + str(0.0) + "\n"
om.write(line)
write_frags("ecoli_optmap", noisy_frags)
In [14]:
1+2
Out[14]:
In [15]:
! ls -l ecoli_optmap
In [16]:
interior_end_points = [random.randint(0, len(ref_seq)) for i in range(9)]
end_points = [0] + interior_end_points + [len(ref_seq)]
end_points.sort()
print(end_points)
In [17]:
contig_loci = {} # bookkeeping to see how clone the TWIN alignments with accumulated restriction fragment based locus match
with open("fake_contigs.fa", "w") as contigs_handle:
for interval_num, (start, end) in enumerate(zip(end_points, end_points[1:])):
subseq = ref_seq[start:end]
if random.randint(0,1) == 0: # reverse complement with 50% probability
subseq.reverse_complement()
print( len(subseq))
subseq.id = "fake_contig_" + str(interval_num)
contig_loci[subseq.id] = start
Bio.SeqIO.write(subseq, contigs_handle, "fasta")
! grep -n "^>" fake_contigs.f
In [18]:
with open("fake_contigs.fa") as f:
for s in Bio.SeqIO.parse( f, "fasta"):
print(len(s))
In [19]:
! ~/twin/digest.py fake_contigs.fa SwaI fake_contigs.silico
! ls -l fake_contigs*
In [ ]: