In [1]:
import Bio
import Bio.Restriction
import Bio.SeqIO
import Bio.SearchIO
import random


/usr/lib64/python2.7/site-packages/Bio/SearchIO/__init__.py:211: BiopythonExperimentalWarning: Bio.SearchIO is an experimental submodule which may undergo significant changes prior to its future official release.
  BiopythonExperimentalWarning)

In [ ]:
ref_genome = "/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa"

In [10]:
with open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU") as ref_handle:
    for ref_seq in Bio.SeqIO.parse(ref_handle, "fasta"): # iterate through the all (in this case one) seqs in the fasta file
        ref_frag_seqs = Bio.Restriction.SwaI.catalyze(ref_seq.seq) # split that seqence into a tuple of sequences
    
print(ref_frag_seqs[:2]) # print the first two fragments


(Seq('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAG...TTT', SingleLetterAlphabet()), Seq('AAATTAAAATCCATCTTTCAACCTCTTGATATTTTGGGGGTTAATTAATCTTTC...TTT', SingleLetterAlphabet()))

In [11]:
random.seed(8675309) # initialize this to a "random" number so this notebook is reproducible

def noise(lst, small_cutoff, stddev):
    return [int(random.gauss(0, stddev)) + s for s in lst if s >= small_cutoff]

ref_frag_lengths = [len(ref_frag) for ref_frag in ref_frag_seqs]
noisy_frags = noise(ref_frag_lengths, small_cutoff=700, stddev=150)

print(noisy_frags)


[40226, 81532, 122840, 1194, 50221, 19636, 27276, 57108, 33243, 2605, 107188, 25540, 15351, 42488, 81284, 141588, 50132, 28308, 57921, 10670, 50153, 17596, 115609, 30380, 8532, 31972, 6119, 33543, 21278, 46756, 31057, 26520, 15478, 69668, 88918, 5889, 2709, 18869, 3227, 9980, 5752, 7733, 36140, 29378, 9736, 90791, 7930, 94844, 2194, 39616, 9094, 34753, 5255, 98464, 101786, 51787, 17789, 29444, 77084, 40071, 31578, 19094, 4456, 4226, 1239, 28256, 203819, 57944, 21107, 65100, 7656, 175914, 75772, 15125, 99637, 29570, 11774, 45392, 41521, 184441, 111685, 9905, 104362, 25395, 17475, 13946, 61524, 8200, 733, 12159, 36378, 3281, 28732, 73220, 8003, 2200, 6392, 8888, 14466, 87525, 41859, 13379, 15020, 63761, 34057, 71288, 55396, 761, 7451, 55237, 40286, 39667, 22059]

In [12]:
def write_frags(fname, frags):
    with open(fname, "w") as om:
        for frag in frags:
            # SOMA format uses frag sizes in kb, we don't per frag stddev, so use 0.0
            line = str(frag/1000.0) + "\t" + str(0.0) + "\n"
            om.write(line) 

write_frags("ecoli_optmap", noisy_frags)

In [14]:
1+2


Out[14]:
3

In [15]:
! ls -l ecoli_optmap


-rw------- 1 darshanw grad 1207 Dec 17 17:16 ecoli_optmap

In [16]:
interior_end_points = [random.randint(0, len(ref_seq)) for i in range(9)]
end_points = [0] + interior_end_points + [len(ref_seq)]
end_points.sort()

print(end_points)


[0, 656470, 1160129, 1236731, 1366606, 1573538, 2173411, 2430032, 4351221, 4541002, 4639675]

In [17]:
contig_loci = {} # bookkeeping to see how clone the TWIN alignments with accumulated restriction fragment based locus match

with open("fake_contigs.fa", "w") as contigs_handle:
    for interval_num, (start, end) in enumerate(zip(end_points, end_points[1:])):
        subseq = ref_seq[start:end]
        if random.randint(0,1) == 0:  # reverse complement with 50% probability
            subseq.reverse_complement()
        print( len(subseq))
        subseq.id = "fake_contig_" + str(interval_num)
        contig_loci[subseq.id] = start
        Bio.SeqIO.write(subseq, contigs_handle, "fasta")
        

! grep -n "^>" fake_contigs.f


656470
503659
76602
129875
206932
599873
256621
1921189
189781
98673
grep: fake_contigs.f: No such file or directory

In [18]:
with open("fake_contigs.fa") as f:
    for s in Bio.SeqIO.parse( f, "fasta"):
        print(len(s))


656470
503659
76602
129875
206932
599873
256621
1921189
189781
98673

In [19]:
! ~/twin/digest.py fake_contigs.fa SwaI  fake_contigs.silico

! ls -l fake_contigs*


/bin/sh: /s/chopin/a/grad/darshanw/twin/digest.py: No such file or directory
-rw------- 1 darshanw grad 4717958 Dec 22 21:54 fake_contigs.fa

In [ ]: