python script to determine fragment length resulting from a restriction digest

ben matthews bmatthews@rockefeller.edu 8/12/2014

import libraries


In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import Restriction
import re
import numpy as np

define location of genome file; in this case the Aedes aegypti genome (available from https://www.vectorbase.org/organisms/aedes-aegypti/liverpool/aaegl3 )


In [2]:
genome_file = "/Users/ben/bioinfo/genome/AaegL3_scaffolds.fa"

define list of restriction enzyme recognition sites to use and initialize arrays for fragmenting and sizing


In [3]:
rxlist = [Restriction.BamHI.site, Restriction.BstYI.site, Restriction.PmeI.site, Restriction.EcoRI.site]
rx_re_list = []
rx_frag_list = []
rx_frag_temp = []

for i, rx in enumerate(rxlist):
    rx_re_list.append(re.compile(r"|".join([str(Seq(rx, generic_dna)),str(Seq(rx, generic_dna).complement())])))
    rx_frag_list.append(np.array([]))
    rx_frag_temp.append([])

open fasta file


In [4]:
handle =  open(genome_file, "rU")

iterate through fasta entries


In [ ]:
for i, record in enumerate(SeqIO.parse(handle, "fasta")):
    for j, rx_re in enumerate(rx_re_list):
        rx_frag = rx_re.split(str(record.seq))
        rx_frag_temp[j] = [len(frag) for frag in rx_frag]
        rx_frag_list[j] = np.r_[rx_frag_list[j],rx_frag_temp[j]]

iterate through results and report summary of fragment lengths


In [ ]:
for i, rx in enumerate(rxlist):
    print("RX recognition site " + rx + " results in " + str(len(rx_frag_list[i])) + " fragments of mean length " + str(np.mean(rx_frag_list[i])) + " ± " + str(np.std(rx_frag_list[i])) + "bp")