import libraries
In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import Restriction
import re
import numpy as np
define location of genome file; in this case the Aedes aegypti genome (available from https://www.vectorbase.org/organisms/aedes-aegypti/liverpool/aaegl3 )
In [2]:
genome_file = "/Users/ben/bioinfo/genome/AaegL3_scaffolds.fa"
define list of restriction enzyme recognition sites to use and initialize arrays for fragmenting and sizing
In [3]:
rxlist = [Restriction.BamHI.site, Restriction.BstYI.site, Restriction.PmeI.site, Restriction.EcoRI.site]
rx_re_list = []
rx_frag_list = []
rx_frag_temp = []
for i, rx in enumerate(rxlist):
rx_re_list.append(re.compile(r"|".join([str(Seq(rx, generic_dna)),str(Seq(rx, generic_dna).complement())])))
rx_frag_list.append(np.array([]))
rx_frag_temp.append([])
open fasta file
In [4]:
handle = open(genome_file, "rU")
iterate through fasta entries
In [ ]:
for i, record in enumerate(SeqIO.parse(handle, "fasta")):
for j, rx_re in enumerate(rx_re_list):
rx_frag = rx_re.split(str(record.seq))
rx_frag_temp[j] = [len(frag) for frag in rx_frag]
rx_frag_list[j] = np.r_[rx_frag_list[j],rx_frag_temp[j]]
iterate through results and report summary of fragment lengths
In [ ]:
for i, rx in enumerate(rxlist):
print("RX recognition site " + rx + " results in " + str(len(rx_frag_list[i])) + " fragments of mean length " + str(np.mean(rx_frag_list[i])) + " ± " + str(np.std(rx_frag_list[i])) + "bp")