import libraries, prepare to plot inline
In [19]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import Restriction
import re
import numpy as np
import matplotlib
import seaborn
import pandas
%matplotlib inline
Define progress bar class: from here
In [3]:
import sys, time
from IPython.display import clear_output
try:
from IPython.display import clear_output
have_ipython = True
except ImportError:
have_ipython = False
class ProgressBar:
def __init__(self, iterations):
self.iterations = iterations
self.prog_bar = '[]'
self.fill_char = '*'
self.width = 40
self.__update_amount(0)
if have_ipython:
self.animate = self.animate_ipython
else:
self.animate = self.animate_noipython
def animate_ipython(self, iter):
print('\r', self),
sys.stdout.flush()
self.update_iteration(iter + 1)
def update_iteration(self, elapsed_iter):
self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
self.prog_bar += ' %d of %s complete' % (elapsed_iter, self.iterations)
def __update_amount(self, new_amount):
percent_done = int(round((new_amount / 100.0) * 100.0))
all_full = self.width - 2
num_hashes = int(round((percent_done / 100.0) * all_full))
self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
pct_string = '%d%%' % percent_done
self.prog_bar = self.prog_bar[0:pct_place] + \
(pct_string + self.prog_bar[pct_place + len(pct_string):])
def __str__(self):
return str(self.prog_bar)
define location of genome file; in this case the Aedes aegypti genome (available from https://www.vectorbase.org/organisms/aedes-aegypti/liverpool/aaegl3 )
In [4]:
genome_file = "/scratch/ben/rx_fragment_analysis/AaegL3_scaffolds.fa"
define list of restriction enzyme recognition sites to use and initialize arrays for fragmenting and sizing
In [29]:
rx_batch = Restriction.RestrictionBatch(first=[], suppliers=['N'])
#rx_list = [eval('Restriction.' + rx) for rx in rx_batch.elements()]
rx_list = [Restriction.SwaI, Restriction.TaqI, Restriction.TfiI, Restriction.TseI, Restriction.Tsp45I, Restriction.TspMI, Restriction.TspRI, Restriction.Tth111I, Restriction.XbaI, Restriction.XcmI, Restriction.XhoI, Restriction.XmaI, Restriction.XmnI, Restriction.ZraI]
rx_frag = [];
for i,rx in enumerate(rx_list):
rx_frag.append([]);
print(rx_list)
open fasta file
In [30]:
genome_handle = open(genome_file, "rU")
p = ProgressBar(4757)
iterate through fasta entries and restriction enzymes while adding fragment lengths to the appropriate rx_frag list
In [ ]:
for z, record in enumerate(SeqIO.parse(genome_handle, "fasta")):
clear_output()
p.animate(z)
for i, rx in enumerate(rx_list):
rx_frag[i] = rx_frag[i] + (np.diff(rx.search(record.seq)).tolist())
iterate through results and report summary of fragment lengths in text form
In [28]:
f = open('rx_frag_text.txt',"w")
for i, rx in enumerate(rx_frag):
f.write(str(rx_list[i]) + " with recognition site " + rx_list[i].site + " results in " + str(len(rx_frag[i])) + " fragments of mean length " + str(np.mean(rx_frag[i])) + " ± " + str(np.std(rx_frag[i])) + "bp\n")
produce violin plot for log10 transformed fragment length data using the seaborn plotting package
In [46]:
newparams = {'axes.labelsize': 14, 'axes.linewidth': 1, 'savefig.dpi': 600,
'lines.linewidth': 1.5, 'figure.figsize': (8, 3),
'figure.subplot.wspace': 0.4,
'ytick.labelsize': 12, 'xtick.labelsize': 12,
'ytick.major.pad': 5, 'xtick.major.pad': 5,
'legend.fontsize': 12, 'legend.frameon': False,
'legend.handlelength': 1.5}
matplotlib.rcParams.update(rcdef)
matplotlib.rcParams.update(newparams)
conc_data = [np.log10(y) for y in rx_frag]
seaborn.violinplot(vals=conc_data,names=[str(rx) + ":" + str(rx.site) for rx in rx_list])
seaborn.axlabel('restriction enzyme', 'log10 fragment length')
In [22]:
import csv
f = csv.writer(open('rx_output.csv','a'),delimiter=',')
for i, rx in enumerate(rx_list):
f.writerow(rx_frag[i])
In [20]:
test_df = pandas.DataFrame(data=rx_frag,index=rx_list)
In [25]:
rx_list[1]
In [26]:
string(rx_list[0])
In [27]:
str(rx_list[1])
Out[27]:
In [ ]: