python script to determine fragment length resulting from a restriction digest

ben matthews bmatthews@rockefeller.edu 8/12/2014

import libraries, prepare to plot inline


In [19]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import Restriction
import re
import numpy as np
import matplotlib
import seaborn
import pandas

%matplotlib inline

Define progress bar class: from here


In [3]:
import sys, time
from IPython.display import clear_output
try:
    from IPython.display import clear_output
    have_ipython = True
except ImportError:
    have_ipython = False

class ProgressBar:
    def __init__(self, iterations):
        self.iterations = iterations
        self.prog_bar = '[]'
        self.fill_char = '*'
        self.width = 40
        self.__update_amount(0)
        if have_ipython:
            self.animate = self.animate_ipython
        else:
            self.animate = self.animate_noipython

    def animate_ipython(self, iter):
        print('\r', self),
        sys.stdout.flush()
        self.update_iteration(iter + 1)

    def update_iteration(self, elapsed_iter):
        self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
        self.prog_bar += '  %d of %s complete' % (elapsed_iter, self.iterations)

    def __update_amount(self, new_amount):
        percent_done = int(round((new_amount / 100.0) * 100.0))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
        self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
        pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
        pct_string = '%d%%' % percent_done
        self.prog_bar = self.prog_bar[0:pct_place] + \
            (pct_string + self.prog_bar[pct_place + len(pct_string):])

    def __str__(self):
        return str(self.prog_bar)

define location of genome file; in this case the Aedes aegypti genome (available from https://www.vectorbase.org/organisms/aedes-aegypti/liverpool/aaegl3 )


In [4]:
genome_file = "/scratch/ben/rx_fragment_analysis/AaegL3_scaffolds.fa"

define list of restriction enzyme recognition sites to use and initialize arrays for fragmenting and sizing


In [29]:
rx_batch = Restriction.RestrictionBatch(first=[], suppliers=['N'])
#rx_list = [eval('Restriction.' + rx) for rx in rx_batch.elements()]

rx_list = [Restriction.SwaI, Restriction.TaqI, Restriction.TfiI, Restriction.TseI, Restriction.Tsp45I, Restriction.TspMI, Restriction.TspRI, Restriction.Tth111I, Restriction.XbaI, Restriction.XcmI, Restriction.XhoI, Restriction.XmaI, Restriction.XmnI, Restriction.ZraI]

rx_frag = [];

for i,rx in enumerate(rx_list):
    rx_frag.append([]);
    
print(rx_list)


[SwaI, TaqI, TfiI, TseI, Tsp45I, TspMI, TspRI, Tth111I, XbaI, XcmI, XhoI, XmaI, XmnI, ZraI]

open fasta file


In [30]:
genome_handle =  open(genome_file, "rU")
p = ProgressBar(4757)

iterate through fasta entries and restriction enzymes while adding fragment lengths to the appropriate rx_frag list


In [ ]:
for z, record in enumerate(SeqIO.parse(genome_handle, "fasta")):
    clear_output()
    p.animate(z)
    
    for i, rx in enumerate(rx_list):
        rx_frag[i] = rx_frag[i] + (np.diff(rx.search(record.seq)).tolist())


 [*                 3%                  ]  145 of 4757 complete

iterate through results and report summary of fragment lengths in text form


In [28]:
f = open('rx_frag_text.txt',"w")
for i, rx in enumerate(rx_frag):
    f.write(str(rx_list[i]) + " with recognition site " + rx_list[i].site + " results in " + str(len(rx_frag[i])) + " fragments of mean length " + str(np.mean(rx_frag[i])) + " ± " + str(np.std(rx_frag[i])) + "bp\n")

produce violin plot for log10 transformed fragment length data using the seaborn plotting package


In [46]:
newparams = {'axes.labelsize': 14, 'axes.linewidth': 1, 'savefig.dpi': 600, 
             'lines.linewidth': 1.5, 'figure.figsize': (8, 3),
             'figure.subplot.wspace': 0.4,
             'ytick.labelsize': 12, 'xtick.labelsize': 12,
             'ytick.major.pad': 5, 'xtick.major.pad': 5,
             'legend.fontsize': 12, 'legend.frameon': False, 
             'legend.handlelength': 1.5}

matplotlib.rcParams.update(rcdef)
matplotlib.rcParams.update(newparams)

conc_data = [np.log10(y) for y in rx_frag]
seaborn.violinplot(vals=conc_data,names=[str(rx) + ":" + str(rx.site) for rx in rx_list])
seaborn.axlabel('restriction enzyme', 'log10 fragment length')



In [22]:
import csv
f = csv.writer(open('rx_output.csv','a'),delimiter=',')
for i, rx in enumerate(rx_list):
    f.writerow(rx_frag[i])


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-22-acc04dbc07ed> in <module>()
      2 f = csv.writer(open('rx_output.csv','a'),delimiter=',')
      3 for i, rx in enumerate(rx_list):
----> 4     f.writerow(rx_frag[i])

KeyboardInterrupt: 

In [20]:
test_df = pandas.DataFrame(data=rx_frag,index=rx_list)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-20-bb6a409670d7> in <module>()
----> 1 test_df = pandas.DataFrame(data=rx_frag,index=rx_list)

/scratch/ben/miniconda3/lib/python3.4/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    241 
    242                 if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
--> 243                     arrays, columns = _to_arrays(data, columns, dtype=dtype)
    244                     columns = _ensure_index(columns)
    245 

/scratch/ben/miniconda3/lib/python3.4/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
   4738     if isinstance(data[0], (list, tuple)):
   4739         return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 4740                                dtype=dtype)
   4741     elif isinstance(data[0], collections.Mapping):
   4742         return _list_of_dict_to_arrays(data, columns,

/scratch/ben/miniconda3/lib/python3.4/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
   4815     else:
   4816         # list of lists
-> 4817         content = list(lib.to_object_array(data).T)
   4818     return _convert_object_array(content, columns, dtype=dtype,
   4819                                  coerce_float=coerce_float)

/scratch/ben/miniconda3/lib/python3.4/site-packages/pandas/lib.cpython-34m.so in pandas.lib.to_object_array (pandas/lib.c:53506)()

MemoryError: 

In [25]:
rx_list[1]


/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in image/svg+xml formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in text/html formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in image/jpeg formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in image/png formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in text/plain formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in application/pdf formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in application/json formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in text/latex formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in application/javascript formatter: type object 'RestrictionType' has no attribute 'size'
  FormatterWarning,
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-25-b569c84d0603> in <module>()
----> 1 rx_list[1]

/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/displayhook.py in __call__(self, result)
    253             self.write_format_data(format_dict, md_dict)
    254             self.update_user_ns(result)
--> 255             self.log_output(format_dict)
    256             self.finish_displayhook()
    257 

/scratch/ben/miniconda3/lib/python3.4/site-packages/IPython/core/displayhook.py in log_output(self, format_dict)
    225             self.shell.logger.log_write(format_dict['text/plain'], 'output')
    226         self.shell.history_manager.output_hist_reprs[self.prompt_count] = \
--> 227                                                     format_dict['text/plain']
    228 
    229     def finish_displayhook(self):

KeyError: 'text/plain'

In [26]:
string(rx_list[0])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-375500c2858b> in <module>()
----> 1 string(rx_list[0])

NameError: name 'string' is not defined

In [27]:
str(rx_list[1])


Out[27]:
'AbaSI'

In [ ]: