In [6]:
import json
import math
import bigfloat
import numpy as np
#import pandas as pd
#import seaborn as sns
#import matplotlib.mlab as mlab
#import matplotlib.pyplot as plt
from collections import OrderedDict
from sh import RNAfold, RNAcofold

PARAMFILE = '/usr/share/ViennaRNA/dna_mathews2004.par'

#sns.set_style("whitegrid", {"font.family": "DejaVu Sans"})
#sns.set_context("poster")

from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
rc('text', usetex=True)

In [25]:
def fold(seq1, seq2='', prog=RNAcofold, RNA=True):
    seq = str(seq1) if not seq2 else '%s&%s' % (seq1,seq2)
    out = prog(_in=seq, noPS=True) if RNA else prog(_in=seq, noPS=True, noconv=True, dangles=0, P=PARAMFILE)
    seq, fold, _ = out.split('\n')
    structure = str(fold.split()[0])
    energy = float(fold.partition(' ')[2][1:-1].strip())
    return structure, energy

def complement(s): 
    basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 
    letters = list(s) 
    letters = [basecomplement[base] for base in letters] 
    return ''.join(letters)
def revcom(s):
    return complement(s[::-1])

def read_scaffold(filename):
    f = open(filename, 'r')
    return f.readline().strip()

def read_staples(filename):
    f = open(filename, 'r')
    staples = []
    for line in f:
        arms = line.strip().split('|')
        staples.append(arms) 
    return staples

def scan_staple(staple):
    data = []
    for arm in staple:
        data.append((arm, scan_arm(arm)))
    return data

def scan_arm(arm_sequence):  
    #sliding window over scaffold
    arm_data = []
    for i in range(len(scaffold) - len(arm_sequence) + 1):
        target = scaffold[i:i+len(arm_sequence)]
        structure, energy = fold(arm_sequence, target, RNA=False)#[1]
        hamming = hamming_distance(revcom(arm_sequence), target)
        levenshtein = levenshtein_dist(revcom(arm_sequence), target)
        #print arm_sequence, target, mismatches
        arm_data.append((energy, hamming, levenshtein, target, structure))
        #print (energy, hamming, levenshtein, target, structure)
    #consider only local minima sites
    local_min = []
    for i in range(len(arm_data)):
        e_prev = arm_data[i-1][0] if i-1>0 else 0.0
        e = arm_data[i][0]
        e_next = arm_data[i+1][0] if i+1<len(arm_data) else 0.0
        if e < e_prev and e <= e_next:
            local_min.append(arm_data[i])
    sorted_by_energy = sorted(local_min, key=lambda tup: tup[0])
    return arm_data
    #return sorted_by_energy

In [17]:
def levenshtein_dist(source, target):
    if len(source) < len(target):
        return levenshtein(target, source)
    # So now we have len(source) >= len(target).
    if len(target) == 0:
        return len(source)
    # We call tuple() to force strings to be used as sequences
    # ('c', 'a', 't', 's') - numpy uses them as values by default.
    source = np.array(tuple(source))
    target = np.array(tuple(target))
    # We use a dynamic programming algorithm, but with the
    # added optimization that we only need the last two rows
    # of the matrix.
    previous_row = np.arange(target.size + 1)
    for s in source:
        # Insertion (target grows longer than source):
        current_row = previous_row + 1
        # Substitution or matching:
        # Target and source items are aligned, and either
        # are different (cost of 1), or are the same (cost of 0).
        current_row[1:] = np.minimum(
                current_row[1:],
                np.add(previous_row[:-1], target != s))
        # Deletion (target grows shorter than source):
        current_row[1:] = np.minimum(
                current_row[1:],
                current_row[0:-1] + 1)
        previous_row = current_row
    return previous_row[-1]

def hamming_distance(source, target):
    assert len(source) == len(target)
    count = 0
    for p, t in zip(source, target):
        count += 1 if p != t else 0
    return count

print hamming_distance('AAACGCGC', 'AAAGCGCG')
print levenshtein_dist('AAACGCGC', 'AAAGCGCG')
print levenshtein_dist(revcom('ACGTACGT'), 'ACGTACGT')
print revcom('ACGTACGT')


5
2
0
ACGTACGT

Energy Scan


In [27]:
def analyze(scaffold, staples, filename):
    with open(filename, 'w') as output:  
        for i, staple in enumerate(staples):
            data = scan_staple(staple)
            for arm, arm_data in data:
                output.write(arm + '\n')
                for energy, hamming, levenshtein, target, structure in arm_data:
                    output.write("{},{},{},{},{}\n".format(energy, hamming, levenshtein, target, structure))

path = 'data/raw/sequences/M13_medium/'
scaffold_filename = path + 'scaffold.txt'
staples_filename = path + 'DXstaples.txt'

scaffold = read_scaffold(scaffold_filename)
staples = read_staples(staples_filename)

analyze(scaffold, staples, 'data/verification/M13_medium_raw.csv')

path = 'data/raw/sequences/DBS_medium/'
scaffold_filename = path + 'scaffold.txt'
staples_filename = path + 'DXstaples.txt'

scaffold = read_scaffold(scaffold_filename)
staples = read_staples(staples_filename)

analyze(scaffold, staples, 'data/verification/DBS_medium_raw.csv')


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-93ab9f27976c> in <module>()
     15 staples = read_staples(staples_filename)
     16 
---> 17 analyze(scaffold, staples, 'data/verification/M13_medium_raw.csv')
     18 
     19 path = 'data/raw/sequences/DBS_medium/'

<ipython-input-27-93ab9f27976c> in analyze(scaffold, staples, filename)
      2     with open(filename, 'w') as output:
      3         for i, staple in enumerate(staples):
----> 4             data = scan_staple(staple)
      5             for arm, arm_data in data:
      6                 output.write(arm + '\n')

<ipython-input-25-31b7613c8d68> in scan_staple(staple)
     30     data = []
     31     for arm in staple:
---> 32         data.append((arm, scan_arm(arm)))
     33     return data
     34 

<ipython-input-25-31b7613c8d68> in scan_arm(arm_sequence)
     38     for i in range(len(scaffold) - len(arm_sequence) + 1):
     39         target = scaffold[i:i+len(arm_sequence)]
---> 40         structure, energy = fold(arm_sequence, target, RNA=False)#[1]
     41         hamming = hamming_distance(revcom(arm_sequence), target)
     42         levenshtein = levenshtein_dist(revcom(arm_sequence), target)

<ipython-input-25-31b7613c8d68> in fold(seq1, seq2, prog, RNA)
      1 def fold(seq1, seq2='', prog=RNAcofold, RNA=True):
      2     seq = str(seq1) if not seq2 else '%s&%s' % (seq1,seq2)
----> 3     out = prog(_in=seq, noPS=True) if RNA else prog(_in=seq, noPS=True, noconv=True, dangles=0, P=PARAMFILE)
      4     seq, fold, _ = out.split('\n')
      5     structure = str(fold.split()[0])

/usr/local/lib/python2.7/dist-packages/sh.pyc in __call__(self, *args, **kwargs)
    767 
    768 
--> 769         return RunningCommand(cmd, call_args, stdin, stdout, stderr)
    770 
    771 

/usr/local/lib/python2.7/dist-packages/sh.pyc in __init__(self, cmd, call_args, stdin, stdout, stderr)
    328 
    329             if self.should_wait:
--> 330                 self.wait()
    331 
    332 

/usr/local/lib/python2.7/dist-packages/sh.pyc in wait(self)
    332 
    333     def wait(self):
--> 334         self._handle_exit_code(self.process.wait())
    335         return self
    336 

/usr/local/lib/python2.7/dist-packages/sh.pyc in wait(self)
   1150             if self.exit_code is None:
   1151                 self.log.debug("exit code not set, waiting on pid")
-> 1152                 pid, exit_code = os.waitpid(self.pid, 0)
   1153                 self.exit_code = self._handle_exit_code(exit_code)
   1154             else:

KeyboardInterrupt: 

Conversion to probability


In [ ]:
path = 'data/'
filename_DB = 'DeBruijn_alpha.json'
filename_pUC19 = 'pUC19_alpha.json'
filename_M13 = 'M13_square.json'
filename_DB7k = 'DB_7k_square.json'

#ids, sequences, energies
#_, _, energies_DB = read_data(path + filename_DB)
#_, _, energies_pUC19 = read_data(path + filename_pUC19)
#_, _, energies_M13 = read_data(path + filename_M13)

_, _, energies_DB_short = read_data(path + filename_DB, short=True)
_, _, energies_pUC19_short = read_data(path + filename_pUC19, short=True)
_, _, energies_M13_short = read_data(path + filename_M13, short=True)
_, _, energies_DB7k_short = read_data(path + filename_DB7k, short=True)

In [7]:
#def read_data(filename, short=False):
#    json_data = open(filename, 'r').read()
#    raw_data = json.loads(json_data)
#    seq = []
#    seq_id = []
#    energy = []
#    for k1, v1 in raw_data.iteritems():
#        if k1 == "name":
#            continue
#        stp_i = int(k1)
#        staple = v1
#        #print str(stp_i) + ' ' + v1['staple_sequence']
#        for k2, v2 in staple.iteritems():
#            if not k2.isdigit():
#                continue
#            arm_i = k2
#            arm = v2
#            #if short and len(arm['sequence']) > 8:
#            #    continue
#            dG = arm['dG']
#            min_dG = float(arm['min_dG'])
#            seq.append(arm['sequence'])
#            seq_id.append('stp_' + str(stp_i) + '_' + str(arm_i))
#            local_min = []
#            for i in range(len(dG)-1):
#                if dG[i] < dG[i-1] and dG[i] <= dG[i+1]:
#                    local_min.append(float(dG[i]))
#            sorted_by_energy = sorted(local_min)
#            energy.append(numpy.array(sorted_by_energy))
#    return seq_id, seq, energy