In [6]:
import json
import math
import bigfloat
import numpy as np
#import pandas as pd
#import seaborn as sns
#import matplotlib.mlab as mlab
#import matplotlib.pyplot as plt
from collections import OrderedDict
from sh import RNAfold, RNAcofold
PARAMFILE = '/usr/share/ViennaRNA/dna_mathews2004.par'
#sns.set_style("whitegrid", {"font.family": "DejaVu Sans"})
#sns.set_context("poster")
from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
rc('text', usetex=True)
In [25]:
def fold(seq1, seq2='', prog=RNAcofold, RNA=True):
seq = str(seq1) if not seq2 else '%s&%s' % (seq1,seq2)
out = prog(_in=seq, noPS=True) if RNA else prog(_in=seq, noPS=True, noconv=True, dangles=0, P=PARAMFILE)
seq, fold, _ = out.split('\n')
structure = str(fold.split()[0])
energy = float(fold.partition(' ')[2][1:-1].strip())
return structure, energy
def complement(s):
basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
letters = list(s)
letters = [basecomplement[base] for base in letters]
return ''.join(letters)
def revcom(s):
return complement(s[::-1])
def read_scaffold(filename):
f = open(filename, 'r')
return f.readline().strip()
def read_staples(filename):
f = open(filename, 'r')
staples = []
for line in f:
arms = line.strip().split('|')
staples.append(arms)
return staples
def scan_staple(staple):
data = []
for arm in staple:
data.append((arm, scan_arm(arm)))
return data
def scan_arm(arm_sequence):
#sliding window over scaffold
arm_data = []
for i in range(len(scaffold) - len(arm_sequence) + 1):
target = scaffold[i:i+len(arm_sequence)]
structure, energy = fold(arm_sequence, target, RNA=False)#[1]
hamming = hamming_distance(revcom(arm_sequence), target)
levenshtein = levenshtein_dist(revcom(arm_sequence), target)
#print arm_sequence, target, mismatches
arm_data.append((energy, hamming, levenshtein, target, structure))
#print (energy, hamming, levenshtein, target, structure)
#consider only local minima sites
local_min = []
for i in range(len(arm_data)):
e_prev = arm_data[i-1][0] if i-1>0 else 0.0
e = arm_data[i][0]
e_next = arm_data[i+1][0] if i+1<len(arm_data) else 0.0
if e < e_prev and e <= e_next:
local_min.append(arm_data[i])
sorted_by_energy = sorted(local_min, key=lambda tup: tup[0])
return arm_data
#return sorted_by_energy
In [17]:
def levenshtein_dist(source, target):
if len(source) < len(target):
return levenshtein(target, source)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)
# We call tuple() to force strings to be used as sequences
# ('c', 'a', 't', 's') - numpy uses them as values by default.
source = np.array(tuple(source))
target = np.array(tuple(target))
# We use a dynamic programming algorithm, but with the
# added optimization that we only need the last two rows
# of the matrix.
previous_row = np.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1
# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = np.minimum(
current_row[1:],
np.add(previous_row[:-1], target != s))
# Deletion (target grows shorter than source):
current_row[1:] = np.minimum(
current_row[1:],
current_row[0:-1] + 1)
previous_row = current_row
return previous_row[-1]
def hamming_distance(source, target):
assert len(source) == len(target)
count = 0
for p, t in zip(source, target):
count += 1 if p != t else 0
return count
print hamming_distance('AAACGCGC', 'AAAGCGCG')
print levenshtein_dist('AAACGCGC', 'AAAGCGCG')
print levenshtein_dist(revcom('ACGTACGT'), 'ACGTACGT')
print revcom('ACGTACGT')
In [27]:
def analyze(scaffold, staples, filename):
with open(filename, 'w') as output:
for i, staple in enumerate(staples):
data = scan_staple(staple)
for arm, arm_data in data:
output.write(arm + '\n')
for energy, hamming, levenshtein, target, structure in arm_data:
output.write("{},{},{},{},{}\n".format(energy, hamming, levenshtein, target, structure))
path = 'data/raw/sequences/M13_medium/'
scaffold_filename = path + 'scaffold.txt'
staples_filename = path + 'DXstaples.txt'
scaffold = read_scaffold(scaffold_filename)
staples = read_staples(staples_filename)
analyze(scaffold, staples, 'data/verification/M13_medium_raw.csv')
path = 'data/raw/sequences/DBS_medium/'
scaffold_filename = path + 'scaffold.txt'
staples_filename = path + 'DXstaples.txt'
scaffold = read_scaffold(scaffold_filename)
staples = read_staples(staples_filename)
analyze(scaffold, staples, 'data/verification/DBS_medium_raw.csv')
In [ ]:
path = 'data/'
filename_DB = 'DeBruijn_alpha.json'
filename_pUC19 = 'pUC19_alpha.json'
filename_M13 = 'M13_square.json'
filename_DB7k = 'DB_7k_square.json'
#ids, sequences, energies
#_, _, energies_DB = read_data(path + filename_DB)
#_, _, energies_pUC19 = read_data(path + filename_pUC19)
#_, _, energies_M13 = read_data(path + filename_M13)
_, _, energies_DB_short = read_data(path + filename_DB, short=True)
_, _, energies_pUC19_short = read_data(path + filename_pUC19, short=True)
_, _, energies_M13_short = read_data(path + filename_M13, short=True)
_, _, energies_DB7k_short = read_data(path + filename_DB7k, short=True)
In [7]:
#def read_data(filename, short=False):
# json_data = open(filename, 'r').read()
# raw_data = json.loads(json_data)
# seq = []
# seq_id = []
# energy = []
# for k1, v1 in raw_data.iteritems():
# if k1 == "name":
# continue
# stp_i = int(k1)
# staple = v1
# #print str(stp_i) + ' ' + v1['staple_sequence']
# for k2, v2 in staple.iteritems():
# if not k2.isdigit():
# continue
# arm_i = k2
# arm = v2
# #if short and len(arm['sequence']) > 8:
# # continue
# dG = arm['dG']
# min_dG = float(arm['min_dG'])
# seq.append(arm['sequence'])
# seq_id.append('stp_' + str(stp_i) + '_' + str(arm_i))
# local_min = []
# for i in range(len(dG)-1):
# if dG[i] < dG[i-1] and dG[i] <= dG[i+1]:
# local_min.append(float(dG[i]))
# sorted_by_energy = sorted(local_min)
# energy.append(numpy.array(sorted_by_energy))
# return seq_id, seq, energy