In [1]:
from __future__ import division
%pylab inline
import pandas as pd
from Bio import SeqIO
import nwalign
import itertools
from scipy import signal
import mahotas
import sklearn
import glob
import os.path as op
import editdistance
In [36]:
def getDotAln(seqObj,window=180,step=1):
seqDict = {}
seq=str(seqObj.seq).upper()
idxs=range(0,len(seq)-window,step)
L = len(idxs)
for i in idxs:
section = seq[i:i+window]
try:
seqDict[section].append(i)
except KeyError:
seqDict[section] = [i]
M = np.zeros((L,L))
for s1,s2 in itertools.combinations(seqDict.keys(),2):
a1,a2 = nwalign.global_align(s1,s2,gap_open=-2,gap_extend=-1)
score = np.array([m!=n for m,n in zip(a1,a2)]).sum()
# score = editdistance.eval(s1,s2)
for idx1,idx2 in itertools.product(seqDict[s1],seqDict[s2]):
# if idx1 > idx2:
i = idxs.index(idx1)
j = idxs.index(idx2)
M[i,j] = score
M[j,i] = score
return M
In [28]:
fastaFn = '../test_data/athaliana_test/m131029_030920_42175_c100583692550000001823087704281457_s1_p0.3.subreads.cen180.fa'
for seqObj in SeqIO.parse(fastaFn, format='fasta'):
M = getDotAln(seqObj,30,30)
plt.figure(figsize=(20,20))
imshow(M,interpolation='none',cmap=cm.gray)
break
In [37]:
timeit getDotAln(seqObj,30,30)