In [1]:
from __future__ import division
%pylab inline
import pandas as pd
from Bio import SeqIO
import nwalign
import itertools
from scipy import signal
import mahotas
import sklearn
import glob
import os.path as op
import editdistance


Populating the interactive namespace from numpy and matplotlib

In [36]:
def getDotAln(seqObj,window=180,step=1):
    seqDict = {}
    seq=str(seqObj.seq).upper()
    
    idxs=range(0,len(seq)-window,step)
    L = len(idxs)
    
    for i in idxs:
        section = seq[i:i+window]
        try:
            seqDict[section].append(i)
        except KeyError:
            seqDict[section] = [i]    
    
    M = np.zeros((L,L))
    
    for s1,s2 in itertools.combinations(seqDict.keys(),2):
        a1,a2 = nwalign.global_align(s1,s2,gap_open=-2,gap_extend=-1)
        score = np.array([m!=n for m,n in zip(a1,a2)]).sum()
#         score = editdistance.eval(s1,s2)
        for idx1,idx2 in itertools.product(seqDict[s1],seqDict[s2]):
#             if idx1 > idx2:
            i = idxs.index(idx1)
            j = idxs.index(idx2)
            M[i,j] = score
            M[j,i] = score
    return M

In [28]:
fastaFn = '../test_data/athaliana_test/m131029_030920_42175_c100583692550000001823087704281457_s1_p0.3.subreads.cen180.fa'
for seqObj in SeqIO.parse(fastaFn, format='fasta'):
    M = getDotAln(seqObj,30,30)
    plt.figure(figsize=(20,20))
    imshow(M,interpolation='none',cmap=cm.gray)
    break



In [37]:
timeit getDotAln(seqObj,30,30)


1 loops, best of 3: 7.43 s per loop