In [5]:
%pylab
import editdistance
import itertools


Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib

In [39]:
def getDotMatrix(seq, window=170,step=17):
    """Build an edit-distance dot matrix (dotplot) from a sequence string.

    The sequence is upper-cased and cut into windows of length ``window``
    starting at positions ``0, step, 2*step, ...``.  Every full-length
    window is used, including the one that ends exactly at the end of
    ``seq``.  ``M[i, j]`` holds the edit distance (``editdistance.eval``)
    between the i-th and the j-th window, so a value of 0 means the two
    windows are identical.

    Parameters
    ----------
    seq : str
        Nucleotide sequence
    window : int
        k-mer size for dotplot (default: 170)
    step : int
        step size for dotplot (default: 17)

    Returns
    -------
    M : np.ndarray
        Symmetric array of shape ``(L, L)``, where ``L`` is the number of
        windows (``L`` is 0 when ``seq`` is shorter than ``window``).

    Raises
    ------
    ValueError
        If ``step`` is 0 (raised by ``range``).
    """
    seq = seq.upper()

    # The "+ 1" keeps the last full window, which starts at
    # len(seq) - window; without it that window is silently dropped.
    starts = range(0, len(seq) - window + 1, step)
    L = len(starts)

    # Group matrix rows by window content, so each distinct pair of k-mers
    # is scored only once no matter how often the k-mers repeat.
    rowsByKmer = {}
    for row, start in enumerate(starts):
        rowsByKmer.setdefault(seq[start:start + window], []).append(row)

    # Cells comparing identical windows (the diagonal included) are never
    # written below and therefore keep their initial distance of 0.
    M = np.zeros((L, L))

    for (s1, rows1), (s2, rows2) in itertools.combinations(rowsByKmer.items(), 2):
        score = editdistance.eval(s1, s2)
        # Fill every (row of s1, row of s2) cell and its mirror image.
        M[np.ix_(rows1, rows2)] = score
        M[np.ix_(rows2, rows1)] = score
    return M

In [32]:
seq="TTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTG
CTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATT
GTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAA
ACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCAT"

In [40]:
# Build the dot matrix for `seq` from 170-character windows taken every
# 17 characters, then render it with the gray colormap.
M = getDotMatrix(
    seq,
    window=170,
    step=17,
)
imshow(M, cmap=cm.gray)


Out[40]:
<matplotlib.image.AxesImage at 0x115ad31d0>