In [5]:
%pylab
import editdistance
import itertools
In [39]:
def getDotMatrix(seq, window=170, step=17):
    """Create a self-alignment (dotplot) matrix for a nucleotide sequence.

    The sequence is upper-cased and cut into windows of ``window``
    characters whose start positions are ``step`` apart. Every pair of
    distinct window strings is scored with ``editdistance.eval`` and the
    score is written symmetrically into the matrix.

    Parameters
    ----------
    seq : str
        Nucleotide sequence
    window : int
        k-mer size for dotplot (default: 170)
    step : int
        step size for dotplot (default: 17)

    Returns
    -------
    M : np.ndarray
        Square array with one row/column per window, in order of start
        position. ``M[i, j]`` holds the edit distance between window ``i``
        and window ``j``. The diagonal and any pair of windows with
        identical text are never visited and therefore stay 0. The array
        has shape ``(0, 0)`` when ``seq`` is shorter than ``window``.
    """
    seq = seq.upper()

    # The stop is exclusive, so "+ 1" is needed to keep the last full-length
    # window, which starts exactly at len(seq) - window.
    starts = range(0, len(seq) - window + 1, step)
    n_windows = len(starts)

    # Map each distinct window string to the matrix rows where it occurs.
    # Storing row numbers (not sequence offsets) avoids a lookup per cell.
    rows_by_kmer = {}
    for row, start in enumerate(starts):
        rows_by_kmer.setdefault(seq[start:start + window], []).append(row)

    M = np.zeros((n_windows, n_windows))

    # Score each unordered pair of distinct k-mers once, then copy the score
    # to every (row, column) position where that pair of k-mers occurs.
    for kmer_a, kmer_b in itertools.combinations(rows_by_kmer, 2):
        score = editdistance.eval(kmer_a, kmer_b)
        for i, j in itertools.product(rows_by_kmer[kmer_a], rows_by_kmer[kmer_b]):
            M[i, j] = score
            M[j, i] = score
    return M
In [32]:
seq="TTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTG
CTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATT
GTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAA
ACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCATAAAAACTAGACGGAACATTCTCAGAAACTTCTTTGTGATGTGGGCATTGAACTCACGGAGCTGAACCTTCCTTTGGATTGAGCAGTTTTGACAAACTCTTCCTTTATAATCTGCAGGTGGATATTTGGAGTGCTTTGAAGCCTTTTTGGAAACGGGAGTATCGTCACATAAAAATAGACAGAACATTCTCAGAACCTTGATTGTGATGTGTGTTCTCCACTAACAGGGTTGAACCTTTCTTTTGACAGAACTGTTGTGAAACATTCTTTTTATAGAATCTGGAAGTGGATATTTGGAAAGCTTTGAGGATTTCGTTGGAAACGGGAATATCTTCAAATCAAATCTAGCCAGAACATTCTCATAAACTAGTTTGTGATGTGTGTGCTTAACTAACAGAGCTGAACCTTTCTTTTCATAGAGCGGTTTTGAAACACTCTTTTTGTAGAATCTGCATGTGGATATTTGGAAAGCTTTGAGGATTTCTTTGGAAACGGGAATATCTTCACTTAAAATCTAGACAGAACATTCTCAGAGACTGCTCTGTGCTGTGTGCGTTCAACTCACAGTGTTTAAGTTTTCTTTTCATTCAGCAGTTTGGAAATGCTCTGTTTGTAACGTCTGCAAGTGGATATTTTGACCTCTTTGAGGCCTTCATTGGAAACGGATTTTTTTCAGGTAAGGCTATACAGAACATTCTCAGAAACTACTTTGTGATGTGTGCATTCAACTCACCGAATTGAACCTTCCTTTTGATACAGCAGTTTTGAAACACTCTTTGTTTAGAATCTGCAAGTGGATATTTGGAGCACATTTATGCCTGTGGTAGAAAAGGAAATATCTTCACATAAAAACTAGACAGAATAATCTCAGAAACATGTTTATGCTGTATCTACTCAACTAAGTGTGCTGAACATTTCTATTAATAGAGCAGTTTTGAGACACTCTTCTTTTCGAATCTGCAAGTGGATATTTGGCTAGATTTGAGGATTTCGTTGGAAACGGGATTATATATAAAAAGTAGACAGCACATTCTCAGAAAAATCTCTGTGAGGATGGCATTCAAGTGCCAGTGTTGAACATTCTCTTTCATAAAGCAGGTGTGAACACAAGATTTTGTAGTATATGGAACTGGACATTTGGGGTGCTTTGTGACCTATTGTGAAAAAGGAAATATCTTCCCATATAAACTACGCAGAACCTTCGCAGAAACACCTTTGTGATGTTTGCATTGAAGTCAGAGAGTTGTACATTCCCTTTCATAGAGCAGCTTTCAAACACTCTTTTTGTAGTATCTGGAGATGGACATTTACATCGCTTTGAGGCCTATGGTGAAATAGGAAATCTCTTCGCAT"
In [40]:
# Build the window-vs-window edit-distance matrix for the sequence above,
# spelling out the window and step sizes explicitly.
M = getDotMatrix(seq, window=170, step=17)

# Show the matrix as a grayscale image; kept as the last expression so the
# cell still displays the object returned by imshow.
imshow(M, cmap=cm.gray)
Out[40]: