In [1]:
cd ..
In [2]:
from difflib2.get_lcs_cut2 import _get_lcs_cut2
In [3]:
# Two short sample strings sharing several runs of characters; they are
# reused below to compare the LCS-based blocks against difflib's output.
s1,s2 = '0-abcdferhzzzzttt','0+abcdbghhrzzzz$$$'
# _get_lcs_cut2 returns a pair: the second item is the nested
# (value, rest) chain consumed by unfold() below.  The first item is
# presumably the LCS length (lcsc2 unpacks it as `length`) -- TODO confirm.
l, lcs = _get_lcs_cut2(s1,s2)
In [4]:
def unfold(lcs):
    """Yield the values stored in a nested ``(value, rest)`` chain.

    ``lcs`` is either ``None`` (an empty chain) or an indexable object whose
    item 0 is a value and whose item 1 is the remainder of the chain.  The
    chain is first walked down to its ``None`` terminator; the collected
    values are then yielded in the opposite order to the one in which they
    were visited.

    This is a generator: nothing is walked until it is iterated.
    """
    visited = []
    node = lcs
    while node is not None:
        visited.append(node[0])
        node = node[1]
    yield from visited[::-1]
# unfold() is a generator; materialise it so the index pairs can be
# displayed here and reused by later cells.
y_ulcs = unfold(lcs)
ulcs = list(y_ulcs)
ulcs
Out[4]:
In [5]:
from difflib import Match
In [6]:
def _y_make_matching_blocks(ulcs,s1,s2):
ulcs = iter(ulcs)
pi, pj = next(ulcs)
oi, oj = pi, pj
leng = 1
for i,j in ulcs:
if i == pi+1 and j == pj+1:
pi,pj = i,j
leng +=1
else :
yield Match(oi,oj,leng)
oi,oj = i,j
pi,pj = i,j
leng = 1
yield Match(oi,oj,leng)
# stdlib return an empty match at the end :
yield Match(len(s1),len(s2),0)
def make_matching_blocks(ulcs, s1, s2):
    """Eager form of ``_y_make_matching_blocks``: return its blocks as a list."""
    blocks = []
    blocks.extend(_y_make_matching_blocks(ulcs, s1, s2))
    return blocks
In [7]:
from difflib import SequenceMatcher
list(SequenceMatcher(None, s1,s2).get_matching_blocks()) , make_matching_blocks(ulcs,s1,s2)
Out[7]:
In [7]:
In [8]:
from difflib2.get_lcs_cut2 import _get_lcs_cut2
from difflib import SequenceMatcher
def _unfold(lcs):
""" unfold apsudo linked list,
generated by LCS alfirythmes.
"""
ll =[]
while True:
if lcs is None:
break
val = lcs[0]
lcs = lcs[1]
ll.append(val)
return list(reversed(ll))
def _make_matching_blocks(ulcs,s1,s2):
pi, pj = ulcs[0]
oi, oj = ulcs[0]
leng = 1
matches= []
for i,j in ulcs[1:]:
if i == pi+1 and j == pj+1:
pi,pj = i,j
leng +=1
else :
matches.append( Match(oi,oj,leng) )
oi,oj = i,j
pi,pj = i,j
leng = 1
matches.append( Match(oi,oj,leng) )
# stdlib return an empty match at the end :
matches.append( Match(len(s1),len(s2),0) )
return matches
class ExactSequenceMatcher(SequenceMatcher):
    """``SequenceMatcher`` whose matching blocks come from ``_get_lcs_cut2``.

    ``get_matching_blocks`` is overridden to build its blocks from the index
    pairs returned by ``get_matching_index`` instead of using the standard
    library's longest-matching-block recursion.
    """

    def __init__(self, isjunk, a, b, autojunk=True):
        super().__init__(isjunk, a, b, autojunk)

    def find_longest_match(self, *args, **kwargs):
        """Not supported by this matcher; always raises ``NotImplementedError``."""
        raise NotImplementedError('this is the longuest common substring problem')

    def get_matching_blocks(self):
        """Return the ``Match`` blocks for ``self.a`` / ``self.b``.

        The list ends with the zero-length sentinel block, as produced by
        ``_make_matching_blocks``.
        """
        ulcs = self.get_matching_index(self.a, self.b)
        return _make_matching_blocks(ulcs, self.a, self.b)

    def get_matching_index(self, a, b):
        """Return the list of matched ``(i, j)`` index pairs for ``a`` and ``b``.

        The pairs are obtained by unfolding the second item of
        ``_get_lcs_cut2(a, b)``.  A list (``_unfold``) is returned rather than
        a generator so that callers may index it or iterate it repeatedly.
        """
        return _unfold(_get_lcs_cut2(a, b)[1])
In [9]:
s1 = "xada"
s2 = "bada"
#print(ExactSequenceMatcher(None, s1*10,s2*10).find_longest_match())
#list(SequenceMatcher(None, s1*10,s2*10).find_longest_match())
#_get_lcs_cut2(s1,s2)
In [10]:
list(filter( lambda x:x[0]!= '_' ,dir(SequenceMatcher)))
Out[10]:
In [11]:
from collections import Counter
In [12]:
Counter('abc')
Out[12]:
In [13]:
import pygments
import pygments.lexer
import pygments.lexers.agile
import pygments.formatters
from IPython.display import HTML
In [14]:
PL = pygments.lexers.agile.PythonLexer()
HF = pygments.formatters.HtmlFormatter(noclasses=True, nowrap=True)
In [15]:
s1 = """# really, making a copyright is useless
# but just to test multiline removel I'll do that
# this is copyrighted
import pygments
import pygments.lexer
import pygments.lexers.agile
'''
donez, donnez moi,
le pouvoir d'achat
donez, donnez moi,
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''
from IPython.display import HTML
""".splitlines()
s2 = """import pygments
import pygments.lexers.agile
import pygments.formatters
'''
Donnez, donnez moi,
le pouvoir d'achat
donnez, donnez moi,
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''
a multil line added
which shoudl make a green block
from IPython.display import HTML
""".splitlines()
hs1 = pygments.highlight('\n'.join(s1), PL, HF ).splitlines()
hs2 = pygments.highlight('\n'.join(s2), PL, HF ).splitlines()
In [16]:
hs1
Out[16]:
In [17]:
s1
Out[17]:
In [18]:
s2
Out[18]:
In [19]:
res = pygments.highlight("""print('hi')
1+1
""", PL, HF )
HTML(res)
Out[19]:
In [20]:
import difflib
In [21]:
SequenceMatcher
Out[21]:
In [22]:
# Capture two revisions of the same notebook file through IPython's `!`
# shell capture; each variable holds the command's output lines.
# NOTE(review): the content depends on the current git history, so results
# of the cells below change as the repository moves on.
fs1 = !git show HEAD:examples/MatchGroups.ipynb
fs2 = !git show HEAD~1:examples/MatchGroups.ipynb
In [23]:
ss1 = set(fs1)
ss2 = set(fs2)
us = ss1.intersection(ss2)
In [24]:
len(ss1),len(ss2),len(fs1),len(fs2),len(us)
Out[24]:
In [25]:
a,b = zip(*((i,e)for i,e in enumerate('abc')))
In [26]:
type(b)
Out[26]:
In [27]:
something = 'abcdefghij'
vowel = 'aeiouy'
condition = lambda x:x in vowel
#zip(* ((i,_) for i,_ in enumerate(something) if e in vowel) )
In [28]:
#import itertools
#a,b = itertools.tee(zip(*((i,e) for i,e in enumerate(something) if e in us)))
#a,b = itertools.tee(( (_,_) for _ in something))
#list(a),list(b)
In [29]:
zip(*((i,e) for i,e in enumerate(something) if e in vowel))
Out[29]:
In [30]:
a,b = zip(*((i,e) for i,e in enumerate(something) if e in vowel))
a,b
Out[30]:
In [31]:
from itertools import chain
def lcsc2(fs1, fs2):
    """Run ``_get_lcs_cut2`` on a reduced problem and map the result back.

    Elements that occur in only one of the two inputs cannot take part in a
    common subsequence, so they are filtered out first.  A leading run of
    pairwise-equal elements of the filtered sequences is then stripped and
    matched directly.

    Parameters
    ----------
    fs1, fs2 : sequence of hashable items
        The sequences to compare.  They are iterated more than once.

    Returns
    -------
    (length, pairs)
        ``length`` is the length reported by ``_get_lcs_cut2`` for the
        reduced problem plus the number of stripped leading matches.
        ``pairs`` is a lazy, single-use iterator of
        ``(index_in_fs1, index_in_fs2)`` tuples.  When the inputs share no
        element, ``(0, <empty iterator>)`` is returned.
    """
    common = set(fs1).intersection(set(fs2))
    kept1 = [(pos, item) for pos, item in enumerate(fs1) if item in common]
    kept2 = [(pos, item) for pos, item in enumerate(fs2) if item in common]
    if not kept1:
        # Nothing in common: zip(*[]) below would yield nothing to unpack.
        return 0, iter(())

    # s?m maps a position in the filtered sequence back to the original index.
    s1m, seq1 = zip(*kept1)
    s2m, seq2 = zip(*kept2)

    prefix = 0
    for ca, cb in zip(seq1, seq2):
        if ca != cb:
            break
        prefix += 1
    # The last element of the common prefix is left in the reduced problem
    # (presumably so _get_lcs_cut2 never receives an empty sequence).  With no
    # common prefix nothing may be stripped: a negative offset would slice
    # from the end of the sequences instead.
    delta = max(prefix - 1, 0)

    seq1, seq2 = seq1[delta:], seq2[delta:]
    length, back_index = _get_lcs_cut2(seq1, seq2)
    length += delta

    reduced_pairs = chain(
        ((k, k) for k in range(delta)),
        ((i + delta, j + delta) for i, j in unfold(back_index)),
    )
    return length, ((s1m[i], s2m[j]) for i, j in reduced_pairs)
In [32]:
#list(zip(*[(ii,_) for ii,_ in enumerate("hello") if _ in {'a','o','h','l'}]))
a,b = tuple(zip(*((ii,_) for ii,_ in enumerate('aaa') if _ in {'a'})))
#tuple(zip(*[(ii,_) for ii,_ in enumerate('aaa') if _ in {'a'}]))
In [33]:
lcsc2('aaa','aba')
Out[33]:
In [34]:
# NOTE: unfold() is a generator function and lcsc2() returns a lazy iterator,
# so the first two timings do not include walking the resulting index pairs,
# whereas the difflib line builds the complete list of blocks.
%timeit uf = unfold(_get_lcs_cut2(fs1,fs2)[1])
%timeit uf = lcsc2(fs1,fs2)
%timeit ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())
In [35]:
lcsc2(fs1,fs2)
Out[35]:
In [36]:
(len(list(unfold(_get_lcs_cut2(fs1,fs2)[1]))),
len(list(lcsc2(fs1,fs2)[1])),
_get_lcs_cut2(fs1,fs2)[0],
lcsc2(fs1,fs2)[0])
Out[36]:
In [37]:
%matplotlib inline
In [38]:
import matplotlib.pyplot as plt
# Split the matched (i, j) pairs of each method into x / y coordinate tuples:
# (xa, ya) from the plain _get_lcs_cut2 run, (xb, yb) from lcsc2.
xa, ya = tuple(zip(*unfold(_get_lcs_cut2(fs1,fs2)[1])))
xb, yb = tuple(zip(*lcsc2(fs1,fs2)[1]))
# First figure: both sets of match points overlaid (lcsc2 in red, smaller
# markers) so differences between the two selections are visible.
fig,ax = plt.subplots(1,1)
ax.scatter(xa, ya ,linewidths=0)
ax.scatter(xb, yb ,c='red', linewidths=0, s=10)
ax.set_title('selected dominant match point in each case')
ax.set_aspect('equal')
# Second and third figures: first-sequence indices of one method against the
# other, then the same for second-sequence indices.
# NOTE(review): scatter needs equal-length inputs, so this assumes both
# methods return the same number of pairs.
plt.figure()
plt.scatter(xb,xa)
plt.figure()
plt.scatter(ya,yb)
Out[38]:
In [39]:
uf = lcsc2(fs1,fs2)[1]
ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())
In [40]:
len(list(uf)),sum([x.size for x in list(ufs)])
Out[40]:
In [41]:
uf = unfold(_get_lcs_cut2(s1,s2)[1])
In [42]:
from collections import namedtuple
In [43]:
# Edit-script records; ``old`` / ``new`` are indices into the old and new
# sequence, or None when the record has no counterpart on that side.
# NOTE: this rebinds ``Match``, shadowing the three-field ``difflib.Match``
# imported earlier in the notebook.  Functions that build
# ``Match(i, j, size)`` fail if they are called after this cell has run.
Match = namedtuple('Match', ('old', 'new'))
Insertion = namedtuple('Insertion', ('old', 'new'))
Deletion = namedtuple('Deletion', ('old', 'new'))


class SInsertion(Insertion):
    """Insertion flagged by ``mark_line_diff`` as the new side of a one-line change."""


class SDeletion(Deletion):
    """Deletion flagged by ``mark_line_diff`` as the old side of a one-line change."""


def matchfill(seq, old_len=None, new_len=None):
    """Expand matched index pairs into a full edit script.

    Parameters
    ----------
    seq : iterable of (i, j)
        Matched index pairs, expected in strictly increasing order on both
        sides.  A pair that does not move both sides forward is not emitted
        as a ``Match``.
    old_len, new_len : int, optional
        Lengths of the old and new sequences.  When given, the elements that
        follow the last match are emitted as trailing ``Deletion`` /
        ``Insertion`` records.  When omitted, the script stops at the last
        match.

    Returns
    -------
    list
        ``Deletion(k, None)`` for every skipped old index, ``Insertion(None, k)``
        for every skipped new index and ``Match(i, j)`` for every pair.
        Within a gap, deletions come before insertions.
    """
    filled = []
    pi, pj = -1, -1  # last index consumed on the old / new side
    for i, j in seq:
        # Old-side indices skipped before this pair are deletions.
        for k in range(pi + 1, i):
            filled.append(Deletion(k, None))
        pi = max(pi, i - 1)
        # New-side indices skipped before this pair are insertions.
        for k in range(pj + 1, j):
            filled.append(Insertion(None, k))
        pj = max(pj, j - 1)
        if (i, j) == (pi + 1, pj + 1):
            filled.append(Match(i, j))
            pi, pj = i, j
    if old_len is not None:
        filled.extend(Deletion(k, None) for k in range(pi + 1, old_len))
    if new_len is not None:
        filled.extend(Insertion(None, k) for k in range(pj + 1, new_len))
    return filled
In [44]:
uf = list(uf)
In [45]:
matchfill(list(uf))
Out[45]:
In [46]:
list(zip(*matchfill(list(uf))))
Out[46]:
In [47]:
HTML("""
<style>
.d-add:before {
content: '+ ';
color:green;
}
.d-add, .d-noth , .d-rem{
width:100%;
display:block;
color:gray;
}
.d-add{
background-color:#DFD;
}
.d-rem:before {
content: '- ';
color:red;
}
.d-rem {
background-color:#FDD;
}
.d-noth:before {
content: ' ';
color:lightblue;
}
</style>
""")
Out[47]:
In [61]:
def mark_line_diff(mf):
    """Flag isolated one-line changes in an edit script.

    A ``Deletion`` immediately followed by an ``Insertion`` and then a
    ``Match`` is treated as a single changed line: the deletion is re-emitted
    as ``SDeletion`` and the insertion after it as ``SInsertion``.  Every
    other record is yielded unchanged.

    Parameters
    ----------
    mf : iterable
        Edit script as produced by ``matchfill``.  Any iterable is accepted;
        it is materialised once because the check looks two records ahead.

    Yields
    ------
    The records of ``mf`` in order, with the flagged ones replaced.
    """
    entries = list(mf)
    total = len(entries)
    pair_next = False  # True when the previous record was flagged as SDeletion
    for pos, entry in enumerate(entries):
        if pair_next:
            yield SInsertion(*entry)
            pair_next = False
            continue
        # The bound check keeps the look-ahead from running off the end when
        # a Deletion is one of the last two records.
        is_single_change = (
            isinstance(entry, Deletion)
            and pos + 2 < total
            and isinstance(entries[pos + 1], Insertion)
            and isinstance(entries[pos + 2], Match)
        )
        if is_single_change:
            pair_next = True
            yield SDeletion(*entry)
        else:
            yield entry
In [64]:
# Render the marked edit script as one <span> per line: a gutter with the old
# and new line numbers followed by the highlighted source line (hs1 for
# old-side and matching lines, hs2 for inserted lines).
mf = mark_line_diff(matchfill(list(uf)))
pieces = []
for m in mf:
    i, j = m
    # Subclasses are tested before their bases: SInsertion is an Insertion
    # and SDeletion is a Deletion.
    if isinstance(m, SInsertion):
        row = ('<span class="d-add">', '| |', ' %02d +' % j, hs2[j])
    elif isinstance(m, Insertion):
        row = ('<span class="d-add">', '| |', ' %02d |' % j, hs2[j])
    elif isinstance(m, SDeletion):
        row = ('<span class="d-rem">', '| %02d |' % i, ' -', hs1[i])
    elif isinstance(m, Deletion):
        row = ('<span class="d-rem">', '| %02d |' % i, ' |', hs1[i])
    else:
        row = ('<span class="d-noth">', '| %02d |' % i, ' %02d |' % j, hs1[i])
    pieces.extend(row)
    pieces.append('</span>')
ts = ''.join(pieces)
HTML('<pre>' + ts + '</pre>')
Out[64]:
In [55]:
uf
Out[55]:
In [56]:
matchfill(uf)
Out[56]:
In [ ]:
list(mark_line_diff(matchfill(uf)))
In [ ]: