``````

In [1]:

cd ..

``````
``````

/Users/bussonniermatthias/difflib2

``````
``````

In [2]:

from difflib2.get_lcs_cut2 import _get_lcs_cut2

``````
``````

In [3]:

s1,s2 = '0-abcdferhzzzzttt','0+abcdbghhrzzzz\$\$\$'
l, lcs  = _get_lcs_cut2(s1,s2)

``````
``````

In [4]:

def unfold(lcs):
ll =[]
while True:
if lcs is None:
break
val = lcs[0]
lcs =lcs[1]
ll.append(val)
yield from reversed(ll)
y_ulcs = unfold(lcs)
ulcs = list(y_ulcs)
ulcs

``````
``````

Out[4]:

[(0, 0),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(8, 10),
(10, 11),
(11, 12),
(12, 13),
(13, 14)]

``````
``````

In [5]:

from difflib import Match

``````
``````

In [6]:

def _y_make_matching_blocks(ulcs,s1,s2):
ulcs = iter(ulcs)
pi, pj = next(ulcs)
oi, oj = pi, pj
leng = 1
for i,j in ulcs:
if i == pi+1 and j == pj+1:
pi,pj = i,j
leng +=1
else :
yield Match(oi,oj,leng)
oi,oj = i,j
pi,pj = i,j
leng = 1
yield Match(oi,oj,leng)
# stdlib return an empty match at the end :
yield Match(len(s1),len(s2),0)

def make_matching_blocks(ulcs,s1,s2):
return list(_y_make_matching_blocks(ulcs,s1,s2))

``````
``````

In [7]:

from difflib import SequenceMatcher
list(SequenceMatcher(None, s1,s2).get_matching_blocks()) , make_matching_blocks(ulcs,s1,s2)

``````
``````

Out[7]:

([Match(a=0, b=0, size=1),
Match(a=2, b=2, size=4),
Match(a=8, b=10, size=1),
Match(a=10, b=11, size=4),
Match(a=17, b=18, size=0)],
[Match(a=0, b=0, size=1),
Match(a=2, b=2, size=4),
Match(a=8, b=10, size=1),
Match(a=10, b=11, size=4),
Match(a=17, b=18, size=0)])

``````
``````

In [7]:

``````
``````

In [8]:

from difflib2.get_lcs_cut2 import _get_lcs_cut2
from difflib import  SequenceMatcher

def _unfold(lcs):
generated by LCS alfirythmes.
"""
ll =[]
while True:
if lcs is None:
break
val = lcs[0]
lcs = lcs[1]
ll.append(val)
return list(reversed(ll))

def _make_matching_blocks(ulcs,s1,s2):
pi, pj = ulcs[0]
oi, oj = ulcs[0]
leng = 1
matches= []
for i,j in ulcs[1:]:
if i == pi+1 and j == pj+1:
pi,pj = i,j
leng +=1
else :
matches.append( Match(oi,oj,leng) )
oi,oj = i,j
pi,pj = i,j
leng = 1
matches.append( Match(oi,oj,leng) )
# stdlib return an empty match at the end :
matches.append( Match(len(s1),len(s2),0) )
return matches

class ExactSequenceMatcher( SequenceMatcher ):

def __init__(self, isjunk, a, b, autojunk=True):
super().__init__(isjunk, a, b, autojunk)

def find_longest_match(self, *args, **kwargs):
raise NotImplementedError('this is the longuest common substring problem')

def get_matching_blocks(self):
ulcs = self.get_matching_index(self.a,self.b)
return _make_matching_blocks(ulcs, self.a, self.b)

def get_matching_index(self, a, b):
"""

"""
return unfold(_get_lcs_cut2(self.a, self.b)[1])

``````
``````

In [9]:

#print(ExactSequenceMatcher(None, s1*10,s2*10).find_longest_match())
#list(SequenceMatcher(None, s1*10,s2*10).find_longest_match())
#_get_lcs_cut2(s1,s2)

``````
``````

In [10]:

list(filter( lambda x:x[0]!= '_' ,dir(SequenceMatcher)))

``````
``````

Out[10]:

['find_longest_match',
'get_grouped_opcodes',
'get_matching_blocks',
'get_opcodes',
'quick_ratio',
'ratio',
'real_quick_ratio',
'set_seq1',
'set_seq2',
'set_seqs']

``````
``````

In [11]:

from collections import Counter

``````
``````

In [12]:

Counter('abc')

``````
``````

Out[12]:

Counter({'c': 1, 'b': 1, 'a': 1})

``````
``````

In [13]:

import pygments
import pygments.lexer
import pygments.lexers.agile
import pygments.formatters

from IPython.display import HTML

``````
``````

In [14]:

PL = pygments.lexers.agile.PythonLexer()
HF = pygments.formatters.HtmlFormatter(noclasses=True, nowrap=True)

``````
``````

In [15]:

s1 = """# really, making a copyright is useless
# but just to test multiline removel I'll do that

import pygments
import pygments.lexer
import pygments.lexers.agile

'''
donez, donnez moi,
le pouvoir d'achat
donez, donnez moi,
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''

from IPython.display import HTML
""".splitlines()

s2 = """import pygments
import pygments.lexers.agile
import pygments.formatters
'''
Donnez, donnez moi,
le pouvoir d'achat
donnez, donnez moi,
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''
which shoudl make a green block
from IPython.display import HTML
""".splitlines()

hs1 = pygments.highlight('\n'.join(s1), PL, HF ).splitlines()
hs2 = pygments.highlight('\n'.join(s2), PL, HF ).splitlines()

``````
``````

In [16]:

hs1

``````
``````

Out[16]:

['<span style="color: #408080; font-style: italic"># really, making a copyright is useless</span>',
'<span style="color: #408080; font-style: italic"># but just to test multiline removel I&#39;ll do that</span>',
'<span style="color: #408080; font-style: italic"># this is copyrighted</span>',
'',
'<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments</span>',
'<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments.lexer</span>',
'<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments.lexers.agile</span>',
'',
'<span style="color: #BA2121; font-style: italic">&#39;&#39;&#39;</span>',
'<span style="color: #BA2121; font-style: italic">donez, donnez moi, </span>',
'<span style="color: #BA2121; font-style: italic">le pouvoir d&#39;achat</span>',
'<span style="color: #BA2121; font-style: italic">donez, donnez moi, </span>',
'<span style="color: #BA2121; font-style: italic">le super pouvoir</span>',
'<span style="color: #BA2121; font-style: italic">le super pouvoir, de pouvoir marcher</span>',
'<span style="color: #BA2121; font-style: italic">tout en achetant du lait UHT</span>',
'<span style="color: #BA2121; font-style: italic">&#39;&#39;&#39;</span>',
'',
'<span style="color: #008000; font-weight: bold">from</span> <span style="color: #0000FF; font-weight: bold">IPython.display</span> <span style="color: #008000; font-weight: bold">import</span> HTML']

``````
``````

In [17]:

s1

``````
``````

Out[17]:

['# really, making a copyright is useless',
"# but just to test multiline removel I'll do that",
'',
'import pygments',
'import pygments.lexer',
'import pygments.lexers.agile',
'',
"'''",
'donez, donnez moi, ',
"le pouvoir d'achat",
'donez, donnez moi, ',
'le super pouvoir',
'le super pouvoir, de pouvoir marcher',
'tout en achetant du lait UHT',
"'''",
'',
'from IPython.display import HTML']

``````
``````

In [18]:

s2

``````
``````

Out[18]:

['import pygments',
'import pygments.lexers.agile',
'import pygments.formatters',
"'''",
'Donnez, donnez moi, ',
"le pouvoir d'achat",
'donnez, donnez moi, ',
'le super pouvoir',
'le super pouvoir, de pouvoir marcher',
'tout en achetant du lait UHT',
"'''",
'which shoudl make a green block',
'from IPython.display import HTML']

``````
``````

In [19]:

res = pygments.highlight("""print('hi')
1+1
""", PL, HF )

HTML(res)

``````
``````

Out[19]:

print('hi')
1+1

``````
``````

In [20]:

import difflib

``````
``````

In [21]:

SequenceMatcher

``````
``````

Out[21]:

difflib.SequenceMatcher

``````
``````

In [22]:

``````
``````

In [23]:

ss1 = set(fs1)
ss2 = set(fs2)
us = ss1.intersection(ss2)

``````
``````

In [24]:

len(ss1),len(ss2),len(fs1),len(fs2),len(us)

``````
``````

Out[24]:

(426, 455, 1134, 1143, 342)

``````
``````

In [25]:

a,b = zip(*((i,e)for i,e in enumerate('abc')))

``````
``````

In [26]:

type(b)

``````
``````

Out[26]:

tuple

``````
``````

In [27]:

something = 'abcdefghij'
vowel = 'aeiouy'
condition = lambda x:x in vowel
#zip(* ((i,_) for i,_ in enumerate(something) if e in vowel)   )

``````
``````

In [28]:

#import itertools
#a,b = itertools.tee(zip(*((i,e) for i,e in enumerate(something) if e in us)))
#a,b = itertools.tee(( (_,_) for _ in something))
#list(a),list(b)

``````
``````

In [29]:

zip(*((i,e) for i,e in enumerate(something) if e in vowel))

``````
``````

Out[29]:

<zip at 0x11036f808>

``````
``````

In [30]:

a,b = zip(*((i,e) for i,e in enumerate(something) if e in vowel))
a,b

``````
``````

Out[30]:

((0, 4, 8), ('a', 'e', 'i'))

``````
``````

In [31]:

from itertools import chain

def lcsc2(fs1,fs2):
""" optimise by removing non-common seq """
ss1 = set(fs1)
ss2 = set(fs2)
us = ss1.intersection(ss2)
s1m,seq1 = zip(*((ii,e) for ii,e in enumerate(fs1) if e in us))
s2m,seq2 = zip(*((ii,e) for ii,e in enumerate(fs2) if e in us))
delta = -1
for ca,cb in zip(seq1,seq2):
if ca==cb:
delta=delta+1
else:
break
seq1,seq2 = seq1[delta:],seq2[delta:]
length, back_index =  _get_lcs_cut2(seq1,seq2)
back_index = unfold(back_index)
length = length+delta
bi = map(lambda x:(x[0]+delta,x[1]+delta), back_index)
back_index = chain(((i,i) for i in range(delta)),bi)

return length, map(lambda x:(s1m[x[0]],s2m[x[1]]),back_index )

``````
``````

In [32]:

#list(zip(*[(ii,_) for ii,_ in enumerate("hello") if _ in {'a','o','h','l'}]))
a,b = tuple(zip(*((ii,_) for ii,_ in enumerate('aaa') if _ in {'a'})))

#tuple(zip(*[(ii,_) for ii,_ in enumerate('aaa') if _ in {'a'}]))

``````
``````

In [33]:

lcsc2('aaa','aba')

``````
``````

Out[33]:

(2, <map at 0x1103754a8>)

``````
``````

In [34]:

%timeit uf = unfold(_get_lcs_cut2(fs1,fs2)[1])
%timeit uf = lcsc2(fs1,fs2)
%timeit ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())

``````
``````

1 loops, best of 3: 518 ms per loop
1 loops, best of 3: 348 ms per loop
100 loops, best of 3: 5.93 ms per loop

``````
``````

In [35]:

lcsc2(fs1,fs2)

``````
``````

Out[35]:

(931, <map at 0x110375390>)

``````
``````

In [36]:

(len(list(unfold(_get_lcs_cut2(fs1,fs2)[1]))),
len(list(lcsc2(fs1,fs2)[1])),
_get_lcs_cut2(fs1,fs2)[0],
lcsc2(fs1,fs2)[0])

``````
``````

Out[36]:

(931, 931, 931, 931)

``````
``````

In [37]:

%matplotlib inline

``````
``````

In [38]:

import matplotlib.pyplot as plt
xa, ya = tuple(zip(*unfold(_get_lcs_cut2(fs1,fs2)[1])))
xb, yb = tuple(zip(*lcsc2(fs1,fs2)[1]))
fig,ax = plt.subplots(1,1)
ax.scatter(xa, ya ,linewidths=0)
ax.scatter(xb, yb ,c='red', linewidths=0, s=10)
ax.set_title('selected dominant match point in each case')
ax.set_aspect('equal')
plt.figure()
plt.scatter(xb,xa)
plt.figure()
plt.scatter(ya,yb)

``````
``````

Out[38]:

<matplotlib.collections.PathCollection at 0x1116afe10>

``````
``````

In [39]:

uf = lcsc2(fs1,fs2)[1]
ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())

``````
``````

In [40]:

len(list(uf)),sum([x.size for  x in list(ufs)])

``````
``````

Out[40]:

(931, 890)

``````
``````

In [41]:

uf = unfold(_get_lcs_cut2(s1,s2)[1])

``````
``````

In [42]:

from collections import namedtuple

``````
``````

In [43]:

Match= namedtuple('Match',('old','new'))
Insertion = namedtuple('Insertion',('old','new'))
Deletion = namedtuple('Deletion',('old','new'))
class SInsertion(Insertion):pass
class SDeletion(Deletion):pass

def matchfill(seq):
nseq = []
pi,pj =(-1,-1)
for i,j in seq[0:]:

if i > pi+1:
for k in range(pi+1,i):
nseq.append(Deletion(k,None))
pi,pj = pi+1,pj
if j > pj+1:
for k in range(pj+1,j):
nseq.append(Insertion(None,k))
pi,pj = pi, pj+1
if (i,j) == (pi+1,pj+1):
nseq.append(Match(i,j))
pi,pj = i,j

return nseq

``````
``````

In [44]:

uf = list(uf)

``````
``````

In [45]:

matchfill(list(uf))

``````
``````

Out[45]:

[Deletion(old=0, new=None),
Deletion(old=1, new=None),
Deletion(old=2, new=None),
Deletion(old=3, new=None),
Match(old=4, new=0),
Deletion(old=5, new=None),
Match(old=6, new=1),
Deletion(old=7, new=None),
Insertion(old=None, new=2),
Match(old=8, new=3),
Deletion(old=9, new=None),
Insertion(old=None, new=4),
Match(old=10, new=5),
Deletion(old=11, new=None),
Insertion(old=None, new=6),
Match(old=12, new=7),
Match(old=13, new=8),
Match(old=14, new=9),
Match(old=15, new=10),
Deletion(old=16, new=None),
Insertion(old=None, new=11),
Insertion(old=None, new=12),
Match(old=17, new=13)]

``````
``````

In [46]:

list(zip(*matchfill(list(uf))))

``````
``````

Out[46]:

[(0,
1,
2,
3,
4,
5,
6,
7,
None,
8,
9,
None,
10,
11,
None,
12,
13,
14,
15,
16,
None,
None,
17),
(None,
None,
None,
None,
0,
None,
1,
None,
2,
3,
None,
4,
5,
None,
6,
7,
8,
9,
10,
None,
11,
12,
13)]

``````
``````

In [47]:

HTML("""
<style>
content: '+ ';
color:green;
}

width:100%;
display:block;
color:gray;
}

background-color:#DFD;
}

.d-rem:before {
content: '- ';
color:red;

}

.d-rem {
background-color:#FDD;
}

.d-noth:before {
content: '  ';
color:lightblue;
}
</style>
""")

``````
``````

Out[47]:

content: '+ ';
color:green;
}

width:100%;
display:block;
color:gray;
}

background-color:#DFD;
}

.d-rem:before {
content: '- ';
color:red;

}

.d-rem {
background-color:#FDD;
}

.d-noth:before {
content: '  ';
color:lightblue;
}

``````
``````

In [61]:

def mark_line_diff(mf):
sk =False
for i,l in enumerate(mf):
if sk:
yield SInsertion(*l)
sk=False
continue
if isinstance(l, Deletion) and \
isinstance(mf[i+1], Insertion) and \
isinstance(mf[i+2], Match):
sk = True
yield SDeletion(*l)
else :
yield l

``````
``````

In [64]:

ts  = ''
mf = mark_line_diff(matchfill(list(uf)))
for m in mf :
i,j = m
if isinstance(m, SInsertion) :
ts = ts+'<span class="d-add">' +'|    |'     +' %02d +'%j+hs2[j]+'</span>'

elif isinstance(m, Insertion) :
ts = ts+'<span class="d-add">' +'|    |'     +' %02d |'%j+hs2[j]+'</span>'
elif isinstance(m, SDeletion) :
ts = ts+'<span class="d-rem">' +'| %02d |'%i +'    -'    +hs1[i]+'</span>'

elif isinstance(m, Deletion) :
ts = ts+'<span class="d-rem">' +'| %02d |'%i +'    |'    +hs1[i]+'</span>'

else :
ts = ts+'<span class="d-noth">'+'| %02d |'%i +' %02d |'%j+hs1[i]+'</span>'
HTML('<pre>'+ts+'</pre>')

``````
``````

Out[64]:

| 00 |    |# really, making a copyright is useless| 01 |    |# but just to test multiline removel I'll do that| 02 |    |# this is copyrighted| 03 |    || 04 | 00 |import pygments| 05 |    |import pygments.lexer| 06 | 01 |import pygments.lexers.agile| 07 |    -|    | 02 +import pygments.formatters| 08 | 03 |'''| 09 |    -donez, donnez moi, |    | 04 +Donnez, donnez moi, | 10 | 05 |le pouvoir d'achat| 11 |    -donez, donnez moi, |    | 06 +donnez, donnez moi, | 12 | 07 |le super pouvoir| 13 | 08 |le super pouvoir, de pouvoir marcher| 14 | 09 |tout en achetant du lait UHT| 15 | 10 |'''| 16 |    ||    | 11 |a multil line added|    | 12 |which shoudl make a green block| 17 | 13 |from IPython.display import HTML

``````
``````

In [55]:

uf

``````
``````

Out[55]:

[(4, 0),
(6, 1),
(8, 3),
(10, 5),
(12, 7),
(13, 8),
(14, 9),
(15, 10),
(17, 13)]

``````
``````

In [56]:

matchfill(uf)

``````
``````

Out[56]:

[Deletion(old=0, new=None),
Deletion(old=1, new=None),
Deletion(old=2, new=None),
Deletion(old=3, new=None),
Match(old=4, new=0),
Deletion(old=5, new=None),
Match(old=6, new=1),
Deletion(old=7, new=None),
Insertion(old=None, new=2),
Match(old=8, new=3),
Deletion(old=9, new=None),
Insertion(old=None, new=4),
Match(old=10, new=5),
Deletion(old=11, new=None),
Insertion(old=None, new=6),
Match(old=12, new=7),
Match(old=13, new=8),
Match(old=14, new=9),
Match(old=15, new=10),
Deletion(old=16, new=None),
Insertion(old=None, new=11),
Insertion(old=None, new=12),
Match(old=17, new=13)]

``````
``````

In [ ]:

list(mark_line_diff(matchfill(uf)))

``````
``````

In [ ]:

``````