In [1]:
cd ..


/Users/bussonniermatthias/difflib2

In [2]:
from difflib2.get_lcs_cut2 import _get_lcs_cut2

In [3]:
s1,s2 = '0-abcdferhzzzzttt','0+abcdbghhrzzzz$$$'
l, lcs  = _get_lcs_cut2(s1,s2)

In [4]:
def unfold(lcs):
    ll =[]
    while True:
        if lcs is None:
            break
        val = lcs[0]
        lcs =lcs[1]
        ll.append(val)
    yield from reversed(ll)
y_ulcs = unfold(lcs)
ulcs = list(y_ulcs)
ulcs


Out[4]:
[(0, 0),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (8, 10),
 (10, 11),
 (11, 12),
 (12, 13),
 (13, 14)]

In [5]:
from difflib import Match

In [6]:
def _y_make_matching_blocks(ulcs,s1,s2):
    ulcs = iter(ulcs)
    pi, pj = next(ulcs)
    oi, oj = pi, pj
    leng = 1
    for i,j in ulcs:
        if i == pi+1 and j == pj+1:
            pi,pj = i,j
            leng +=1
        else :
            yield Match(oi,oj,leng)
            oi,oj = i,j
            pi,pj = i,j
            leng = 1
    yield Match(oi,oj,leng)
    # stdlib return an empty match at the end :
    yield Match(len(s1),len(s2),0)

def make_matching_blocks(ulcs,s1,s2):
    return list(_y_make_matching_blocks(ulcs,s1,s2))

In [7]:
from difflib import SequenceMatcher
list(SequenceMatcher(None, s1,s2).get_matching_blocks()) , make_matching_blocks(ulcs,s1,s2)


Out[7]:
([Match(a=0, b=0, size=1),
  Match(a=2, b=2, size=4),
  Match(a=8, b=10, size=1),
  Match(a=10, b=11, size=4),
  Match(a=17, b=18, size=0)],
 [Match(a=0, b=0, size=1),
  Match(a=2, b=2, size=4),
  Match(a=8, b=10, size=1),
  Match(a=10, b=11, size=4),
  Match(a=17, b=18, size=0)])

In [7]:


In [8]:
from difflib2.get_lcs_cut2 import _get_lcs_cut2
from difflib import  SequenceMatcher

def _unfold(lcs):
    """ unfold apsudo linked list, 
    generated by LCS alfirythmes. 
    """
    ll =[]
    while True:
        if lcs is None:
            break
        val = lcs[0]
        lcs = lcs[1]
        ll.append(val)
    return list(reversed(ll))


def _make_matching_blocks(ulcs,s1,s2):
    pi, pj = ulcs[0]
    oi, oj = ulcs[0]
    leng = 1
    matches= []
    for i,j in ulcs[1:]:
        if i == pi+1 and j == pj+1:
            pi,pj = i,j
            leng +=1
        else :
            matches.append( Match(oi,oj,leng) )
            oi,oj = i,j
            pi,pj = i,j
            leng = 1
    matches.append( Match(oi,oj,leng) )
    # stdlib return an empty match at the end :
    matches.append( Match(len(s1),len(s2),0) )
    return matches

class ExactSequenceMatcher( SequenceMatcher ):
    

    def __init__(self, isjunk, a, b, autojunk=True):
        super().__init__(isjunk, a, b, autojunk)
        
    def find_longest_match(self, *args, **kwargs):
        raise NotImplementedError('this is the longuest common substring problem')
        
    def get_matching_blocks(self):
        ulcs = self.get_matching_index(self.a,self.b)
        return _make_matching_blocks(ulcs, self.a, self.b)
        
    def get_matching_index(self, a, b):
        """
        
        """
        return unfold(_get_lcs_cut2(self.a, self.b)[1])

In [9]:
s1 = "xada"
s2 = "bada"
#print(ExactSequenceMatcher(None, s1*10,s2*10).find_longest_match())
#list(SequenceMatcher(None, s1*10,s2*10).find_longest_match())
#_get_lcs_cut2(s1,s2)

In [10]:
list(filter( lambda x:x[0]!= '_' ,dir(SequenceMatcher)))


Out[10]:
['find_longest_match',
 'get_grouped_opcodes',
 'get_matching_blocks',
 'get_opcodes',
 'quick_ratio',
 'ratio',
 'real_quick_ratio',
 'set_seq1',
 'set_seq2',
 'set_seqs']

In [11]:
from collections import Counter

In [12]:
Counter('abc')


Out[12]:
Counter({'c': 1, 'b': 1, 'a': 1})

In [13]:
import pygments
import pygments.lexer
import pygments.lexers.agile
import pygments.formatters

from IPython.display import HTML

In [14]:
PL = pygments.lexers.agile.PythonLexer()
HF = pygments.formatters.HtmlFormatter(noclasses=True, nowrap=True)

In [15]:
s1 = """# really, making a copyright is useless
# but just to test multiline removel I'll do that
# this is copyrighted

import pygments
import pygments.lexer
import pygments.lexers.agile

'''
donez, donnez moi, 
le pouvoir d'achat
donez, donnez moi, 
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''

from IPython.display import HTML
""".splitlines()

s2 = """import pygments
import pygments.lexers.agile
import pygments.formatters
'''
Donnez, donnez moi, 
le pouvoir d'achat
donnez, donnez moi, 
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''
a multil line added
which shoudl make a green block
from IPython.display import HTML
""".splitlines()

hs1 = pygments.highlight('\n'.join(s1), PL, HF ).splitlines()
hs2 = pygments.highlight('\n'.join(s2), PL, HF ).splitlines()

In [16]:
hs1


Out[16]:
['<span style="color: #408080; font-style: italic"># really, making a copyright is useless</span>',
 '<span style="color: #408080; font-style: italic"># but just to test multiline removel I&#39;ll do that</span>',
 '<span style="color: #408080; font-style: italic"># this is copyrighted</span>',
 '',
 '<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments</span>',
 '<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments.lexer</span>',
 '<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments.lexers.agile</span>',
 '',
 '<span style="color: #BA2121; font-style: italic">&#39;&#39;&#39;</span>',
 '<span style="color: #BA2121; font-style: italic">donez, donnez moi, </span>',
 '<span style="color: #BA2121; font-style: italic">le pouvoir d&#39;achat</span>',
 '<span style="color: #BA2121; font-style: italic">donez, donnez moi, </span>',
 '<span style="color: #BA2121; font-style: italic">le super pouvoir</span>',
 '<span style="color: #BA2121; font-style: italic">le super pouvoir, de pouvoir marcher</span>',
 '<span style="color: #BA2121; font-style: italic">tout en achetant du lait UHT</span>',
 '<span style="color: #BA2121; font-style: italic">&#39;&#39;&#39;</span>',
 '',
 '<span style="color: #008000; font-weight: bold">from</span> <span style="color: #0000FF; font-weight: bold">IPython.display</span> <span style="color: #008000; font-weight: bold">import</span> HTML']

In [17]:
s1


Out[17]:
['# really, making a copyright is useless',
 "# but just to test multiline removel I'll do that",
 '# this is copyrighted',
 '',
 'import pygments',
 'import pygments.lexer',
 'import pygments.lexers.agile',
 '',
 "'''",
 'donez, donnez moi, ',
 "le pouvoir d'achat",
 'donez, donnez moi, ',
 'le super pouvoir',
 'le super pouvoir, de pouvoir marcher',
 'tout en achetant du lait UHT',
 "'''",
 '',
 'from IPython.display import HTML']

In [18]:
s2


Out[18]:
['import pygments',
 'import pygments.lexers.agile',
 'import pygments.formatters',
 "'''",
 'Donnez, donnez moi, ',
 "le pouvoir d'achat",
 'donnez, donnez moi, ',
 'le super pouvoir',
 'le super pouvoir, de pouvoir marcher',
 'tout en achetant du lait UHT',
 "'''",
 'a multil line added',
 'which shoudl make a green block',
 'from IPython.display import HTML']

In [19]:
res = pygments.highlight("""print('hi')
                      1+1
""", PL, HF )

HTML(res)


Out[19]:
print('hi') 1+1

In [20]:
import difflib

In [21]:
SequenceMatcher


Out[21]:
difflib.SequenceMatcher

In [22]:
fs1 = !git show HEAD:examples/MatchGroups.ipynb
fs2 = !git show HEAD~1:examples/MatchGroups.ipynb

In [23]:
ss1 = set(fs1)
ss2 = set(fs2)
us = ss1.intersection(ss2)

In [24]:
len(ss1),len(ss2),len(fs1),len(fs2),len(us)


Out[24]:
(426, 455, 1134, 1143, 342)

In [25]:
a,b = zip(*((i,e)for i,e in enumerate('abc')))

In [26]:
type(b)


Out[26]:
tuple

In [27]:
something = 'abcdefghij'
vowel = 'aeiouy'
condition = lambda x:x in vowel
#zip(* ((i,_) for i,_ in enumerate(something) if e in vowel)   )

In [28]:
#import itertools
#a,b = itertools.tee(zip(*((i,e) for i,e in enumerate(something) if e in us)))
#a,b = itertools.tee(( (_,_) for _ in something))
#list(a),list(b)

In [29]:
zip(*((i,e) for i,e in enumerate(something) if e in vowel))


Out[29]:
<zip at 0x11036f808>

In [30]:
a,b = zip(*((i,e) for i,e in enumerate(something) if e in vowel))
a,b


Out[30]:
((0, 4, 8), ('a', 'e', 'i'))

In [31]:
from itertools import chain

def lcsc2(fs1,fs2):
    """ optimise by removing non-common seq """
    ss1 = set(fs1)
    ss2 = set(fs2)
    us = ss1.intersection(ss2)
    s1m,seq1 = zip(*((ii,e) for ii,e in enumerate(fs1) if e in us))
    s2m,seq2 = zip(*((ii,e) for ii,e in enumerate(fs2) if e in us))
    delta = -1
    for ca,cb in zip(seq1,seq2):
        if ca==cb:
            delta=delta+1
        else:
            break
    seq1,seq2 = seq1[delta:],seq2[delta:]
    length, back_index =  _get_lcs_cut2(seq1,seq2)
    back_index = unfold(back_index)
    length = length+delta
    bi = map(lambda x:(x[0]+delta,x[1]+delta), back_index)
    back_index = chain(((i,i) for i in range(delta)),bi)
    
    return length, map(lambda x:(s1m[x[0]],s2m[x[1]]),back_index )

In [32]:
#list(zip(*[(ii,_) for ii,_ in enumerate("hello") if _ in {'a','o','h','l'}]))
a,b = tuple(zip(*((ii,_) for ii,_ in enumerate('aaa') if _ in {'a'})))


#tuple(zip(*[(ii,_) for ii,_ in enumerate('aaa') if _ in {'a'}]))

In [33]:
lcsc2('aaa','aba')


Out[33]:
(2, <map at 0x1103754a8>)

In [34]:
%timeit uf = unfold(_get_lcs_cut2(fs1,fs2)[1])
%timeit uf = lcsc2(fs1,fs2)
%timeit ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())


1 loops, best of 3: 518 ms per loop
1 loops, best of 3: 348 ms per loop
100 loops, best of 3: 5.93 ms per loop

In [35]:
lcsc2(fs1,fs2)


Out[35]:
(931, <map at 0x110375390>)

In [36]:
(len(list(unfold(_get_lcs_cut2(fs1,fs2)[1]))),
len(list(lcsc2(fs1,fs2)[1])),
_get_lcs_cut2(fs1,fs2)[0],
lcsc2(fs1,fs2)[0])


Out[36]:
(931, 931, 931, 931)

In [37]:
%matplotlib inline

In [38]:
import matplotlib.pyplot as plt
xa, ya = tuple(zip(*unfold(_get_lcs_cut2(fs1,fs2)[1]))) 
xb, yb = tuple(zip(*lcsc2(fs1,fs2)[1])) 
fig,ax = plt.subplots(1,1)
ax.scatter(xa, ya ,linewidths=0)
ax.scatter(xb, yb ,c='red', linewidths=0, s=10)
ax.set_title('selected dominant match point in each case')
ax.set_aspect('equal')
plt.figure()
plt.scatter(xb,xa)
plt.figure()
plt.scatter(ya,yb)


Out[38]:
<matplotlib.collections.PathCollection at 0x1116afe10>

In [39]:
uf = lcsc2(fs1,fs2)[1]
ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())

In [40]:
len(list(uf)),sum([x.size for  x in list(ufs)])


Out[40]:
(931, 890)

In [41]:
uf = unfold(_get_lcs_cut2(s1,s2)[1])

In [42]:
from collections import namedtuple

In [43]:
Match= namedtuple('Match',('old','new'))
Insertion = namedtuple('Insertion',('old','new'))
Deletion = namedtuple('Deletion',('old','new'))
class SInsertion(Insertion):pass
class SDeletion(Deletion):pass


def matchfill(seq):
    nseq = []
    pi,pj =(-1,-1)
    for i,j in seq[0:]:

        if i > pi+1:
            for k in range(pi+1,i):
                nseq.append(Deletion(k,None))
                pi,pj = pi+1,pj
        if j > pj+1:
            for k in range(pj+1,j):
                nseq.append(Insertion(None,k))
                pi,pj = pi, pj+1
        if (i,j) == (pi+1,pj+1):
            nseq.append(Match(i,j))
            pi,pj = i,j               
                
    return nseq

In [44]:
uf = list(uf)

In [45]:
matchfill(list(uf))


Out[45]:
[Deletion(old=0, new=None),
 Deletion(old=1, new=None),
 Deletion(old=2, new=None),
 Deletion(old=3, new=None),
 Match(old=4, new=0),
 Deletion(old=5, new=None),
 Match(old=6, new=1),
 Deletion(old=7, new=None),
 Insertion(old=None, new=2),
 Match(old=8, new=3),
 Deletion(old=9, new=None),
 Insertion(old=None, new=4),
 Match(old=10, new=5),
 Deletion(old=11, new=None),
 Insertion(old=None, new=6),
 Match(old=12, new=7),
 Match(old=13, new=8),
 Match(old=14, new=9),
 Match(old=15, new=10),
 Deletion(old=16, new=None),
 Insertion(old=None, new=11),
 Insertion(old=None, new=12),
 Match(old=17, new=13)]

In [46]:
list(zip(*matchfill(list(uf))))


Out[46]:
[(0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  None,
  8,
  9,
  None,
  10,
  11,
  None,
  12,
  13,
  14,
  15,
  16,
  None,
  None,
  17),
 (None,
  None,
  None,
  None,
  0,
  None,
  1,
  None,
  2,
  3,
  None,
  4,
  5,
  None,
  6,
  7,
  8,
  9,
  10,
  None,
  11,
  12,
  13)]

In [47]:
HTML("""
<style>
.d-add:before {
    content: '+ ';
    color:green;
}

.d-add, .d-noth , .d-rem{
    width:100%;
    display:block;
    color:gray;
}

.d-add{
    background-color:#DFD;
}



.d-rem:before {
    content: '- ';
    color:red;
    
}

.d-rem {
    background-color:#FDD;
}

.d-noth:before {
    content: '  ';
    color:lightblue;
}
</style>
""")


Out[47]:

In [61]:
def mark_line_diff(mf):
    sk =False
    for i,l in enumerate(mf):
        if sk:
            yield SInsertion(*l)
            sk=False
            continue
        if isinstance(l, Deletion) and \
           isinstance(mf[i+1], Insertion) and \
           isinstance(mf[i+2], Match):
            sk = True
            yield SDeletion(*l)
        else :
            yield l

In [64]:
ts  = ''
mf = mark_line_diff(matchfill(list(uf)))
for m in mf :
    i,j = m
    if isinstance(m, SInsertion) :
        ts = ts+'<span class="d-add">' +'|    |'     +' %02d +'%j+hs2[j]+'</span>'

    elif isinstance(m, Insertion) :
        ts = ts+'<span class="d-add">' +'|    |'     +' %02d |'%j+hs2[j]+'</span>'
    elif isinstance(m, SDeletion) :
        ts = ts+'<span class="d-rem">' +'| %02d |'%i +'    -'    +hs1[i]+'</span>'

    elif isinstance(m, Deletion) :
        ts = ts+'<span class="d-rem">' +'| %02d |'%i +'    |'    +hs1[i]+'</span>'

    else :
        ts = ts+'<span class="d-noth">'+'| %02d |'%i +' %02d |'%j+hs1[i]+'</span>'
HTML('<pre>'+ts+'</pre>')


Out[64]:
| 00 |    |# really, making a copyright is useless| 01 |    |# but just to test multiline removel I'll do that| 02 |    |# this is copyrighted| 03 |    || 04 | 00 |import pygments| 05 |    |import pygments.lexer| 06 | 01 |import pygments.lexers.agile| 07 |    -|    | 02 +import pygments.formatters| 08 | 03 |'''| 09 |    -donez, donnez moi, |    | 04 +Donnez, donnez moi, | 10 | 05 |le pouvoir d'achat| 11 |    -donez, donnez moi, |    | 06 +donnez, donnez moi, | 12 | 07 |le super pouvoir| 13 | 08 |le super pouvoir, de pouvoir marcher| 14 | 09 |tout en achetant du lait UHT| 15 | 10 |'''| 16 |    ||    | 11 |a multil line added|    | 12 |which shoudl make a green block| 17 | 13 |from IPython.display import HTML

In [55]:
uf


Out[55]:
[(4, 0),
 (6, 1),
 (8, 3),
 (10, 5),
 (12, 7),
 (13, 8),
 (14, 9),
 (15, 10),
 (17, 13)]

In [56]:
matchfill(uf)


Out[56]:
[Deletion(old=0, new=None),
 Deletion(old=1, new=None),
 Deletion(old=2, new=None),
 Deletion(old=3, new=None),
 Match(old=4, new=0),
 Deletion(old=5, new=None),
 Match(old=6, new=1),
 Deletion(old=7, new=None),
 Insertion(old=None, new=2),
 Match(old=8, new=3),
 Deletion(old=9, new=None),
 Insertion(old=None, new=4),
 Match(old=10, new=5),
 Deletion(old=11, new=None),
 Insertion(old=None, new=6),
 Match(old=12, new=7),
 Match(old=13, new=8),
 Match(old=14, new=9),
 Match(old=15, new=10),
 Deletion(old=16, new=None),
 Insertion(old=None, new=11),
 Insertion(old=None, new=12),
 Match(old=17, new=13)]

In [ ]:
list(mark_line_diff(matchfill(uf)))

In [ ]: