notebook.community

Edit and run



In [1]:

    
cd ..









    



/Users/bussonniermatthias/difflib2



In [2]:

    
from difflib2.get_lcs_cut2 import _get_lcs_cut2



In [3]:

    
s1,s2 = '0-abcdferhzzzzttt','0+abcdbghhrzzzz$$$'
l, lcs  = _get_lcs_cut2(s1,s2)



In [4]:

    
def unfold(lcs):
    ll =[]
    while True:
        if lcs is None:
            break
        val = lcs[0]
        lcs =lcs[1]
        ll.append(val)
    yield from reversed(ll)
y_ulcs = unfold(lcs)
ulcs = list(y_ulcs)
ulcs









    Out[4]:





[(0, 0),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (8, 10),
 (10, 11),
 (11, 12),
 (12, 13),
 (13, 14)]



In [5]:

    
from difflib import Match



In [6]:

    
def _y_make_matching_blocks(ulcs,s1,s2):
    ulcs = iter(ulcs)
    pi, pj = next(ulcs)
    oi, oj = pi, pj
    leng = 1
    for i,j in ulcs:
        if i == pi+1 and j == pj+1:
            pi,pj = i,j
            leng +=1
        else :
            yield Match(oi,oj,leng)
            oi,oj = i,j
            pi,pj = i,j
            leng = 1
    yield Match(oi,oj,leng)
    # stdlib return an empty match at the end :
    yield Match(len(s1),len(s2),0)

def make_matching_blocks(ulcs,s1,s2):
    return list(_y_make_matching_blocks(ulcs,s1,s2))



In [7]:

    
from difflib import SequenceMatcher
list(SequenceMatcher(None, s1,s2).get_matching_blocks()) , make_matching_blocks(ulcs,s1,s2)









    Out[7]:





([Match(a=0, b=0, size=1),
  Match(a=2, b=2, size=4),
  Match(a=8, b=10, size=1),
  Match(a=10, b=11, size=4),
  Match(a=17, b=18, size=0)],
 [Match(a=0, b=0, size=1),
  Match(a=2, b=2, size=4),
  Match(a=8, b=10, size=1),
  Match(a=10, b=11, size=4),
  Match(a=17, b=18, size=0)])



In [7]:



In [8]:

    
from difflib2.get_lcs_cut2 import _get_lcs_cut2
from difflib import  SequenceMatcher

def _unfold(lcs):
    """ unfold apsudo linked list, 
    generated by LCS alfirythmes. 
    """
    ll =[]
    while True:
        if lcs is None:
            break
        val = lcs[0]
        lcs = lcs[1]
        ll.append(val)
    return list(reversed(ll))


def _make_matching_blocks(ulcs,s1,s2):
    pi, pj = ulcs[0]
    oi, oj = ulcs[0]
    leng = 1
    matches= []
    for i,j in ulcs[1:]:
        if i == pi+1 and j == pj+1:
            pi,pj = i,j
            leng +=1
        else :
            matches.append( Match(oi,oj,leng) )
            oi,oj = i,j
            pi,pj = i,j
            leng = 1
    matches.append( Match(oi,oj,leng) )
    # stdlib return an empty match at the end :
    matches.append( Match(len(s1),len(s2),0) )
    return matches

class ExactSequenceMatcher( SequenceMatcher ):
    

    def __init__(self, isjunk, a, b, autojunk=True):
        super().__init__(isjunk, a, b, autojunk)
        
    def find_longest_match(self, *args, **kwargs):
        raise NotImplementedError('this is the longuest common substring problem')
        
    def get_matching_blocks(self):
        ulcs = self.get_matching_index(self.a,self.b)
        return _make_matching_blocks(ulcs, self.a, self.b)
        
    def get_matching_index(self, a, b):
        """
        
        """
        return unfold(_get_lcs_cut2(self.a, self.b)[1])



In [9]:

    
s1 = "xada"
s2 = "bada"
#print(ExactSequenceMatcher(None, s1*10,s2*10).find_longest_match())
#list(SequenceMatcher(None, s1*10,s2*10).find_longest_match())
#_get_lcs_cut2(s1,s2)



In [10]:

    
list(filter( lambda x:x[0]!= '_' ,dir(SequenceMatcher)))









    Out[10]:





['find_longest_match',
 'get_grouped_opcodes',
 'get_matching_blocks',
 'get_opcodes',
 'quick_ratio',
 'ratio',
 'real_quick_ratio',
 'set_seq1',
 'set_seq2',
 'set_seqs']



In [11]:

    
from collections import Counter



In [12]:

    
Counter('abc')









    Out[12]:





Counter({'c': 1, 'b': 1, 'a': 1})



In [13]:

    
import pygments
import pygments.lexer
import pygments.lexers.agile
import pygments.formatters

from IPython.display import HTML



In [14]:

    
PL = pygments.lexers.agile.PythonLexer()
HF = pygments.formatters.HtmlFormatter(noclasses=True, nowrap=True)



In [15]:

    
s1 = """# really, making a copyright is useless
# but just to test multiline removel I'll do that
# this is copyrighted

import pygments
import pygments.lexer
import pygments.lexers.agile

'''
donez, donnez moi, 
le pouvoir d'achat
donez, donnez moi, 
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''

from IPython.display import HTML
""".splitlines()

s2 = """import pygments
import pygments.lexers.agile
import pygments.formatters
'''
Donnez, donnez moi, 
le pouvoir d'achat
donnez, donnez moi, 
le super pouvoir
le super pouvoir, de pouvoir marcher
tout en achetant du lait UHT
'''
a multil line added
which shoudl make a green block
from IPython.display import HTML
""".splitlines()

hs1 = pygments.highlight('\n'.join(s1), PL, HF ).splitlines()
hs2 = pygments.highlight('\n'.join(s2), PL, HF ).splitlines()



In [16]:

    
hs1









    Out[16]:





['<span style="color: #408080; font-style: italic"># really, making a copyright is useless</span>',
 '<span style="color: #408080; font-style: italic"># but just to test multiline removel I&#39;ll do that</span>',
 '<span style="color: #408080; font-style: italic"># this is copyrighted</span>',
 '',
 '<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments</span>',
 '<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments.lexer</span>',
 '<span style="color: #008000; font-weight: bold">import</span> <span style="color: #0000FF; font-weight: bold">pygments.lexers.agile</span>',
 '',
 '<span style="color: #BA2121; font-style: italic">&#39;&#39;&#39;</span>',
 '<span style="color: #BA2121; font-style: italic">donez, donnez moi, </span>',
 '<span style="color: #BA2121; font-style: italic">le pouvoir d&#39;achat</span>',
 '<span style="color: #BA2121; font-style: italic">donez, donnez moi, </span>',
 '<span style="color: #BA2121; font-style: italic">le super pouvoir</span>',
 '<span style="color: #BA2121; font-style: italic">le super pouvoir, de pouvoir marcher</span>',
 '<span style="color: #BA2121; font-style: italic">tout en achetant du lait UHT</span>',
 '<span style="color: #BA2121; font-style: italic">&#39;&#39;&#39;</span>',
 '',
 '<span style="color: #008000; font-weight: bold">from</span> <span style="color: #0000FF; font-weight: bold">IPython.display</span> <span style="color: #008000; font-weight: bold">import</span> HTML']



In [17]:

    
s1









    Out[17]:





['# really, making a copyright is useless',
 "# but just to test multiline removel I'll do that",
 '# this is copyrighted',
 '',
 'import pygments',
 'import pygments.lexer',
 'import pygments.lexers.agile',
 '',
 "'''",
 'donez, donnez moi, ',
 "le pouvoir d'achat",
 'donez, donnez moi, ',
 'le super pouvoir',
 'le super pouvoir, de pouvoir marcher',
 'tout en achetant du lait UHT',
 "'''",
 '',
 'from IPython.display import HTML']



In [18]:

    
s2









    Out[18]:





['import pygments',
 'import pygments.lexers.agile',
 'import pygments.formatters',
 "'''",
 'Donnez, donnez moi, ',
 "le pouvoir d'achat",
 'donnez, donnez moi, ',
 'le super pouvoir',
 'le super pouvoir, de pouvoir marcher',
 'tout en achetant du lait UHT',
 "'''",
 'a multil line added',
 'which shoudl make a green block',
 'from IPython.display import HTML']



In [19]:

    
res = pygments.highlight("""print('hi')
                      1+1
""", PL, HF )

HTML(res)









    Out[19]:




print('hi')
                      1+1



In [20]:

    
import difflib



In [21]:

    
SequenceMatcher









    Out[21]:





difflib.SequenceMatcher



In [22]:

    
fs1 = !git show HEAD:examples/MatchGroups.ipynb
fs2 = !git show HEAD~1:examples/MatchGroups.ipynb



In [23]:

    
ss1 = set(fs1)
ss2 = set(fs2)
us = ss1.intersection(ss2)



In [24]:

    
len(ss1),len(ss2),len(fs1),len(fs2),len(us)









    Out[24]:





(426, 455, 1134, 1143, 342)



In [25]:

    
a,b = zip(*((i,e)for i,e in enumerate('abc')))



In [26]:

    
type(b)









    Out[26]:





tuple



In [27]:

    
something = 'abcdefghij'
vowel = 'aeiouy'
condition = lambda x:x in vowel
#zip(* ((i,_) for i,_ in enumerate(something) if e in vowel)   )



In [28]:

    
#import itertools
#a,b = itertools.tee(zip(*((i,e) for i,e in enumerate(something) if e in us)))
#a,b = itertools.tee(( (_,_) for _ in something))
#list(a),list(b)



In [29]:

    
zip(*((i,e) for i,e in enumerate(something) if e in vowel))









    Out[29]:





<zip at 0x11036f808>



In [30]:

    
a,b = zip(*((i,e) for i,e in enumerate(something) if e in vowel))
a,b









    Out[30]:





((0, 4, 8), ('a', 'e', 'i'))



In [31]:

    
from itertools import chain

def lcsc2(fs1,fs2):
    """ optimise by removing non-common seq """
    ss1 = set(fs1)
    ss2 = set(fs2)
    us = ss1.intersection(ss2)
    s1m,seq1 = zip(*((ii,e) for ii,e in enumerate(fs1) if e in us))
    s2m,seq2 = zip(*((ii,e) for ii,e in enumerate(fs2) if e in us))
    delta = -1
    for ca,cb in zip(seq1,seq2):
        if ca==cb:
            delta=delta+1
        else:
            break
    seq1,seq2 = seq1[delta:],seq2[delta:]
    length, back_index =  _get_lcs_cut2(seq1,seq2)
    back_index = unfold(back_index)
    length = length+delta
    bi = map(lambda x:(x[0]+delta,x[1]+delta), back_index)
    back_index = chain(((i,i) for i in range(delta)),bi)
    
    return length, map(lambda x:(s1m[x[0]],s2m[x[1]]),back_index )



In [32]:

    
#list(zip(*[(ii,_) for ii,_ in enumerate("hello") if _ in {'a','o','h','l'}]))
a,b = tuple(zip(*((ii,_) for ii,_ in enumerate('aaa') if _ in {'a'})))


#tuple(zip(*[(ii,_) for ii,_ in enumerate('aaa') if _ in {'a'}]))



In [33]:

    
lcsc2('aaa','aba')









    Out[33]:





(2, <map at 0x1103754a8>)



In [34]:

    
%timeit uf = unfold(_get_lcs_cut2(fs1,fs2)[1])
%timeit uf = lcsc2(fs1,fs2)
%timeit ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())









    



1 loops, best of 3: 518 ms per loop
1 loops, best of 3: 348 ms per loop
100 loops, best of 3: 5.93 ms per loop



In [35]:

    
lcsc2(fs1,fs2)









    Out[35]:





(931, <map at 0x110375390>)



In [36]:

    
(len(list(unfold(_get_lcs_cut2(fs1,fs2)[1]))),
len(list(lcsc2(fs1,fs2)[1])),
_get_lcs_cut2(fs1,fs2)[0],
lcsc2(fs1,fs2)[0])









    Out[36]:





(931, 931, 931, 931)



In [37]:

    
%matplotlib inline



In [38]:

    
import matplotlib.pyplot as plt
xa, ya = tuple(zip(*unfold(_get_lcs_cut2(fs1,fs2)[1]))) 
xb, yb = tuple(zip(*lcsc2(fs1,fs2)[1])) 
fig,ax = plt.subplots(1,1)
ax.scatter(xa, ya ,linewidths=0)
ax.scatter(xb, yb ,c='red', linewidths=0, s=10)
ax.set_title('selected dominant match point in each case')
ax.set_aspect('equal')
plt.figure()
plt.scatter(xb,xa)
plt.figure()
plt.scatter(ya,yb)









    Out[38]:





<matplotlib.collections.PathCollection at 0x1116afe10>



In [39]:

    
uf = lcsc2(fs1,fs2)[1]
ufs = list(SequenceMatcher(None,fs1,fs2).get_matching_blocks())



In [40]:

    
len(list(uf)),sum([x.size for  x in list(ufs)])









    Out[40]:





(931, 890)



In [41]:

    
uf = unfold(_get_lcs_cut2(s1,s2)[1])



In [42]:

    
from collections import namedtuple



In [43]:

    
Match= namedtuple('Match',('old','new'))
Insertion = namedtuple('Insertion',('old','new'))
Deletion = namedtuple('Deletion',('old','new'))
class SInsertion(Insertion):pass
class SDeletion(Deletion):pass


def matchfill(seq):
    nseq = []
    pi,pj =(-1,-1)
    for i,j in seq[0:]:

        if i > pi+1:
            for k in range(pi+1,i):
                nseq.append(Deletion(k,None))
                pi,pj = pi+1,pj
        if j > pj+1:
            for k in range(pj+1,j):
                nseq.append(Insertion(None,k))
                pi,pj = pi, pj+1
        if (i,j) == (pi+1,pj+1):
            nseq.append(Match(i,j))
            pi,pj = i,j               
                
    return nseq



In [44]:

    
uf = list(uf)



In [45]:

    
matchfill(list(uf))









    Out[45]:





[Deletion(old=0, new=None),
 Deletion(old=1, new=None),
 Deletion(old=2, new=None),
 Deletion(old=3, new=None),
 Match(old=4, new=0),
 Deletion(old=5, new=None),
 Match(old=6, new=1),
 Deletion(old=7, new=None),
 Insertion(old=None, new=2),
 Match(old=8, new=3),
 Deletion(old=9, new=None),
 Insertion(old=None, new=4),
 Match(old=10, new=5),
 Deletion(old=11, new=None),
 Insertion(old=None, new=6),
 Match(old=12, new=7),
 Match(old=13, new=8),
 Match(old=14, new=9),
 Match(old=15, new=10),
 Deletion(old=16, new=None),
 Insertion(old=None, new=11),
 Insertion(old=None, new=12),
 Match(old=17, new=13)]



In [46]:

    
list(zip(*matchfill(list(uf))))









    Out[46]:





[(0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  None,
  8,
  9,
  None,
  10,
  11,
  None,
  12,
  13,
  14,
  15,
  16,
  None,
  None,
  17),
 (None,
  None,
  None,
  None,
  0,
  None,
  1,
  None,
  2,
  3,
  None,
  4,
  5,
  None,
  6,
  7,
  8,
  9,
  10,
  None,
  11,
  12,
  13)]



In [47]:

    
HTML("""
<style>
.d-add:before {
    content: '+ ';
    color:green;
}

.d-add, .d-noth , .d-rem{
    width:100%;
    display:block;
    color:gray;
}

.d-add{
    background-color:#DFD;
}



.d-rem:before {
    content: '- ';
    color:red;
    
}

.d-rem {
    background-color:#FDD;
}

.d-noth:before {
    content: '  ';
    color:lightblue;
}
</style>
""")









    Out[47]:



In [61]:

    
def mark_line_diff(mf):
    sk =False
    for i,l in enumerate(mf):
        if sk:
            yield SInsertion(*l)
            sk=False
            continue
        if isinstance(l, Deletion) and \
           isinstance(mf[i+1], Insertion) and \
           isinstance(mf[i+2], Match):
            sk = True
            yield SDeletion(*l)
        else :
            yield l



In [64]:

    
ts  = ''
mf = mark_line_diff(matchfill(list(uf)))
for m in mf :
    i,j = m
    if isinstance(m, SInsertion) :
        ts = ts+'<span class="d-add">' +'|    |'     +' %02d +'%j+hs2[j]+'</span>'

    elif isinstance(m, Insertion) :
        ts = ts+'<span class="d-add">' +'|    |'     +' %02d |'%j+hs2[j]+'</span>'
    elif isinstance(m, SDeletion) :
        ts = ts+'<span class="d-rem">' +'| %02d |'%i +'    -'    +hs1[i]+'</span>'

    elif isinstance(m, Deletion) :
        ts = ts+'<span class="d-rem">' +'| %02d |'%i +'    |'    +hs1[i]+'</span>'

    else :
        ts = ts+'<span class="d-noth">'+'| %02d |'%i +' %02d |'%j+hs1[i]+'</span>'
HTML('<pre>'+ts+'</pre>')









    Out[64]:




| 00 |    |# really, making a copyright is useless| 01 |    |# but just to test multiline removel I'll do that| 02 |    |# this is copyrighted| 03 |    || 04 | 00 |import pygments| 05 |    |import pygments.lexer| 06 | 01 |import pygments.lexers.agile| 07 |    -|    | 02 +import pygments.formatters| 08 | 03 |'''| 09 |    -donez, donnez moi, |    | 04 +Donnez, donnez moi, | 10 | 05 |le pouvoir d'achat| 11 |    -donez, donnez moi, |    | 06 +donnez, donnez moi, | 12 | 07 |le super pouvoir| 13 | 08 |le super pouvoir, de pouvoir marcher| 14 | 09 |tout en achetant du lait UHT| 15 | 10 |'''| 16 |    ||    | 11 |a multil line added|    | 12 |which shoudl make a green block| 17 | 13 |from IPython.display import HTML



In [55]:

    
uf









    Out[55]:





[(4, 0),
 (6, 1),
 (8, 3),
 (10, 5),
 (12, 7),
 (13, 8),
 (14, 9),
 (15, 10),
 (17, 13)]



In [56]:

    
matchfill(uf)









    Out[56]:





[Deletion(old=0, new=None),
 Deletion(old=1, new=None),
 Deletion(old=2, new=None),
 Deletion(old=3, new=None),
 Match(old=4, new=0),
 Deletion(old=5, new=None),
 Match(old=6, new=1),
 Deletion(old=7, new=None),
 Insertion(old=None, new=2),
 Match(old=8, new=3),
 Deletion(old=9, new=None),
 Insertion(old=None, new=4),
 Match(old=10, new=5),
 Deletion(old=11, new=None),
 Insertion(old=None, new=6),
 Match(old=12, new=7),
 Match(old=13, new=8),
 Match(old=14, new=9),
 Match(old=15, new=10),
 Deletion(old=16, new=None),
 Insertion(old=None, new=11),
 Insertion(old=None, new=12),
 Match(old=17, new=13)]



In [ ]:

    
list(mark_line_diff(matchfill(uf)))



In [ ]: