In [262]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import re
from html import unescape
from collections import OrderedDict

from lxml import etree

#from longest_common_subsequence import lcs

def strip_tags(markup):
    """
    Remove tags from the markup string
    >>> strip_tags('<a href="123">hell<b>o</b></a><br/>')
    hello
    """
    tag = r'</?\w+( .*?)?/?>'
    return re.sub(tag, '', markup)

def peel(markup):
    """
    Remove the outmost markup tags
    """
    tagpair =r'(?s)^\s*<(\w+)( .*)?>(.*)</\1>\s*$'
    content = re.sub(tagpair, r'\3', markup)
    return content.strip()

def similarity(a, b):
    """
    Return the similarity of two strings based on longest_common_subsequence
    """
    return float(len(lcs(a, b))) / max(len(a), len(b))

def score(a, b):
    """
    Return a herustic score of a and b
    """
    return similarity(a, b)
    
filters = OrderedDict((
            ('none', lambda x: x),
            ('strip_tags', strip_tags),
            ('unescape', unescape)
            ))
class Model(dict):
    """
    The model is dict which holds a flat representation of xml or html tree, mapping from xpath to content
    """
    
    def search(self, query, n=None, threshold=None):
        """
        Given content, return the first n results ranked by their scores.
        If the threshold is set, only the results which score is greater than the threshold will be returned.
        """
        candidates = []
        for xpath, content in self.items():
            score2filter = dict()
            for name, filtfunc in list(filters.items())[::-1]:
                the_score = score(query, filtfunc(content))
                score2filter[the_score] = name
            best_result = sorted(list(score2filter.items()), key=lambda i: i[0], reverse=True)[0]
            best_score, best_filter = best_result
            candidates.append((best_score, best_filter, xpath))
        
        candidates.sort(key=lambda c: c[0], reverse=True)
        
        if threshold is not None:
            candidates = [c for c in candidates if c[0] >=threshold]
            
        if n is None:
            return candidates
        else:
            return candidates[:n]
            
        
    @classmethod
    def fromlxml(cls, xml):
        """
        A factory which returns a Model from an XML or HTML tree (from lxml)
        """
        model = cls()
        tree = etree.ElementTree(xml)
        for node in tree.iter():
            raw_content = etree.tostring(node, with_tail=False, encoding='unicode')
            xpath = tree.getpath(node)
            content = peel(raw_content)
            model[xpath] = content
            
        return model

In [263]:
text = """<html>
    <head>
    </head>
    <body>
        <h1>Title</h1>
        
        <p>hell<b>o</b></p>
        <p>hello</p>
        <p>w<b>o</b>rld</p>
        <br />
        <p>bob</p>
        tail
    </body>
</html>"""
xml = etree.fromstring(text)

model = Model.fromlxml(xml)

In [264]:
model.search('hello')


Out[264]:
[(1.0, 'none', '/html/body/p[2]'),
 (1.0, 'strip_tags', '/html/body/p[1]'),
 (0.2, 'none', '/html/body/h1'),
 (0.2, 'strip_tags', '/html/body/p[3]'),
 (0.2, 'none', '/html/body/p[1]/b'),
 (0.2, 'none', '/html/body/p[4]'),
 (0.2, 'none', '/html/body/p[3]/b'),
 (0.05555555555555555, 'strip_tags', '/html/body'),
 (0.043859649122807015, 'strip_tags', '/html'),
 (0.0, 'none', '/html/head'),
 (0.0, 'none', '/html/body/br')]

In [249]:
text = """<html>
    <head>
    </head>
    <body>
        <h1>Title</h1>
        
        <p>hell<b>o</b></p>
        <p>hello</p>
        <p>w<b>o</b>rld</p>
        <br />
        <p>bob</p>
        tail
    </body>
</html>"""
xml = etree.fromstring(text)
model = Model(xml)

In [251]:
model.search('hello')


Out[251]:
[]

In [169]:
def peel(markup):
    """
    Remove the outmost markup tags
    """
    tagpair =r'(?s)^\s*<(\w+)( .*)?>(.*)</\1>\s*$'
    content = re.sub(tagpair, r'\3', markup)
    return content.strip()

In [152]:
re.findall(tagpair, text)


Out[152]:
[('a', '', '\n  <b>x</b>\n')]

In [153]:
re.match(tagpair, text.strip())


Out[153]:
<_sre.SRE_Match object; span=(0, 19), match='<a>\n  <b>x</b>\n</a>'>

In [154]:
peel(text)


Out[154]:
'<b>x</b>'

In [157]:
bt = b'abc'

In [160]:
bt.decode()


Out[160]:
'abc'

In [ ]: