In [2]:
from lxml import etree

In [3]:
# Small HTML document used as the fixture for the XPath queries in this notebook.
# It contains one of each node kind the queries target: element text
# (<title>, <p>), an attribute (<a href>), an element selected by an attribute
# predicate (<input name="_xsrf">), and a comment node.
# The <h2> is explicitly closed so the markup is well-formed and the resulting
# tree does not depend on the HTML parser's error recovery.
sample = """
<html>
    <head>
        <title>My page title</title>
    </head>
    <body>
        <h2>Hello <a href="#">World</a></h2>
        <p>a paragraph</p>
        <input type="hidden" name="_xsrf" value="sdakjflkajdfskl" />
        <!-- comments -->
    </body>
</html>
"""

In [5]:
# Parse the HTML string with lxml's HTML parser; `s1` is the object every
# XPath query in the following cells is evaluated against.
s1 = etree.HTML(sample)

In [6]:
# Absolute path from the document root down to <title>; text() selects the
# element's text nodes, so the result is a list of strings.
s1.xpath('/html/head/title/text()')


Out[6]:
['My page title']

In [7]:
# A leading '//' matches <p> elements at any depth in the document;
# text() returns their text nodes.
s1.xpath('//p/text()')


Out[7]:
['a paragraph']

In [11]:
# '@href' selects the attribute value (not the element) of every <a> that is
# a direct child of an <h2>.
s1.xpath('//h2/a/@href')


Out[11]:
['#']

In [13]:
# The predicate [@name="_xsrf"] keeps only <input> elements whose name
# attribute equals "_xsrf"; '@value' then reads that element's value attribute.
s1.xpath('//input[@name="_xsrf"]/@value')


Out[13]:
['sdakjflkajdfskl']

In [14]:
# comment() is a node test that matches comment nodes; with '//' it collects
# them from anywhere in the document. The result holds node objects, not strings.
s1.xpath('//comment()')


Out[14]:
[<!-- comments -->]