In [2]:
from lxml import etree
In [3]:
# Small HTML fixture for the XPath queries in this notebook. It holds one of
# each node kind that gets queried: a <title>, an <h2> wrapping a link, a <p>,
# a hidden <input> with a `value` attribute, and an HTML comment.
# The <h2> is closed explicitly so the parsed tree does not depend on the
# parser's recovery rules for unclosed elements.
sample = """
<html>
<head>
<title>My page title</title>
</head>
<body>
<h2>Hello <a href="#">World</a></h2>
<p>a paragraph</p>
<input type="hidden" name="_xsrf" value="sdakjflkajdfskl" />
<!-- comments -->
</body>
</html>
"""
In [5]:
# Parse the HTML string with lxml's HTML parser; `s1` is the root element of
# the resulting tree and is what the XPath cells below are evaluated against.
s1 = etree.fromstring(sample, parser=etree.HTMLParser())
In [6]:
# Absolute path from the document root: text node(s) of <title> inside <head>.
s1.xpath("/html/head/title/text()")
Out[6]:
In [7]:
# `//` searches at any depth: text node(s) directly inside every <p> element.
s1.xpath("//p/text()")
Out[7]:
In [11]:
# `@href` selects the attribute itself: the href of each <a> that is a direct
# child of an <h2>, wherever that <h2> sits in the document.
s1.xpath("//h2/a/@href")
Out[11]:
In [13]:
# The predicate `[@name="_xsrf"]` keeps only <input> elements whose name
# attribute equals "_xsrf"; `@value` then selects their value attribute.
# Single-quoted Python literal because the XPath itself contains double quotes.
s1.xpath(
    '//input[@name="_xsrf"]/@value'
)
Out[13]:
In [14]:
# `comment()` is an XPath node test: select every comment node (<!-- ... -->)
# at any depth in the document.
s1.xpath("//comment()")
Out[14]: