In [1]:
import ast

import requests
from lxml import html
In [2]:
%%HTML
<html>
<body>
<h1>Favorite Python Libraries</h1>
<ul>
<li>Numpy</li>
<li>Pandas</li>
<li>requests</li>
</ul>
</body>
</html>
In [3]:
# IPython keeps the %%HTML cell in `In` as the call it was rewritten to, e.g.
#   get_ipython().run_cell_magic('HTML', '', '<html>\n<body>...')
# Parse that call and evaluate its last argument (the cell body) instead of
# slicing at fixed character offsets: the offsets depend on the exact
# prefix/suffix of the recorded text, and a manual "\\n" replace leaves any
# other escape sequence undecoded.
magic_call = ast.parse(In[2].strip(), mode="eval").body
html_code = ast.literal_eval(magic_call.args[-1])
print(html_code)
doc = html.fromstring(html_code)
In [4]:
# Absolute XPath to the page heading; xpath() hands back a list of matches,
# so keep the first one.
h1_matches = doc.xpath("/html/body/h1")
title = h1_matches[0]
title
Out[4]:
To read the text inside that tag, use the element's `text` attribute.
In [5]:
# `title` is the <h1> element selected in the previous cell; `.text` gives the
# text stored directly inside it.
title.text
Out[5]:
Another way to read the text is to use the text()
function in XPath.
In [6]:
# text() selects the heading's text node directly, so the result is a string
# rather than an element.  It is stored under its own name so that `title`
# keeps referring to the <h1> element and the earlier `title.text` cell can
# still be re-run after this one.
title_text = doc.xpath("/html/body/h1/text()")[0]
title_text
Out[6]:
In [7]:
# Every <li> directly under the <ul> in <body>, returned as matched nodes.
li_xpath = "/html/body/ul/li"
item_list = doc.xpath(li_xpath)
item_list
Out[7]:
We can also use the text()
function when the path matches multiple items.
In [8]:
# Parse the markup again, then collect the text node of each <li> reached
# through the full absolute path.
doc = html.fromstring(html_code)
li_text_xpath = "/html/body/ul/li/text()"
item_list = doc.xpath(li_text_xpath)
item_list
Out[8]:
In [9]:
# Same query written with the `//` shorthand: match <li> elements at any
# depth instead of spelling out the path from the root.
doc = html.fromstring(html_code)
any_li_text_xpath = "//li/text()"
item_list = doc.xpath(any_li_text_xpath)
item_list
Out[9]:
In [10]:
# The [1] predicate keeps only the first <li> (XPath positions start at 1).
doc = html.fromstring(html_code)
first_li_text_xpath = "/html/body/ul/li[1]/text()"
item_list = doc.xpath(first_li_text_xpath)
item_list
Out[10]:
In [11]:
%%HTML
<html>
<body>
<h1 class="text-muted">Favorite Python Libraries</h1>
<ul class="nav nav-pills nav-stacked">
<li role="presentation"><a href="http://www.numpy.org/">Numpy</a></li>
<li role="presentation"><a href="http://pandas.pydata.org/">Pandas</a></li>
<li role="presentation"><a href="http://python-requests.org/">requests</a></li>
</ul>
<h1 class="text-success">Favorite JS Libraries</h1>
<ul class="nav nav-tabs">
<li role="presentation"><a href="http://getbootstrap.com/">Bootstrap</a></li>
<li role="presentation"><a href="https://jquery.com/">jQuery</a></li>
<li role="presentation"><a href="http://d3js.org/">d3.js</a></li>
</ul>
</body>
</html>
In [12]:
# IPython keeps the %%HTML cell in `In` as the call it was rewritten to, e.g.
#   get_ipython().run_cell_magic('HTML', '', '<html>\n<body>...')
# Parse that call and evaluate its last argument (the cell body) instead of
# slicing at fixed character offsets: the offsets depend on the exact
# prefix/suffix of the recorded text, and a manual "\\n" replace leaves any
# other escape sequence undecoded.
magic_call = ast.parse(In[11].strip(), mode="eval").body
html_code = ast.literal_eval(magic_call.args[-1])
print(html_code)
doc = html.fromstring(html_code)
In [13]:
# Pick the heading by its class attribute and take its text node; the first
# match is the one we want.
muted_h1_text = doc.xpath("/html/body/h1[@class='text-muted']/text()")
title = muted_h1_text[0]
title
Out[13]:
In [14]:
# contains() matches the <ul> whose class attribute includes 'nav-stacked'
# among its other classes; collect the link text of each entry.
stacked_link_text_xpath = "/html/body/ul[contains(@class,'nav-stacked')]/li/a/text()"
item_list = doc.xpath(stacked_link_text_xpath)
item_list
Out[14]:
In [15]:
# Same list as before, but select each link's href attribute instead of its
# text.
stacked_link_href_xpath = "/html/body/ul[contains(@class,'nav-stacked')]/li/a/@href"
item_list = doc.xpath(stacked_link_href_xpath)
item_list
Out[15]:
Read the list of languages with 1M+ articles on http://www.wikipedia.org/
In [16]:
# Fetch the Wikipedia portal page.  The timeout stops the cell from hanging
# forever on a stalled connection, and raise_for_status() reports an HTTP
# error here instead of letting an error page be parsed below.
response = requests.get("https://www.wikipedia.org", timeout=10)
response.raise_for_status()
# Parse the raw bytes, telling lxml explicitly to decode them as UTF-8.
doc = html.fromstring(response.content, parser=html.HTMLParser(encoding="utf-8"))
In [17]:
# Link text of every entry in the <ul> under the div whose class attribute is
# exactly 'langlist langlist-large hlist'.
# NOTE(review): this depends on the live page's markup - confirm the class
# names still match.
large_langlist_xpath = "//div[@class='langlist langlist-large hlist'][1]/ul/li/a/text()"
lang_list = doc.xpath(large_langlist_xpath)
lang_list
Out[17]:
In [ ]: