In [7]:
from lxml import etree
In [8]:
from bs4 import BeautifulSoup
In [24]:
import re
In [18]:
# Enable IPython's matplotlib integration for the bar chart drawn in the last cell.
# NOTE(review): with no backend argument IPython selects the default GUI backend, so the
# figure may open in a separate window rather than in the notebook; `%matplotlib inline`
# would embed it instead — confirm which behaviour is intended.
%matplotlib
In [9]:
# Fixed HTML document (page title "百度一下,你就知道") used as the shared input for every
# parser timing cell below, so all parsers are measured against identical markup.
# The first <img> in the markup is the one inside <div id="lg">, which is why the XPath
# query on id="lg" and the BeautifulSoup `find('img')` calls target the same element.
html = '''
<!DOCTYPE html>
<html>
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta content="never" name="referrer"><title>百度一下,你就知道</title>
</head>
<body link="#0000cc" style="display:block">
<div id="wrapper">
<div id="head">
<div class="head_wrapper">
<div class="s_form">
<div class="s_form_wrapper">
<div id="lg">
<img hidefocus="true" src="//www.baidu.com/img/bd_logo1.png" width="270" height="129">
</div><a href="/" id="result_logo">
<img src="http://www.baidu.com/img/baidu_jgylogo3.gif" alt="到百度首页" title="到百度首页">
</a><form id="form" name="f" action="https://www.baidu.com/s" class="fm" onsubmit="os();return false;">
<input type="hidden" name="ie" value="utf-8">
<input type="hidden" name="f" value="8">
<input type="hidden" name="rsv_bp" value="1">
<input type="hidden" name="rsr" value="1">
<input type="hidden" name="rsv_idx" value="1">
<input type=hidden name="ch" value="">
<input type=hidden name="rv_sd" value="13660_13387_13439_13203_13287_13602_13161_13257_11558_13085_8498">
<input type=hidden name="xpth" value="http%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26tn%3D%26wd%3Djquery%2520get%26rsv_pq%3D85cb9ab80001cc79%26rsv_t%3D5e5dhOzL7ZokcuO8mFu469Q5e9REnkJL1CKxnvfTLS5NLzBtEZvCU0fuv4s%26rsv_enter%3D1%26rsv_sug3%3D11%26rsv_sug1%3D7%26rsv_sug2%3D0%26inputT%3D2736%26rsv_sug4%3D4394 ">
<input type=hidden name="tn" id="idtn" value="baidu">
<input type=hidden name="bar" value="">
<span class="bg s_ipt_wr">
<input id="kw" name="word" class="s_ipt" value="" maxlength="255" autocomplete="off">
</span>
<span class="bg s_btn_wr">
<input type="submit" id="su" value="百度一下" class="bg s_btn">
</span>
<span class="tools">
<span id="mHolder">
<div id="mCon">
<span>输入法</span>
</div>
<ul id="mMenu">
<li>
<a href="javascript:;" name="ime_hw">手写</a>
</li>
<li>
<a href="javascript:;" name="ime_py">拼音</a>
</li>
<li class="ln">
</li>
<li>
<a href="javascript:;" name="ime_cl">关闭</a>
</li>
</ul>
</span>
</span>
<input type="hidden" name="rn" value="">
</form>
</div>
</div>
<div id="u1">
<a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
<a href="http://www.hao123.com" name="tj_trhao123" class="mnav">hao123</a>
<a href="http://map.baidu.com" name="tj_trmap" class="mnav">地图</a>
<a href="http://v.baidu.com" name="tj_trvideo" class="mnav">视频</a>
<a href="http://tieba.baidu.com" name="tj_trtieba" class="mnav">贴吧</a>
<a href="http://www.baidu.com/more/" name="tj_briicon" class="bri" style="display: block;">更多产品</a>
</div>
</div>
</div>
<div id="ftCon">
<div id="ftConw">
<p id="lh">
<a href="http://home.baidu.com">关于百度</a>
<a href="http://ir.baidu.com">About Baidu</a>
</p>
<p id="cp">©2017 Baidu
<a href="http://www.baidu.com/duty/">使用百度前必读</a> <a href="http://jianyi.baidu.com/">意见反馈</a> 京ICP证030173号
<img src="http://www.baidu.com/img/gs.gif"></p>
</div>
</div>
</div>
</body>
</html>
'''
In [10]:
# Benchmark 1: parse `html` with lxml's etree.HTML and select the src attribute of the
# <img> directly under the element with id="lg" via XPath.
# `-n 1000` executes the statement 1000 times per timing loop; the parse is redone on
# every execution, so the measurement covers parsing plus lookup.
%timeit -n 1000 etree.HTML(html).xpath('//*[@id="lg"]/img/@src')
In [17]:
# Benchmark 2: BeautifulSoup using the 'lxml' parser backend; builds the soup from `html`
# and reads the src attribute of the first <img> it finds (1000 executions per loop).
%timeit -n 1000 BeautifulSoup(html, 'lxml').find('img').get('src')
In [28]:
# Benchmark 3: BeautifulSoup using the 'html5lib' parser backend; builds the soup from
# `html` and reads the src attribute of the first <img> it finds (1000 executions per loop).
%timeit -n 1000 BeautifulSoup(html, 'html5lib').find('img').get('src')
In [29]:
# Benchmark 4: BeautifulSoup using the 'html.parser' backend; builds the soup from `html`
# and reads the src attribute of the first <img> it finds (1000 executions per loop).
%timeit -n 1000 BeautifulSoup(html, 'html.parser').find('img').get('src')
In [25]:
from pandas import Series
In [33]:
# One timing figure per benchmarked approach, keyed by the label shown on the chart.
# NOTE(review): the numbers appear to be hand-copied from the %timeit cells above and
# their unit is not recorded in this notebook — confirm before adding axis labels.
data = Series({
    'xpath': 285,
    'bs4_lxml': 1820,
    'bs4_html5lib': 6350,
    'bs4_html.parser': 2550,
})
In [34]:
# Order the timings ascending so the bar chart reads fastest -> slowest.
# Rebind the sorted copy instead of mutating the Series in place: `inplace=True`
# gives no benefit here and in-place edits make notebook state harder to reason about.
data = data.sort_values()
In [35]:
# Draw the (already sorted) timings as a vertical bar chart, one bar per parser label.
# Left as the cell's last expression so the returned object is still shown as the output.
data.plot.bar()
Out[35]:
In [ ]: