In [7]:
from lxml import etree

In [8]:
from bs4 import BeautifulSoup

In [24]:
import re

In [18]:
# Turn on matplotlib's interactive integration with no explicit backend named,
# so IPython picks the default GUI backend (reported in the output below).
%matplotlib


Using matplotlib backend: TkAgg

In [9]:
# Benchmark fixture: a static copy of the Baidu home page markup (see its
# <title> and URLs). The timing cells below parse this string from scratch on
# every loop. The first <img> in document order is the logo inside
# <div id="lg">, which is the element every benchmark reads `src` from.
html = '''
<!DOCTYPE html>
<html>
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta content="never" name="referrer"><title>百度一下,你就知道</title>
</head>
<body link="#0000cc" style="display:block">
  <div id="wrapper">
    <div id="head">
      <div class="head_wrapper">
        <div class="s_form">
          <div class="s_form_wrapper">
            <div id="lg">
              <img hidefocus="true" src="//www.baidu.com/img/bd_logo1.png" width="270" height="129">
            </div><a href="/" id="result_logo">
              <img src="http://www.baidu.com/img/baidu_jgylogo3.gif" alt="到百度首页" title="到百度首页">
            </a><form id="form" name="f" action="https://www.baidu.com/s" class="fm" onsubmit="os();return false;">
              <input type="hidden" name="ie" value="utf-8">
              <input type="hidden" name="f" value="8">
              <input type="hidden" name="rsv_bp" value="1">
              <input type="hidden" name="rsr" value="1">
              <input type="hidden" name="rsv_idx" value="1">
              <input type=hidden name="ch" value="">
              <input type=hidden name="rv_sd" value="13660_13387_13439_13203_13287_13602_13161_13257_11558_13085_8498">
              <input type=hidden name="xpth" value="http%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26tn%3D%26wd%3Djquery%2520get%26rsv_pq%3D85cb9ab80001cc79%26rsv_t%3D5e5dhOzL7ZokcuO8mFu469Q5e9REnkJL1CKxnvfTLS5NLzBtEZvCU0fuv4s%26rsv_enter%3D1%26rsv_sug3%3D11%26rsv_sug1%3D7%26rsv_sug2%3D0%26inputT%3D2736%26rsv_sug4%3D4394 ">
              <input type=hidden name="tn" id="idtn" value="baidu">
              <input type=hidden name="bar" value="">
              <span class="bg s_ipt_wr">
                <input id="kw" name="word" class="s_ipt" value="" maxlength="255" autocomplete="off">
              </span>
              <span class="bg s_btn_wr">
                <input type="submit" id="su" value="百度一下" class="bg s_btn">
              </span>
              <span class="tools">
                <span id="mHolder">
                  <div id="mCon">
                    <span>输入法</span>
                  </div>
                  <ul id="mMenu">
                    <li>
                      <a href="javascript:;" name="ime_hw">手写</a>
                    </li>
                    <li>
                      <a href="javascript:;" name="ime_py">拼音</a>
                    </li>
                    <li class="ln">

                    </li>
                    <li>
                      <a href="javascript:;" name="ime_cl">关闭</a>
                    </li>
                  </ul>
                </span>
              </span>
              <input type="hidden" name="rn" value="">
            </form>
          </div>
        </div>
        <div id="u1">
          <a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
          <a href="http://www.hao123.com" name="tj_trhao123" class="mnav">hao123</a>
          <a href="http://map.baidu.com" name="tj_trmap" class="mnav">地图</a>
          <a href="http://v.baidu.com" name="tj_trvideo" class="mnav">视频</a>
          <a href="http://tieba.baidu.com" name="tj_trtieba" class="mnav">贴吧</a>
          <a href="http://www.baidu.com/more/" name="tj_briicon" class="bri" style="display: block;">更多产品</a>
        </div>
      </div>
    </div>
    <div id="ftCon">
      <div id="ftConw">
        <p id="lh">
          <a href="http://home.baidu.com">关于百度</a>
          <a href="http://ir.baidu.com">About Baidu</a>
        </p>
        <p id="cp">&copy;2017&nbsp;Baidu&nbsp;
          <a href="http://www.baidu.com/duty/">使用百度前必读</a>&nbsp;<a href="http://jianyi.baidu.com/">意见反馈</a>&nbsp;京ICP证030173号&nbsp;
          <img src="http://www.baidu.com/img/gs.gif"></p>
        </div>
      </div>
    </div>
  </body>
  </html>

'''

In [10]:
# Baseline: lxml on its own. Each of the 1000 loops re-parses `html` and then
# selects the src attribute of the <img> under id="lg" with XPath, so the
# reported time covers parsing plus lookup. Note that xpath() yields a list.
%timeit -n 1000 etree.HTML(html).xpath('//*[@id="lg"]/img/@src')


1000 loops, best of 3: 285 µs per loop

In [17]:
# BeautifulSoup using the 'lxml' tree builder: re-parse `html` on every loop,
# then take the src of the first <img> in the document (the same logo image
# the XPath benchmark targets).
%timeit -n 1000 BeautifulSoup(html, 'lxml').find('img').get('src')


1000 loops, best of 3: 1.82 ms per loop

In [28]:
# Same lookup as the other BeautifulSoup benchmarks, but with the 'html5lib'
# tree builder; parsing is again included in every timed loop.
%timeit -n 1000 BeautifulSoup(html, 'html5lib').find('img').get('src')


1000 loops, best of 3: 6.35 ms per loop

In [29]:
# Same lookup as the other BeautifulSoup benchmarks, but with the
# 'html.parser' tree builder; parsing is again included in every timed loop.
%timeit -n 1000 BeautifulSoup(html, 'html.parser').find('img').get('src')


1000 loops, best of 3: 2.55 ms per loop

In [25]:
from pandas import Series

In [33]:
# Best-of-3 per-loop times copied by hand from the %timeit outputs above,
# all expressed in microseconds (285 us, 1.82 ms, 6.35 ms, 2.55 ms).
parser_labels = ['xpath', 'bs4_lxml', 'bs4_html5lib', 'bs4_html.parser']
best_loop_us = [285, 1820, 6350, 2550]
data = Series(best_loop_us, index=parser_labels)

In [34]:
# Order the timings from fastest to slowest so the bar chart reads left to
# right. Rebind `data` to the sorted copy rather than sorting with
# inplace=True, so the Series built in the previous cell is not mutated
# behind the reader's back.
data = data.sort_values()

In [35]:
# Bar chart of the per-loop timings. The title and axis labels make the
# figure readable on its own, including the unit of the values.
ax = data.plot(kind='bar', title='Time to parse the page and read one img src')
ax.set_xlabel('parser')
ax.set_ylabel('time per loop (microseconds)')
# Leave the Axes as the last expression so the cell still displays it.
ax


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe5c2f7fd0>

In [ ]: