In [14]:
import urllib2
from bs4 import BeautifulSoup
Beautiful Soup is a Python library designed for quick turnaround projects like screen-scraping. Three features make it powerful: it provides simple methods for navigating, searching, and modifying a parse tree; it automatically converts incoming documents to Unicode; and it sits on top of popular parsers such as html.parser and lxml.
In [25]:
url = 'file:///Users/chengjun/GitHub/cjc2016/data/test.html'  # NOTE(review): absolute local path -- only resolves on the author's machine; consider a path relative to the repo
content = urllib2.urlopen(url).read()  # read the raw HTML bytes (urllib2 is Python 2 only; urllib.request in Python 3)
soup = BeautifulSoup(content, 'html.parser')  # parse with the stdlib html.parser backend
soup  # bare last expression: the notebook renders the parsed tree
Out[25]:
Beautiful Soup supports the html.parser included in Python’s standard library,
but it also supports a number of third-party Python parsers. One of them is the lxml
parser. Depending on your setup, you might install lxml with one of these commands:
$ apt-get install python-lxml
$ easy_install lxml
$ pip install lxml
In [26]:
print(soup.prettify())
In [72]:
# List the name of every tag in the document.
# find_all(True) matches all tags regardless of name.
# Fix: the export had dropped the loop-body indentation, which is a
# SyntaxError in Python; the print must be indented under the for.
for tag in soup.find_all(True):
    print(tag.name)
In [58]:
soup('head') # or soup.head -- calling the soup object is shorthand for find_all('head')
Out[58]:
In [59]:
soup('body') # or soup.body
Out[59]:
In [29]:
soup('title') # or soup.title
Out[29]:
In [60]:
soup('p')  # every <p> tag in the document, as a list
Out[60]:
In [62]:
soup.p  # attribute access returns only the FIRST matching tag
Out[62]:
In [30]:
soup.title.name  # the tag's name as a string ('title')
Out[30]:
In [31]:
soup.title.string  # the tag's single string child (None when the tag has mixed content)
Out[31]:
In [48]:
soup.title.text  # concatenated text of all descendants
Out[48]:
In [32]:
soup.title.parent.name  # name of the enclosing tag
Out[32]:
In [33]:
soup.p  # first paragraph again
Out[33]:
In [34]:
soup.p['class']  # tag attributes are subscriptable; multi-valued attrs come back as a list
Out[34]:
In [50]:
soup.find_all('p', {'class', 'title'})
Out[50]:
In [78]:
soup.find_all('p', class_= 'title')
Out[78]:
In [49]:
soup.find_all('p', {'class', 'story'})
Out[49]:
In [57]:
soup.find_all('p', {'class', 'story'})[0].find_all('a')
Out[57]:
In [35]:
soup.a  # first <a> tag only
Out[35]:
In [79]:
soup('a')  # all <a> tags (shorthand for find_all('a'))
Out[79]:
In [37]:
soup.find(id="link3")  # search by the id attribute; find returns the first match
Out[37]:
In [36]:
soup.find_all('a')
Out[36]:
In [80]:
soup.find_all('a', {'class', 'sister'}) # compare with soup.find_all('a')
Out[80]:
In [81]:
soup.find_all('a', {'class', 'sister'})[0]
Out[81]:
In [44]:
soup.find_all('a', {'class', 'sister'})[0].text
Out[44]:
In [46]:
soup.find_all('a', {'class', 'sister'})[0]['href']
Out[46]:
In [47]:
soup.find_all('a', {'class', 'sister'})[0]['id']
Out[47]:
In [71]:
soup.find_all(["a", "b"])  # a list of names matches any of the given tags
Out[71]:
In [38]:
print(soup.get_text())  # all human-readable text in the document, tags stripped
In [11]:
from IPython.display import display_html, HTML
# Fix: quote the attribute values. Unquoted HTML attribute values must not
# contain '=' (HTML5), and an unquoted src would otherwise end at the first
# whitespace; the URL itself is unchanged byte-for-byte.
HTML('<iframe src="http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&\
mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd"\
 width="500" height="500"></iframe>')
# the webpage we would like to crawl
Out[11]:
In [15]:
url = "http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&\
mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd"
html = urllib2.urlopen(url).read()  # fetch the page's raw HTML (urllib2 is Python 2 only)
soup = BeautifulSoup(html, 'html.parser')
title = soup.title.text
# Fix: attrs must be a dict {'class': value}; the original set literal
# {'class', value} does not filter by class.
rmml = soup.find('div', {'class': 'rich_media_meta_list'})
date = rmml.find(id='post-date').text  # publication date lives in the meta list
rmc = soup.find('div', {'class': 'rich_media_content'})
body_text = rmc.get_text()  # article body, tags stripped
# print(x) with a single argument behaves the same under Python 2 and 3,
# and matches the print(...) calls used earlier in this notebook.
print(title)
print(date)
print(body_text)