In [4]:
import urllib2
from bs4 import BeautifulSoup
Beautiful Soup is a Python library designed for quick turnaround projects like screen scraping. Three features make it powerful:
- It provides a few simple methods and Pythonic idioms for navigating, searching, and modifying a parse tree, so it takes very little code to dissect a document and extract what you need.
- It automatically converts incoming documents to Unicode and outgoing documents to UTF-8, so you rarely have to think about encodings.
- It sits on top of popular Python parsers like lxml and html5lib, letting you try different parsing strategies or trade speed for flexibility.
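Before reading files or live pages, here is a minimal, self-contained sketch (the inline HTML snippet is invented purely for illustration) of the basic workflow: build a soup from a string, then pull out a tag, its text, and an attribute.

from bs4 import BeautifulSoup

demo_html = '<html><head><title>Demo</title></head>' \
            '<body><p class="title"><a href="http://example.com" id="link1">A link</a></p></body></html>'
demo_soup = BeautifulSoup(demo_html, 'html.parser')
print demo_soup.title.string   # Demo
print demo_soup.p['class']     # ['title']
print demo_soup.a['href']      # http://example.com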
In [25]:
url = 'file:///Users/chengjun/GitHub/cjc2016/data/test.html'
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, 'html.parser')
soup
Out[25]:
In [26]:
print(soup.prettify())
In [72]:
for tag in soup.find_all(True):
    print(tag.name)
In [58]:
soup('head') # or soup.head
Out[58]:
In [59]:
soup('body') # or soup.body
Out[59]:
In [29]:
soup('title') # or soup.title
Out[29]:
In [60]:
soup('p')
Out[60]:
In [62]:
soup.p
Out[62]:
In [30]:
soup.title.name
Out[30]:
In [31]:
soup.title.string
Out[31]:
In [48]:
soup.title.text
Out[48]:
In [32]:
soup.title.parent.name
Out[32]:
In [33]:
soup.p
Out[33]:
In [34]:
soup.p['class']
Out[34]:
In [50]:
soup.find_all('p', {'class': 'title'})
Out[50]:
In [78]:
soup.find_all('p', class_= 'title')
Out[78]:
In [49]:
soup.find_all('p', {'class': 'story'})
Out[49]:
In [57]:
soup.find_all('p', {'class': 'story'})[0].find_all('a')
Out[57]:
In [35]:
soup.a
Out[35]:
In [79]:
soup('a')
Out[79]:
In [37]:
soup.find(id="link3")
Out[37]:
In [36]:
soup.find_all('a')
Out[36]:
In [80]:
soup.find_all('a', {'class': 'sister'}) # compare with soup.find_all('a')
Out[80]:
In [81]:
soup.find_all('a', {'class': 'sister'})[0]
Out[81]:
In [44]:
soup.find_all('a', {'class': 'sister'})[0].text
Out[44]:
In [46]:
soup.find_all('a', {'class': 'sister'})[0]['href']
Out[46]:
In [47]:
soup.find_all('a', {'class': 'sister'})[0]['id']
Out[47]:
In [71]:
soup.find_all(["a", "b"])
Out[71]:
In [38]:
print(soup.get_text())
In [102]:
url = "http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&\
mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd"
content = urllib2.urlopen(url).read() # fetch the HTML text of the page
soup = BeautifulSoup(content, 'html.parser')
print soup.title.text
print soup.find('div', {'class': 'rich_media_meta_list'}).find(id = 'post-date').text
print soup.find('div', {'class': 'rich_media_content'}).get_text()
In [2]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX width=1000 height=500></iframe>')
# the webpage we would like to crawl
Out[2]:
In [5]:
page_num = 0
url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
content = urllib2.urlopen(url).read() # fetch the HTML text of the page
soup = BeautifulSoup(content, "lxml")
articles = soup.find_all('tr')
In [7]:
print articles[0]
In [8]:
print articles[1]
In [9]:
len(articles[1:])
Out[9]:
http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=PX
In [20]:
for t in articles[1].find_all('td'): print t
In [21]:
td = articles[1].find_all('td')
In [23]:
print td[0]
In [29]:
print td[0].text
In [30]:
print td[0].text.strip()
In [31]:
print td[0].a['href']
In [24]:
print td[1]
In [25]:
print td[2]
In [26]:
print td[3]
In [27]:
print td[4]
In [11]:
records = []
for i in articles[1:]:
    td = i.find_all('td')
    title = td[0].text.strip()
    title_url = td[0].a['href']
    author = td[1].text
    author_url = td[1].a['href']
    views = td[2].text
    replies = td[3].text
    date = td[4]['title']
    record = title + '\t' + title_url + '\t' + author + '\t' + author_url + '\t' + views + '\t' + replies + '\t' + date
    records.append(record)
In [16]:
print records[2]
In [85]:
def crawler(page_num, file_name):
    try:
        # fetch the listing page for this page number
        url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
        content = urllib2.urlopen(url).read() # fetch the HTML text of the page
        soup = BeautifulSoup(content, "lxml")
        articles = soup.find_all('tr')
        # write down the info of each thread
        for i in articles[1:]:
            td = i.find_all('td')
            title = td[0].text.strip()
            title_url = td[0].a['href']
            author = td[1].text
            author_url = td[1].a['href']
            views = td[2].text
            replies = td[3].text
            date = td[4]['title']
            record = title + '\t' + title_url + '\t' + author + '\t' + \
                author_url + '\t' + views + '\t' + replies + '\t' + date
            with open(file_name, 'a') as p:  # Note: append mode, run only once!
                p.write(record.encode('utf-8') + "\n")  # encode as utf-8 to avoid encoding errors
    except Exception, e:
        print e
        pass
In [97]:
# crawl all pages
for page_num in range(10):
    print(page_num)
    crawler(page_num, '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt')
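When looping over many list pages like this, it is gentler on the server to pause between requests. A small variant of the loop above, a sketch that reuses the random pause the post crawler further below also uses (same placeholder output path as above):

import random
import time

for page_num in range(10):
    print(page_num)
    crawler(page_num, '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt')
    time.sleep(1 + random.random())  # wait 1-2 seconds between list pages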
In [304]:
import pandas as pd
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:2]
Out[304]:
In [305]:
len(df)
Out[305]:
In [306]:
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:2]
Out[306]:
In [307]:
len(df.link)
Out[307]:
In [309]:
df.author_page[:5]
Out[309]:
In [408]:
user_info
Out[408]:
In [413]:
# user_info = soup.find('div', {'class': 'userinfo'})('p')
# user_infos = [i.get_text()[4:] for i in user_info]
def author_crawler(url, file_name):
    try:
        content = urllib2.urlopen(url).read() # fetch the HTML text of the page
        soup = BeautifulSoup(content, "lxml")
        link_info = soup.find_all('div', {'class': 'link-box'})
        followed_num, fans_num = [i.a.text for i in link_info]
        try:
            activity = soup.find_all('span', {'class': 'subtitle'})
            post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
        except:
            post_num, reply_num = '1', '0'  # keep these as strings so the join below works
        record = '\t'.join([url, followed_num, fans_num, post_num, reply_num])
        with open(file_name, 'a') as p:  # Note: append mode, run only once!
            p.write(record.encode('utf-8') + "\n")  # encode as utf-8 to avoid encoding errors
    except Exception, e:
        print e, url
        record = '\t'.join([url, 'na', 'na', 'na', 'na'])
        with open(file_name, 'a') as p:  # Note: append mode, run only once!
            p.write(record.encode('utf-8') + "\n")  # encode as utf-8 to avoid encoding errors
        pass
In [414]:
for k, url in enumerate(df.author_page):
    if k % 10 == 0:
        print k
    author_crawler(url, '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_author_info.txt')
In [357]:
url = df.author_page[1]
content = urllib2.urlopen(url).read() # fetch the HTML text of the page
soup1 = BeautifulSoup(content, "lxml")
In [359]:
user_info = soup1.find('div', {'class': 'userinfo'})('p')
area, nid, freq_use, last_login_time, reg_time = [i.get_text()[4:] for i in user_info]
print area, nid, freq_use, last_login_time, reg_time
link_info = soup1.find_all('div', {'class': 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num
In [393]:
activity = soup1.find_all('span', {'class': 'subtitle'})
post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
print post_num, reply_num
In [386]:
print activity[2]
In [370]:
link_info = soup1.find_all('div', {'class': 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num
In [369]:
link_info[0].a.text
Out[369]:
http://www.tianya.cn/50499450/follow
We can also crawl each user's following list and follower list.
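A hedged sketch of that idea: fetch a user's /follow page (the URL pattern shown above) and collect the profile links on it. The link filter below, anchors whose href is a bare numeric tianya.cn profile path, is an assumption about the page layout, not something verified in this notebook.

import re
import urllib2
from bs4 import BeautifulSoup

def crawl_follow_list(user_url):
    # user_url like 'http://www.tianya.cn/50499450'; '/follow' follows the URL shown above
    content = urllib2.urlopen(user_url + '/follow').read()
    follow_soup = BeautifulSoup(content, 'lxml')
    followed = []
    for a in follow_soup.find_all('a', href=True):
        # assumed filter: profile links look like http://www.tianya.cn/<digits>
        if re.match(r'^http://www\.tianya\.cn/\d+$', a['href']):
            followed.append((a.text.strip(), a['href']))
    return followed

# e.g. crawl_follow_list('http://www.tianya.cn/50499450')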
In [13]:
df.link[2]
Out[13]:
In [15]:
url = 'http://bbs.tianya.cn' + df.link[2]
url
Out[15]:
In [20]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/post-free-2848797-1.shtml width=1000 height=500></iframe>')
# the webpage we would like to crawl
Out[20]:
In [18]:
post = urllib2.urlopen(url).read() # fetch the HTML text of the page
post_soup = BeautifulSoup(post, "lxml")
#articles = soup.find_all('tr')
In [123]:
print (post_soup.prettify())[:1000]
In [36]:
pa = post_soup.find_all('div', {'class': 'atl-item'})
len(pa)
Out[36]:
In [38]:
print pa[0]
In [39]:
print pa[1]
In [40]:
print pa[89]
作者:柠檬在追逐 时间:2012-10-28 21:33:55
@lice5 2012-10-28 20:37:17
作为宁波人 还是说一句:革命尚未成功 同志仍需努力
-----------------------------
对 现在说成功还太乐观,就怕说一套做一套
作者:lice5 时间:2012-10-28 20:37:17
作为宁波人 还是说一句:革命尚未成功 同志仍需努力
4 /post-free-4242156-1.shtml 2014-04-09 15:55:35 61943225 野渡自渡人 @Y雷政府34楼2014-04-0422:30:34 野渡君雄文!支持是必须的。 ----------------------------- @清坪过客16楼2014-04-0804:09:48 绝对的权力导致绝对的腐败! ----------------------------- @T大漠鱼T35楼2014-04-0810:17:27 @周丕东@普欣@拾月霜寒2012@小摸包@姚文嚼字@四號@凌宸@乔志峰@野渡自渡人@曾兵2010@缠绕夜色@曾颖@风青扬请关注
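The replies printed above quote earlier posts with '@' mentions followed by a dashed separator. A hedged sketch for pulling the mentioned user names out of a reply's text; the character class for what counts as a user name is an assumption:

import re

def extract_mentions(text):
    # hypothetical helper: grab the names following '@', allowing CJK characters,
    # ASCII letters, digits and underscores (an assumption about Tianya ids)
    return re.findall(u'@([\u4e00-\u9fa5A-Za-z0-9_]+)', text)

# e.g. extract_mentions(pa[89].find('div', {'class': 'bbs-content'}).text)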
In [118]:
print pa[0].find('div', {'class': 'bbs-content'}).text.strip()
In [119]:
print pa[87].find('div', {'class': 'bbs-content'}).text.strip()
In [104]:
pa[1].a
Out[104]:
In [113]:
print pa[0].find('a', class_ = 'reportme a-link')
In [115]:
print pa[0].find('a', class_ = 'reportme a-link')['replytime']
In [114]:
print pa[0].find('a', class_ = 'reportme a-link')['author']
In [122]:
for i in pa[:10]:
    p_info = i.find('a', class_ = 'reportme a-link')
    p_time = p_info['replytime']
    p_author_id = p_info['authorid']
    p_author_name = p_info['author']
    p_content = i.find('div', {'class': 'bbs-content'}).text.strip()
    p_content = p_content.replace('\t', '')
    print p_time, '--->', p_author_id, '--->', p_author_name, '--->', p_content, '\n'
http://bbs.tianya.cn/post-free-2848797-1.shtml
http://bbs.tianya.cn/post-free-2848797-2.shtml
http://bbs.tianya.cn/post-free-2848797-3.shtml
In [126]:
post_soup.find('div', {'class': 'atl-pages'})#['onsubmit']
Out[126]:
In [137]:
post_pages = post_soup.find('div', {'class': 'atl-pages'})
post_pages = post_pages.form['onsubmit'].split(',')[-1].split(')')[0]
post_pages
Out[137]:
In [144]:
url = 'http://bbs.tianya.cn' + df.link[2]
url_base = '-'.join(url.split('-')[:-1]) + '-%d.shtml'
url_base
Out[144]:
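Putting the two pieces together, a quick sanity check, a sketch that assumes post_pages (from In [137]) and url_base (from In [144]) are both still in scope, that enumerates every page URL of this post:

# list every page URL for this post; post_pages is a string here, so cast it
for page_num in range(1, int(post_pages) + 1):
    print url_base % page_num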
In [415]:
def parsePage(pa):
    records = []
    for i in pa:
        p_info = i.find('a', class_ = 'reportme a-link')
        p_time = p_info['replytime']
        p_author_id = p_info['authorid']
        p_author_name = p_info['author']
        p_content = i.find('div', {'class': 'bbs-content'}).text.strip()
        p_content = p_content.replace('\t', '').replace('\n', '')  # .replace(' ', '')
        record = p_time + '\t' + p_author_id + '\t' + p_author_name + '\t' + p_content
        records.append(record)
    return records

import sys

def flushPrint(s):
    # overwrite the current line to show progress without flooding the output
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
In [246]:
url_1 = 'http://bbs.tianya.cn' + df.link[10]
content = urllib2.urlopen(url_1).read() # fetch the HTML text of the page
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class': 'atl-item'})
b = post_soup.find('div', class_= 'atl-pages')
b
Out[246]:
In [247]:
url_1 = 'http://bbs.tianya.cn' + df.link[0]
content = urllib2.urlopen(url_1).read() # fetch the HTML text of the page
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class': 'atl-item'})
a = post_soup.find('div', {'class': 'atl-pages'})
a
Out[247]:
In [251]:
a.form
Out[251]:
In [254]:
if b.form:
    print 'true'
else:
    print 'false'
In [416]:
import random
import time
def crawler(url, file_name):
    try:
        # fetch the first page of the post
        url_1 = 'http://bbs.tianya.cn' + url
        content = urllib2.urlopen(url_1).read() # fetch the HTML text of the page
        post_soup = BeautifulSoup(content, "lxml")
        # how many pages does this post have?
        post_form = post_soup.find('div', {'class': 'atl-pages'})
        if post_form.form:
            post_pages = post_form.form['onsubmit'].split(',')[-1].split(')')[0]
            post_pages = int(post_pages)
            url_base = '-'.join(url_1.split('-')[:-1]) + '-%d.shtml'
        else:
            post_pages = 1
        # parse the first page
        pa = post_soup.find_all('div', {'class': 'atl-item'})
        records = parsePage(pa)
        with open(file_name, 'a') as p:  # Note: append mode, run only once!
            for record in records:
                p.write('1' + '\t' + url + '\t' + record.encode('utf-8') + "\n")
        # parse the 2nd and later pages
        if post_pages > 1:
            for page_num in range(2, post_pages + 1):
                time.sleep(random.random())
                flushPrint(page_num)
                url2 = url_base % page_num
                content = urllib2.urlopen(url2).read() # fetch the HTML text of the page
                post_soup = BeautifulSoup(content, "lxml")
                pa = post_soup.find_all('div', {'class': 'atl-item'})
                records = parsePage(pa)
                with open(file_name, 'a') as p:  # Note: append mode, run only once!
                    for record in records:
                        p.write(str(page_num) + '\t' + url + '\t' + record.encode('utf-8') + "\n")
        else:
            pass
    except Exception, e:
        print e
        pass
In [182]:
url = 'http://bbs.tianya.cn' + df.link[2]
file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_test.txt'
crawler(url, file_name)
In [417]:
for k, link in enumerate(df.link):
    flushPrint(link)
    if k % 10 == 0:
        print 'This is post number: ' + str(k)
    file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt'
    crawler(link, file_name)
In [418]:
dtt = []
with open('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt', 'r') as f:
    for line in f:
        pnum, link, time, author_id, author, content = line.replace('\n', '').split('\t')
        dtt.append([pnum, link, time, author_id, author, content])
len(dtt)
Out[418]:
In [419]:
dt = pd.DataFrame(dtt)
dt[:5]
Out[419]:
In [420]:
dt=dt.rename(columns = {0:'page_num', 1:'link', 2:'time', 3:'author',4:'author_name', 5:'reply'})
dt[:5]
Out[420]:
In [421]:
dt.reply[:100]
Out[421]:
http://search.tianya.cn/bbs?q=PX reports 18,459 items in total.
In [14]:
18459/50
Out[14]:
In practice the listing runs out at page 10 (http://bbs.tianya.cn/list.jsp?item=free&order=1&nextid=9&k=PX). It turns out that was only the Tianya BBS board; there are various other sections as well, such as Tianya Focus (http://focus.tianya.cn/), and so on.