In [1]:
import urllib2
from bs4 import BeautifulSoup
In [2]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/ width=1000 height=500></iframe>')
Out[2]:
In [8]:
url ="http://www.hprc.org.cn/wxzl/wxysl/lczf/"
content=urllib2.urlopen(url).read().decode('gb18030')
soup= BeautifulSoup(content,'html.parser')
links=soup.find_all('td',{'class','bl'})
print len(links)
In [12]:
hyperlinks=[url+i.a['href'].split('./')[1] for i in links]
In [15]:
hyperlinks
Out[15]:
In [18]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201003/t20100315_44772.html \
width=1000 height=500></iframe>')
Out[18]:
In [26]:
url_i="http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201003/t20100315_44772.html"
content = urllib2.urlopen(url_i).read().decode('gb18030')
soup = BeautifulSoup(content,'html.parser')
scripts=soup.find_all('script')
In [28]:
countPage= int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])
countPage
Out[28]:
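The split() chain above depends on the exact text inside the script block. A minimal regex sketch (not from the original notebook, assuming the block still contains something like "countPage = 3//..."):
In [ ]:
import re

# hedged alternative to the split() chain above
m = re.search(r'countPage\s*=\s*(\d+)', ''.join(scripts[1]))
countPage = int(m.group(1)) if m else 1
countPage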
In [ ]:
def crawler(url_i):
    # first page of the report
    content = urllib2.urlopen(url_i).read().decode('gb18030')
    soup = BeautifulSoup(content, 'html.parser')
    year = soup.find('span', {'class': 'huang16c'}).text[:4]
    year = int(year)
    report = ''.join(s.text for s in soup('p'))
    # find the pagination info
    scripts = soup.find_all('script')
    countPage = int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])
    if countPage == 1:
        pass
    else:
        # follow-up pages are named <first page>_1.html, _2.html, ...
        for i in range(1, countPage):
            url_child = url_i.split('.html')[0] + '_' + str(i) + '.html'
            content = urllib2.urlopen(url_child).read().decode('gb18030')
            soup = BeautifulSoup(content, 'html.parser')
            report_child = ''.join(s.text for s in soup('p'))
            report = report + report_child
    return year, report
In [37]:
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    reports[year] = report

with open('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/gov_reports1954-2016.txt', 'wb') as f:
    for r in reports:
        line = str(r) + '\t' + reports[r].replace('\n', '\t') + '\n'
        f.write(line.encode('utf-8'))
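A minimal sketch (not in the original notebook) for reading the saved file back in, assuming the same path. Each line starts with the year, and the report's own newlines were replaced by tabs, so only the first tab is used as the separator:
In [ ]:
reports_back = {}
with open('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/gov_reports1954-2016.txt', 'r') as f:
    for line in f:
        # split once: year, then the full report text
        year, text = line.decode('utf-8').strip().split('\t', 1)
        reports_back[int(year)] = text
len(reports_back)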
In [1]:
import urllib2
from bs4 import BeautifulSoup
In [2]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=NBA width=1000 height=500></iframe>')
Out[2]:
In [39]:
page_num = 0
url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=NBA" % page_num
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, "lxml")
articles = soup.find_all('tr')
In [40]:
len(articles)
Out[40]:
In [41]:
def crawler(page_num, file_name):
    try:
        # each list page is addressed by the nextid parameter
        url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=NBA" % page_num
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content, "lxml")
        articles = soup.find_all('tr')
        # skip the header row, then pull the fields out of each table row
        for i in articles[1:]:
            td = i.find_all('td')
            title = td[0].text.strip()
            title_url = td[0].a['href']
            author = td[1].text
            author_url = td[1].a['href']
            views = td[2].text
            replies = td[3].text
            date = td[4]['title']
            record = title + '\t' + title_url + '\t' + author + '\t' + \
                author_url + '\t' + views + '\t' + replies + '\t' + date
            with open(file_name, 'a') as p:
                p.write(record.encode('utf-8') + "\n")
    except Exception, e:
        print e
        pass
In [42]:
for page_num in range(10):
    print page_num
    crawler(page_num, '/Users/zhangyixin/Documents/tianya_bbs_threads_list.txt')
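A hedged variant of the cell above (run instead of it rather than after it, since the output file is opened in append mode): pausing a random fraction of a second between list pages, as the post crawler further below also does:
In [ ]:
import random
import time

for page_num in range(10):
    print page_num
    crawler(page_num, '/Users/zhangyixin/Documents/tianya_bbs_threads_list.txt')
    time.sleep(random.random())  # be polite to the server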
In [7]:
import pandas as pd
df = pd.read_csv('/Users/zhangyixin/Documents/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:5]
Out[7]:
In [8]:
len(df)
Out[8]:
In [9]:
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:10]
Out[9]:
In [10]:
def author_crawler(url, file_name):
    try:
        content = urllib2.urlopen(url).read()  # fetch the html of the author page
        soup = BeautifulSoup(content, "lxml")
        link_info = soup.find_all('div', {'class': 'link-box'})
        followed_num, fans_num = [i.a.text for i in link_info]
        try:
            activity = soup.find_all('span', {'class': 'subtitle'})
            post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
        except:
            post_num, reply_num = '1', '0'  # string defaults so the join below works
        record = '\t'.join([url, followed_num, fans_num, post_num, reply_num])
        with open(file_name, 'a') as p:  # Note: append mode, run only once!
            p.write(record.encode('utf-8') + "\n")  # encode to utf-8 to avoid encoding errors
    except Exception, e:
        print e, url
        record = '\t'.join([url, 'na', 'na', 'na', 'na'])
        with open(file_name, 'a') as p:  # Note: append mode, run only once!
            p.write(record.encode('utf-8') + "\n")  # encode to utf-8 to avoid encoding errors
        pass
In [11]:
for k, url in enumerate(df.author_page):
    if k % 50 == 0:
        print k
    author_crawler(url, '/Users/zhangyixin/Documents/tianya_bbs_threads_author_info.txt')
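A minimal sketch (not in the original notebook) for loading the author info written above back into a DataFrame, assuming the same path; the column names follow the record layout in author_crawler:
In [ ]:
ad = pd.read_csv('/Users/zhangyixin/Documents/tianya_bbs_threads_author_info.txt',
                 sep='\t', header=None,
                 names=['author_page', 'followed_num', 'fans_num', 'post_num', 'reply_num'])
ad[:5]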
In [12]:
df.author_page[:5]
Out[12]:
In [13]:
url = df.author_page[1]
content = urllib2.urlopen(url).read()
soup1 = BeautifulSoup(content, "lxml")
In [14]:
user_info = soup1.find('div', {'class': 'userinfo'})('p')
area, nid, freq_use, last_login_time, reg_time = [i.get_text()[4:] for i in user_info]
print area, nid, freq_use, last_login_time, reg_time

link_info = soup1.find_all('div', {'class': 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num

activity = soup1.find_all('span', {'class': 'subtitle'})
post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
print post_num, reply_num
In [15]:
df.link[0]
url = 'http://bbs.tianya.cn' + df.link[0]
url
Out[15]:
In [16]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/post-free-1270973-1.shtml width=1000 height=500></iframe>')
Out[16]:
In [17]:
post = urllib2.urlopen(url).read()
post_soup = BeautifulSoup(post, "lxml")
print (post_soup.prettify())[:1000]
In [18]:
pa = post_soup.find_all('div', {'class': 'atl-item'})
len(pa)
Out[18]:
In [19]:
print pa[0].find('div', {'class': 'bbs-content'}).text.strip()
In [20]:
pa[1].a
Out[20]:
In [21]:
print pa[0].find('a', class_ = 'reportme a-link')
print pa[0].find('a', class_ = 'reportme a-link')['replytime']
print pa[0].find('a', class_ = 'reportme a-link')['author']
In [22]:
for i in pa[:20]:
    p_info = i.find('a', class_='reportme a-link')
    p_time = p_info['replytime']
    p_author_id = p_info['authorid']
    p_author_name = p_info['author']
    p_content = i.find('div', {'class': 'bbs-content'}).text.strip()
    p_content = p_content.replace('\t', '')
    print p_time, '--->', p_author_id, '--->', p_author_name, '--->', p_content, '\n'
In [23]:
post_soup.find('div', {'class': 'atl-pages'})
post_pages = post_soup.find('div', {'class': 'atl-pages'})
post_pages = post_pages.form['onsubmit'].split(',')[-1].split(')')[0]
post_pages
Out[23]:
In [24]:
url = 'http://bbs.tianya.cn' + df.link[0]
url_base = '-'.join(url.split('-')[:-1]) + '-%d.shtml'
url_base
Out[24]:
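A quick illustrative check (not in the original notebook): combining post_pages and url_base from the cells above gives the URL of every page of this thread:
In [ ]:
page_urls = [url_base % i for i in range(1, int(post_pages) + 1)]
page_urls[:3]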
In [25]:
def parsePage(pa):
    records = []
    for i in pa:
        p_info = i.find('a', class_='reportme a-link')
        p_time = p_info['replytime']
        p_author_id = p_info['authorid']
        p_author_name = p_info['author']
        p_content = i.find('div', {'class': 'bbs-content'}).text.strip()
        p_content = p_content.replace('\t', '').replace('\n', '')  # .replace(' ', '')
        record = p_time + '\t' + p_author_id + '\t' + p_author_name + '\t' + p_content
        records.append(record)
    return records

import sys

def flushPrint(s):
    # rewrite the current line in place so progress stays on one line
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
In [26]:
import random
import time

def crawler(url, file_name):
    try:
        # first page of the thread
        url_1 = 'http://bbs.tianya.cn' + url
        content = urllib2.urlopen(url_1).read()
        post_soup = BeautifulSoup(content, "lxml")
        # the page-jump form tells us how many pages the thread has
        post_form = post_soup.find('div', {'class': 'atl-pages'})
        if post_form.form:
            post_pages = post_form.form['onsubmit'].split(',')[-1].split(')')[0]
            post_pages = int(post_pages)
            url_base = '-'.join(url_1.split('-')[:-1]) + '-%d.shtml'
        else:
            post_pages = 1
        pa = post_soup.find_all('div', {'class': 'atl-item'})
        records = parsePage(pa)
        with open(file_name, 'a') as p:
            for record in records:
                p.write('1' + '\t' + url + '\t' + record.encode('utf-8') + "\n")
        # for the 2nd+ pages
        if post_pages > 1:
            for page_num in range(2, post_pages + 1):
                time.sleep(random.random())
                flushPrint(page_num)
                url2 = url_base % page_num
                content = urllib2.urlopen(url2).read()
                post_soup = BeautifulSoup(content, "lxml")
                pa = post_soup.find_all('div', {'class': 'atl-item'})
                records = parsePage(pa)
                with open(file_name, 'a') as p:
                    for record in records:
                        p.write(str(page_num) + '\t' + url + '\t' + record.encode('utf-8') + "\n")
        else:
            pass
    except Exception, e:
        print e
        pass
In [28]:
for k, link in enumerate(df.link):
    flushPrint(link)
    if k % 10 == 0:
        print 'This is the post of: ' + str(k)
    file_name = '/Users/zhangyixin/Documents/tianya_bbs_threads_network.txt'
    crawler(link, file_name)
In [30]:
dtt = []
with open('/Users/zhangyixin/Documents/tianya_bbs_threads_network.txt', 'r') as f:
    for line in f:
        pnum, link, time, author_id, author, content = line.replace('\n', '').split('\t')
        dtt.append([pnum, link, time, author_id, author, content])
len(dtt)
Out[30]:
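A hedged alternative to the loop above that lets pandas do the tab splitting; quoting is disabled because replies may contain quote characters:
In [ ]:
import csv
dt2 = pd.read_csv('/Users/zhangyixin/Documents/tianya_bbs_threads_network.txt',
                  sep='\t', header=None, quoting=csv.QUOTE_NONE,
                  names=['page_num', 'link', 'time', 'author', 'author_name', 'reply'])
len(dt2)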
In [31]:
dt = pd.DataFrame(dtt)
dt[:5]
Out[31]:
In [32]:
dt=dt.rename(columns = {0:'page_num', 1:'link', 2:'time', 3:'author',4:'author_name', 5:'reply'})
dt[:5]
Out[32]:
In [33]:
dt.reply[:100]
Out[33]:
In [ ]: