In [33]:
    
import urllib2
from bs4 import BeautifulSoup
    
In [2]:
    
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX width=1000 height=500></iframe>')
# the webpage we would like to crawl
    
    Out[2]:
In [5]:
    
page_num = 0
url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
content = urllib2.urlopen(url).read() # fetch the HTML text of the page
soup = BeautifulSoup(content, "lxml") 
articles = soup.find_all('tr')
    
In [7]:
    
print articles[0]
    
    
In [8]:
    
print articles[1]
    
    
In [9]:
    
len(articles[1:])
    
    Out[9]:
http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=PX
In [20]:
    
for t in articles[1].find_all('td'): print t
    
    
In [21]:
    
td = articles[1].find_all('td')
    
In [23]:
    
print td[0]
    
    
In [29]:
    
print td[0].text
    
    
In [30]:
    
print td[0].text.strip()
    
    
In [31]:
    
print td[0].a['href']
    
    
In [24]:
    
print td[1]
    
    
In [25]:
    
print td[2]
    
    
In [26]:
    
print td[3]
    
    
In [27]:
    
print td[4]
    
    
In [11]:
    
records = []
for i in articles[1:]:
    td = i.find_all('td')
    title = td[0].text.strip()
    title_url = td[0].a['href']
    author = td[1].text
    author_url = td[1].a['href']
    views = td[2].text
    replies = td[3].text
    date = td[4]['title']
    record = title + '\t' + title_url+ '\t' + author + '\t'+ author_url + '\t' + views+ '\t'  + replies+ '\t'+ date
    records.append(record)
    
In [16]:
    
print records[2]
    
    
In [85]:
    
def crawler(page_num, file_name):
    try:
        # build the listing URL for this page number and fetch it
        url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
        content = urllib2.urlopen(url).read() # fetch the HTML text of the page
        soup = BeautifulSoup(content, "lxml") 
        articles = soup.find_all('tr')
        # write down info
        for i in articles[1:]:
            td = i.find_all('td')
            title = td[0].text.strip()
            title_url = td[0].a['href']
            author = td[1].text
            author_url = td[1].a['href']
            views = td[2].text
            replies = td[3].text
            date = td[4]['title']
            record = title + '\t' + title_url+ '\t' + author + '\t'+ \
                        author_url + '\t' + views+ '\t'  + replies+ '\t'+ date
            with open(file_name, 'a') as p: # note: append mode, run this only once!
                p.write(record.encode('utf-8') + "\n") # encode as utf-8 to avoid encoding errors
    except Exception, e:
        print e
        pass
    
In [97]:
    
# crawl all pages
for page_num in range(10):
    print (page_num)
    crawler(page_num, '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt')
    
    
In [304]:
    
import pandas as pd
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:2]
    
    Out[304]:
In [305]:
    
len(df)
    
    Out[305]:
In [306]:
    
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:2]
    
    Out[306]:
In [307]:
    
len(df.link)
    
    Out[307]:
In [309]:
    
df.author_page[:5]
    
    Out[309]:
In [408]:
    
user_info
    
    Out[408]:
In [413]:
    
# user_info = soup.find('div',  {'class', 'userinfo'})('p')
# user_infos = [i.get_text()[4:] for i in user_info]
            
def author_crawler(url, file_name):
    try:
        content = urllib2.urlopen(url).read() # fetch the HTML text of the page
        soup = BeautifulSoup(content, "lxml")
        link_info = soup.find_all('div', {'class', 'link-box'})
        followed_num, fans_num = [i.a.text for i in link_info]
        try:
            activity = soup.find_all('span', {'class', 'subtitle'})
            post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
        except:
            post_num, reply_num = 1, 0
        record =  '\t'.join([url, followed_num, fans_num, post_num, reply_num])
        with open(file_name, 'a') as p: # note: append mode, run this only once!
            p.write(record.encode('utf-8') + "\n") # encode as utf-8 to avoid encoding errors
    except Exception, e:
        print e, url
        record =  '\t'.join([url, 'na', 'na', 'na', 'na'])
        with open(file_name, 'a') as p: # note: append mode, run this only once!
            p.write(record.encode('utf-8') + "\n") # encode as utf-8 to avoid encoding errors
        pass
    
In [414]:
    
for k, url in enumerate(df.author_page):
    if k % 10==0:
        print k
    author_crawler(url, '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_author_info.txt')
    
    
In [357]:
    
url = df.author_page[1]
content = urllib2.urlopen(url).read() # fetch the HTML text of the page
soup1 = BeautifulSoup(content, "lxml")
    
In [359]:
    
user_info = soup1.find('div', {'class', 'userinfo'})('p')
area, nid, freq_use, last_login_time, reg_time = [i.get_text()[4:] for i in user_info]
print area, nid, freq_use, last_login_time, reg_time 
link_info = soup1.find_all('div', {'class', 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num
    
    
In [393]:
    
activity = soup1.find_all('span', {'class', 'subtitle'})
post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
print post_num, reply_num
    
    
In [386]:
    
print activity[2]
    
    
In [370]:
    
link_info = soup1.find_all('div', {'class', 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num
    
    
In [369]:
    
link_info[0].a.text
    
    Out[369]:
http://www.tianya.cn/50499450/follow
We can also crawl each user's following list and follower (fans) list.
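If we wanted to go further, a minimal sketch of such a follow-list crawler might look like the code below. Note that crawl_follow_list and the profile-URL regular expression are illustrative assumptions; the structure of the follow page is not verified here, so the selector is only a heuristic.

import re
import urllib2
from bs4 import BeautifulSoup

def crawl_follow_list(follow_url):
    # fetch the follow page and keep links that look like user profile URLs,
    # e.g. http://www.tianya.cn/50499450 (heuristic, not verified against the real page)
    content = urllib2.urlopen(follow_url).read()
    soup = BeautifulSoup(content, "lxml")
    users = []
    for a in soup.find_all('a', href=True):
        if re.match(r'http://www\.tianya\.cn/\d+$', a['href']):
            users.append((a.text.strip(), a['href']))
    return users

# usage (hypothetical): crawl_follow_list('http://www.tianya.cn/50499450/follow')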
In [13]:
    
df.link[2]
    
    Out[13]:
In [15]:
    
url = 'http://bbs.tianya.cn' + df.link[2]
url
    
    Out[15]:
In [20]:
    
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/post-free-2848797-1.shtml width=1000 height=500></iframe>')
# the webpage we would like to crawl
    
    Out[20]:
In [18]:
    
post = urllib2.urlopen(url).read() # fetch the HTML text of the page
post_soup = BeautifulSoup(post, "lxml") 
#articles = soup.find_all('tr')
    
In [123]:
    
print (post_soup.prettify())[:1000]
    
    
In [36]:
    
pa = post_soup.find_all('div', {'class', 'atl-item'})
len(pa)
    
    Out[36]:
In [38]:
    
print pa[0]
    
    
In [39]:
    
print pa[1]
    
    
In [40]:
    
print pa[89]
    
    
作者:柠檬在追逐 时间:2012-10-28 21:33:55
@lice5 2012-10-28 20:37:17
作为宁波人 还是说一句:革命尚未成功 同志仍需努力
-----------------------------
对 现在说成功还太乐观,就怕说一套做一套
作者:lice5 时间:2012-10-28 20:37:17
作为宁波人 还是说一句:革命尚未成功 同志仍需努力
4 /post-free-4242156-1.shtml 2014-04-09 15:55:35 61943225 野渡自渡人 @Y雷政府34楼2014-04-0422:30:34 野渡君雄文!支持是必须的。 ----------------------------- @清坪过客16楼2014-04-0804:09:48 绝对的权力导致绝对的腐败! ----------------------------- @T大漠鱼T35楼2014-04-0810:17:27 @周丕东@普欣@拾月霜寒2012@小摸包@姚文嚼字@四號@凌宸@乔志峰@野渡自渡人@曾兵2010@缠绕夜色@曾颖@风青扬请关注
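The quoted replies above follow a recognizable pattern: an '@username timestamp' header, the quoted text, and a dashed separator. If we later wanted to recover who replies to whom (for example to build the reply network saved further below as tianya_bbs_threads_network.txt), a minimal sketch based on that pattern could look like this; the username character class is an assumption inferred from the samples above and may need tuning.

import re

def extract_mentions(reply_text):
    # pull '@username' mentions out of a reply (reply_text should be unicode,
    # as returned by BeautifulSoup's .text); the character class is a guess
    return re.findall(u'@([\u4e00-\u9fa5A-Za-z0-9_]+)', reply_text)

# usage: extract_mentions(u'@lice5 2012-10-28 20:37:17 ...') returns [u'lice5']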
In [118]:
    
print pa[0].find('div', {'class', 'bbs-content'}).text.strip()
    
    
In [119]:
    
print pa[87].find('div', {'class', 'bbs-content'}).text.strip()
    
    
In [104]:
    
pa[1].a
    
    Out[104]:
In [113]:
    
print pa[0].find('a', class_ = 'reportme a-link')
    
    
In [115]:
    
print pa[0].find('a', class_ = 'reportme a-link')['replytime']
    
    
In [114]:
    
print pa[0].find('a', class_ = 'reportme a-link')['author']
    
    
In [122]:
    
for i in pa[:10]:
    p_info = i.find('a', class_ = 'reportme a-link')
    p_time = p_info['replytime']
    p_author_id = p_info['authorid']
    p_author_name = p_info['author']
    p_content = i.find('div', {'class', 'bbs-content'}).text.strip()
    p_content = p_content.replace('\t', '')
    print p_time, '--->', p_author_id, '--->', p_author_name,'--->', p_content, '\n'
    
    
A long thread spans several pages whose URLs differ only in the trailing page number, e.g.:
http://bbs.tianya.cn/post-free-2848797-1.shtml
http://bbs.tianya.cn/post-free-2848797-2.shtml
http://bbs.tianya.cn/post-free-2848797-3.shtml
In [126]:
    
post_soup.find('div', {'class', 'atl-pages'})#['onsubmit']
    
    Out[126]:
In [137]:
    
post_pages = post_soup.find('div', {'class', 'atl-pages'})
post_pages = post_pages.form['onsubmit'].split(',')[-1].split(')')[0]
post_pages
    
    Out[137]:
In [144]:
    
url = 'http://bbs.tianya.cn' + df.link[2]
url_base = '-'.join(url.split('-')[:-1]) + '-%d.shtml'
url_base
    
    Out[144]:
In [415]:
    
def parsePage(pa):
    records = []
    for i in pa:
        p_info = i.find('a', class_ = 'reportme a-link')
        p_time = p_info['replytime']
        p_author_id = p_info['authorid']
        p_author_name = p_info['author']
        p_content = i.find('div', {'class', 'bbs-content'}).text.strip()
        p_content = p_content.replace('\t', '').replace('\n', '')#.replace(' ', '')
        record = p_time + '\t' + p_author_id+ '\t' + p_author_name + '\t'+ p_content
        records.append(record)
    return records
import sys
def flushPrint(s):
    # overwrite the current console line to show crawling progress
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
    
In [246]:
    
url_1 = 'http://bbs.tianya.cn' + df.link[10]
content = urllib2.urlopen(url_1).read() # fetch the HTML text of the page
post_soup = BeautifulSoup(content, "lxml") 
pa = post_soup.find_all('div', {'class', 'atl-item'})
b = post_soup.find('div', class_= 'atl-pages')
b
    
    Out[246]:
In [247]:
    
url_1 = 'http://bbs.tianya.cn' + df.link[0]
content = urllib2.urlopen(url_1).read() # fetch the HTML text of the page
post_soup = BeautifulSoup(content, "lxml") 
pa = post_soup.find_all('div', {'class', 'atl-item'})
a = post_soup.find('div', {'class', 'atl-pages'})
a
    
    Out[247]:
In [251]:
    
a.form
    
    Out[251]:
In [254]:
    
if b.form:
    print 'true'
else:
    print 'false'
    
    
In [32]:
    
import random
import time
def crawler(url, file_name):
    try:
        # build the full post URL and fetch the first page
        url_1 = 'http://bbs.tianya.cn' + url
        content = urllib2.urlopen(url_1).read() # fetch the HTML text of the page
        post_soup = BeautifulSoup(content, "lxml") 
        # how many pages in a post
        post_form = post_soup.find('div', {'class', 'atl-pages'})
        if post_form.form:
            post_pages = post_form.form['onsubmit'].split(',')[-1].split(')')[0]
            post_pages = int(post_pages)
            url_base = '-'.join(url_1.split('-')[:-1]) + '-%d.shtml'
        else:
            post_pages = 1
        # for the first page
        pa = post_soup.find_all('div', {'class', 'atl-item'})
        records = parsePage(pa)
        with open(file_name, 'a') as p: # note: append mode, run this only once!
            for record in records:    
                p.write('1'+ '\t' + url + '\t' + record.encode('utf-8')+"\n") 
        # for the 2nd+ pages
        if post_pages > 1:
            for page_num in range(2, post_pages+1):
                time.sleep(random.random())
                flushPrint(page_num)
                url2 = url_base % page_num
                content = urllib2.urlopen(url2).read() # fetch the HTML text of the page
                post_soup = BeautifulSoup(content, "lxml") 
                pa = post_soup.find_all('div', {'class', 'atl-item'})
                records = parsePage(pa)
                with open(file_name, 'a') as p: # note: append mode, run this only once!
                    for record in records:    
                        p.write(str(page_num) + '\t' +url + '\t' + record.encode('utf-8')+"\n") 
        else:
            pass
    except Exception, e:
        print e
        pass
    
In [182]:
    
url = df.link[2]  # pass the raw link; crawler() prepends 'http://bbs.tianya.cn' itself
file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_test.txt'
crawler(url, file_name)
    
    
In [417]:
    
for k, link in enumerate(df.link):
    flushPrint(link)
    if k % 10 == 0:
        print 'This is post number: ' + str(k)
    file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt'
    crawler(link, file_name)
    
    
In [418]:
    
dtt = []
with open('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt', 'r') as f:
    for line in f:
        pnum, link, time, author_id, author, content = line.replace('\n', '').split('\t')
        dtt.append([pnum, link, time, author_id, author, content])
len(dtt)
    
    Out[418]:
In [419]:
    
dt = pd.DataFrame(dtt)
dt[:5]
    
    Out[419]:
In [420]:
    
dt=dt.rename(columns = {0:'page_num', 1:'link', 2:'time', 3:'author',4:'author_name', 5:'reply'})
dt[:5]
    
    Out[420]:
In [421]:
    
dt.reply[:100]
    
    Out[421]:
http://search.tianya.cn/bbs?q=PX reports 18459 matching items in total
In [14]:
    
18459/50
    
    Out[14]:
In practice the listing runs out at page 10 (http://bbs.tianya.cn/list.jsp?item=free&order=1&nextid=9&k=PX). It turns out that list only covers the Tianya BBS board; there are various other sections as well, such as Tianya Focus (http://focus.tianya.cn/), and so on.
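If other sections were of interest, the list crawler above could in principle be reused by parameterizing the board name ('item') and the keyword ('k') in the list.jsp URL, as in the small sketch below. Only item='free' is verified in this notebook; any other board name is an assumption the reader would need to check.

def list_url(item, keyword, page_num):
    # build the list.jsp URL for a given board, keyword and page;
    # only item='free' has been used above
    return ("http://bbs.tianya.cn/list.jsp?item=%s&nextid=%d&order=8&k=%s"
            % (item, page_num, keyword))

# usage: list_url('free', 'PX', 0)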
In [ ]: