In [1]:
import re
import requests
import pandas as pd 
import time

from bs4 import BeautifulSoup   
import bs4

In [2]:
def gen_homeURL(start_index):
    """Build the listing-page URL of the NJU_HOME board for a given offset.

    Parameters
    ----------
    start_index
        Value substituted into the ``start`` query parameter of the URL.

    Returns
    -------
    str
        The ``bbsdoc`` listing URL on bbs.nju.edu.cn.
    """
    template = 'http://bbs.nju.edu.cn/bbsdoc?board=NJU_HOME&start={start}&type=doc'
    return template.format(start=start_index)

In [3]:
def check_link(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, which would stall the whole crawl.

    Returns
    -------
    str or None
        The decoded body, or ``None`` when the request fails (connection
        error, timeout, or a non-2xx status). A message is printed on failure.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode using the charset detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt and genuine programming errors.
        print('无法链接服务器!!!', url, exc)
        return None

In [4]:
def get_map(start_index):
    """Fetch one board listing page and extract authors, titles and post URLs.

    Parameters
    ----------
    start_index
        Offset passed to ``gen_homeURL`` to select the listing page.

    Returns
    -------
    tuple of (list, list, list)
        ``(user_list, title_list, url_list)``; the three lists always have the
        same length (at most 20 entries). All three are empty when the page
        could not be downloaded.
    """
    home_url = gen_homeURL(start_index)
    page = check_link(home_url)
    if page is None:
        # check_link reports failure with None; feeding that to BeautifulSoup
        # would raise a TypeError, so return an empty page instead.
        return [], [], []

    def _plain_text(tag):
        # Convert to a plain str: a NavigableString keeps a reference to its
        # whole parse tree, which would otherwise stay alive for every page.
        text = tag.string
        return str(text) if text is not None else tag.get_text()

    soup = BeautifulSoup(page, 'lxml')
    # The first four anchors are skipped; the remainder is read as alternating
    # (author, post) anchors -- NOTE(review): layout assumption, confirm
    # against the live page.
    anchors = soup.find_all('a')[4:]

    user_list, title_list, url_list = [], [], []
    # zip() pairs the anchors so an odd anchor count cannot produce lists of
    # different lengths.
    for user_tag, post_tag in zip(anchors[0:40:2], anchors[1:41:2]):
        href = post_tag.get('href')
        if href is None:
            # No link target means no article to fetch; skip the whole pair.
            continue
        user_list.append(_plain_text(user_tag))
        title_list.append(_plain_text(post_tag))
        url_list.append('http://bbs.nju.edu.cn/' + href)
    return user_list, title_list, url_list

In [16]:
def get_article(url, timeout=10):
    """Download an article page and return the text of its first ``<textarea>``.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server; prevents one stalled request from
        blocking the crawl indefinitely.

    Returns
    -------
    str
        Content of the first ``<textarea>`` element as a plain string.

    Raises
    ------
    IndexError
        If the page contains no ``<textarea>`` element.
    requests.RequestException
        If the download fails or times out.
    """
    # verify=False (certificate checks disabled) is kept from the original call.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    articles = soup.find_all('textarea')
    first = articles[0]
    content = first.string
    if content is None:
        # .string is None when the element has several children; fall back to
        # the concatenated text rather than returning None.
        return first.get_text()
    # Return a plain str so the result does not hold on to the parse tree.
    return str(content)

In [17]:
# Crawl the board in two passes: first collect (user, title, url) triples from
# the listing pages at offsets 20, 40, ..., 1800, then download every article.
start_num = range(20, 1801, 20)
user_list = []
title_list = []
url_list = []
content_list = []

# Pass 1: listing pages.
for i, start_index in enumerate(start_num):
    users, titles, urls = get_map(start_index)
    user_list.extend(users)
    title_list.extend(titles)
    url_list.extend(urls)
    if (i+1) % 10 == 0:
        print('fetching url list, %d of %d finished!, size = %d' % (i+1, len(start_num), len(user_list)))
    # Pause between requests to avoid hammering the server.
    time.sleep(0.5)

# Pass 2: article bodies, one request per collected URL.
for i, url in enumerate(url_list):
    try:
        text = get_article(url)
    except Exception:
        # Best effort: a failed download gets a placeholder so the columns stay
        # aligned. ``except Exception`` (not a bare ``except``) lets a kernel
        # interrupt still stop this long-running loop.
        text = '空'
    content_list.append(text)
    if (i+1) % 100 == 0:
        print('fetching articles, %d of %d finished!, size = %d' % (i+1, len(url_list), len(content_list)))
    time.sleep(0.5)

# Assemble the table with a fixed column order.
result = pd.DataFrame({'title':title_list, 'user':user_list, 'url':url_list, 'content':content_list},
                  columns = ['title', 'user', 'url', 'content'])


fetching url list, 10 of 90 finished!, size = 200
fetching url list, 20 of 90 finished!, size = 400
fetching url list, 30 of 90 finished!, size = 600
fetching url list, 40 of 90 finished!, size = 800
fetching url list, 50 of 90 finished!, size = 1000
fetching url list, 60 of 90 finished!, size = 1200
fetching url list, 70 of 90 finished!, size = 1400
fetching url list, 80 of 90 finished!, size = 1600
fetching url list, 90 of 90 finished!, size = 1800
fetching articles, 100 of 1800 finished!, size = 100
fetching articles, 200 of 1800 finished!, size = 200
fetching articles, 300 of 1800 finished!, size = 300
fetching articles, 400 of 1800 finished!, size = 400
fetching articles, 500 of 1800 finished!, size = 500
fetching articles, 600 of 1800 finished!, size = 600
fetching articles, 700 of 1800 finished!, size = 700
fetching articles, 800 of 1800 finished!, size = 800
fetching articles, 900 of 1800 finished!, size = 900
fetching articles, 1000 of 1800 finished!, size = 1000
fetching articles, 1100 of 1800 finished!, size = 1100
fetching articles, 1200 of 1800 finished!, size = 1200
fetching articles, 1300 of 1800 finished!, size = 1300
fetching articles, 1400 of 1800 finished!, size = 1400
fetching articles, 1500 of 1800 finished!, size = 1500
fetching articles, 1600 of 1800 finished!, size = 1600
fetching articles, 1700 of 1800 finished!, size = 1700
fetching articles, 1800 of 1800 finished!, size = 1800

In [20]:
print('save result to excel...')
# Use ExcelWriter as a context manager: the workbook is written and closed on
# exit. ``writer.save()`` and the ``encoding`` argument of ``to_excel`` were
# deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('fetch_data.xlsx') as writer:
    result.to_excel(writer, sheet_name='bbs', index=False)
print('task done.')


save result to excel...
task done.

In [23]:
# Print the first article with every non-word character removed, then display
# the raw text. The pattern is a raw string: '\W' in a normal string literal is
# an invalid escape sequence and triggers a warning on current Python versions.
print(re.sub(r'\W', '', result.content[0]))
result.content[0]


发信人dreamfly宁静致远信区NJU_HOME本篇人气772标题南大和园仙林三室两厅毛坯房出租发信站南京大学小百合站TueMar11640122011南大和园仙林三室两厅110平米毛坯房出租有意者请站内联系谢谢134m来源南京大学小百合站httpbbsnjueducnFROM202119569m
Out[23]:
'发信人: dreamfly (宁静致远), 信区: NJU_HOME. 本篇人气: 772\n标  题: 南大和园(仙林)三室两厅毛坯房出租\n发信站: 南京大学小百合站 (Tue Mar  1 16:40:12 2011)\n\n\r\n    南大和园(仙林)三室两厅(110平米)毛坯房出租,有意者请站内联系,谢谢。\r\n\r\n\n--\n\n[1;34m※ 来源:.南京大学小百合站 http://bbs.nju.edu.cn [FROM: 202.119.56.9][m\n'