In [129]:
import re
import requests
import pandas as pd 
import time

In [56]:
def filter_userid(text):
    """Find every blog-directory anchor tag in a page of HTML.

    Args:
        text: HTML text to scan.

    Returns:
        A list of the raw matched tags, each of the form
        ``<a href=blogdoc?userid=NAME>``, in order of appearance.
        Duplicates are kept; an empty list is returned when nothing matches.
    """
    # Raw string: "\?" is not a valid Python string escape, so the non-raw
    # spelling triggers an invalid-escape warning on current interpreters.
    # The lazy ".+?" stops at the first ">" so each match covers one tag.
    pattern = re.compile(r'<a href=blogdoc\?userid=.+?>')
    return pattern.findall(text)

def trim_id(long_id):
    """Reduce a ``<a href=blogdoc?userid=NAME>`` tag to the bare ``NAME``.

    Args:
        long_id: A tag string such as the ones returned by ``filter_userid``.

    Returns:
        The input with every ``<a href=blogdoc?userid=`` occurrence removed
        and one trailing ``>`` dropped. Strings without those pieces are
        returned unchanged.
    """
    # Raw strings keep the regex escape "\?" out of Python's own
    # string-escape processing.
    temp_id = re.sub(r'<a href=blogdoc\?userid=', '', long_id)
    return re.sub(r'>$', '', temp_id)

def get_users(title, timeout=10):
    """Post a title query to the BBS ``blogfind`` endpoint and collect user ids.

    Args:
        title: Text sent as the ``btitle`` form field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of the unique user ids found in the response, in order of
        first appearance. An empty list is returned (after printing the
        error) when the HTTP request fails.
    """
    url = 'http://bbs.nju.edu.cn/blogfind?type=1'
    # Content-Length is intentionally not set by hand: requests computes it
    # from the encoded form body, and a fixed value is wrong for most titles.
    # NOTE(review): the Cookie value is hard-coded; confirm it is still
    # required and safe to keep in the notebook.
    headers = {
        "Host": "bbs.nju.edu.cn",
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://bbs.nju.edu.cn/blogfind",
        "Cookie": "FOOTKEY=453552640",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    data = {
        "userid": "",
        "bname": "",
        "btitle": title
    }
    try:
        content = requests.post(url, data=data, headers=headers, timeout=timeout).text
    except requests.RequestException as e:
        # Without a response there is nothing to parse; report and bail out
        # instead of falling through to an unbound ``content``.
        print(e)
        return []
    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs on the same page give the same list order.
    unique_tags = dict.fromkeys(filter_userid(content))
    return [trim_id(tag) for tag in unique_tags]

# Example query: collect the user ids returned for the title search "math"
# and display the resulting list.
a = get_users('math') 
a


Out[56]:
['wbsxyph',
 'vitanova',
 'phoenixlwei',
 'honney',
 'cosly',
 'EthelC',
 'huanchen',
 'Davenport',
 'wenweihu']

In [128]:
# Sample user id used by the cells below.
# Alternative sample value, currently disabled:
# test_id = '7866321'
test_id = 'levlandau'

def get_data_by_id(userID, timeout=10):
    """Download the ``blogdoc`` page for one user id.

    Args:
        userID: Value sent as the ``userid`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to text by ``requests``.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # Pass the id through ``params`` so requests percent-encodes it instead
    # of pasting it verbatim into the query string.
    r = requests.get(
        'http://bbs.nju.edu.cn/blogdoc',
        params={'userid': userID},
        # NOTE(review): the URL is plain HTTP, so verify=False only matters
        # if the server redirects to HTTPS; confirm whether it is still needed.
        verify=False,
        timeout=timeout,
    )
    return r.text
# Fetch the blogdoc page for the sample user id; `test` is not referenced
# again in the visible cells.
test = get_data_by_id(test_id)

# The bare string below serves as a section note. In English:
# "Used to parse the title, link, and user."
'''
用于解析标题,链接,用户
'''

def parse_title(text):
    """Strip the link markup and all non-word characters from a post link.

    Args:
        text: A fragment of the form
            ``<td><a href=blogcon?userid=USER&file=N>TITLE</a>``.

    Returns:
        The title with the opening link tag, the ``</a>`` tag and every
        non-word character (whitespace, punctuation, brackets) removed.
    """
    # Raw string so "\?" and "\W" reach the regex engine untouched instead of
    # being treated as (invalid) Python string escapes.
    return re.sub(r'<td><a href=blogcon\?userid=.+?&file=[0-9]+?>|</a>|\W', '', text)

def parse_fileIndex(text):
    """Extract the numeric ``file`` query value from a post link.

    Args:
        text: A fragment containing ``&file=DIGITS>``.

    Returns:
        The digits of the first ``&file=DIGITS>`` occurrence, as a string.

    Raises:
        IndexError: If ``text`` contains no ``&file=DIGITS>`` marker.
    """
    # One pass with a capture group replaces the former match-then-rescan.
    match = re.search(r'&file=([0-9]+)>', text)
    if match is None:
        # Same exception type as before, now with a message naming the input.
        raise IndexError('no "&file=<digits>>" marker found in {!r}'.format(text))
    return match.group(1)

def get_content(user_id, file_index, timeout=10):
    """Download the ``blogcon`` page for one user id / file index pair.

    Args:
        user_id: Value sent as the ``userid`` query parameter.
        file_index: Value sent as the ``file`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to text by ``requests``.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # ``params`` lets requests percent-encode both values; the dict order
    # keeps the query as ``userid=...&file=...``.
    response = requests.get(
        'http://bbs.nju.edu.cn/blogcon',
        params={'userid': user_id, 'file': file_index},
        # NOTE(review): the URL is plain HTTP, so verify=False only matters
        # if the server redirects to HTTPS; confirm whether it is still needed.
        verify=False,
        timeout=timeout,
    )
    return response.text

def extract_user_data(user_id):
    """Build a table of the post links found on one user's ``blogdoc`` page.

    Args:
        user_id: User id whose page is downloaded and scanned.

    Returns:
        A ``pandas.DataFrame`` with the columns ``user_id``, ``title`` and
        ``file_index`` and one row per matched post link. ``title`` and
        ``file_index`` come from ``parse_title`` and ``parse_fileIndex``.
    """
    raw_text = get_data_by_id(user_id)
    # Escape the id so any regex metacharacter in it is matched literally.
    pattern = re.compile(
        r'<td><a href=blogcon\?userid={}&file=[0-9]+?>.+?</a>'.format(
            re.escape(str(user_id))
        )
    )
    text_list = pattern.findall(raw_text)
    title_list = [parse_title(link) for link in text_list]
    fileIndex_list = [parse_fileIndex(link) for link in text_list]

    # The scalar user_id is broadcast to every row by the DataFrame constructor.
    return pd.DataFrame(
        {
            'user_id': user_id,
            'title': title_list,
            'file_index': fileIndex_list,
        },
        columns=['user_id', 'title', 'file_index'],
    )
    
# Build the post table for the sample user id and display it.
result_table = extract_user_data(test_id)
result_table


Out[128]:
user_id title file_index
0 levlandau 转载南大和园朝南卧室出租 1426070997
1 levlandau 转载出租仙林和园朝南朝北房间各一间 1426071361
2 levlandau 转载和园好房出租一天起租 1426758181
3 levlandau 转载和园精装房出租一天起租 1426940366
4 levlandau 转载主卧出租精装修南大和园 1426940578
5 levlandau 转载出租仙林南大和园单间400元 1426940665
6 levlandau 转载出租仙林南大和园单间400元 1426940739

In [133]:
# Download the page of every post listed in result_table, pausing one second
# after each request, then attach the pages as a new 'content' column.
result_list = []
# Iterate over the column values directly rather than looking rows up by the
# label i, which only works while the index is exactly 0..n-1.
for i, (user_id, file_index) in enumerate(
        zip(result_table['user_id'], result_table['file_index'])):
    content = get_content(user_id, file_index)
    result_list.append(content)
    print(i)  # progress counter
    time.sleep(1)

result_table['content'] = result_list
result_table


0
1
2
3
4
5
6
Out[133]:
user_id title file_index content
0 levlandau 转载南大和园朝南卧室出租 1426070997 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...
1 levlandau 转载出租仙林和园朝南朝北房间各一间 1426071361 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...
2 levlandau 转载和园好房出租一天起租 1426758181 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...
3 levlandau 转载和园精装房出租一天起租 1426940366 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...
4 levlandau 转载主卧出租精装修南大和园 1426940578 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...
5 levlandau 转载出租仙林南大和园单间400元 1426940665 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...
6 levlandau 转载出租仙林南大和园单间400元 1426940739 <html><head>\n<meta HTTP-EQUIV="Content-Type" ...

In [139]:
# Show the raw page text stored in the 'content' column under index label 0.
result_table.content[0]


Out[139]:
'<html><head>\n<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=gb2312">\n<link rel=stylesheet type=text/css href="/images/bbs.css?Net_3">\n</head><script src="/js/bbs.js?Net_9"></script>\n<center><br>\n<font color=red style=\'font-size:18px\'><b><a href=bbsqry?userid=levlandau>levlandau</a>的blog阅读</b></font><br><br>\n\n[<a href=blogdoc?userid=levlandau>levlandau的blog目录</a>]\n[<a href=bbspstmail?userid=levlandau&title="没主题" >给作者写信</a>]\n[<a href=blogcomment?userid=levlandau&file=1426070997>发表评论</a>]\n(<a href=blogcocon?userid=levlandau&file=1426070997>现有评论0条</a>)\n<br><br>\n<script>Net.User.init({WHEEL:0,FACE:1})</script><table width=610 class=main><tr><td id=NET_1 width=600><textarea id=NET-1>作  者: [uid]levlandau[/uid]\n标  题: [转载] 南大和园朝南卧室出租\n时  间: Wed Mar 11 18:49:57 2015\n点  击: 66\n\n\x1b[37;1m【 以下文字转载自 \x1b[32mNJ_HOUSE \x1b[37m讨论区 】\n\x1b[37;1m【 原文由 \x1b[32mxlstoneage \x1b[37m所发表 】\x1b[m\n\n\n\r\n4月5日起可以入住,房租550,是朝南的一间小卧室,带小阳台,可以晒到满满的阳光哦。\r\n房子里有洗衣机、热水器、空调一应俱全。卫生间是蹲坑很卫生哦。目前房子里租客很少\r\n且都是学生党,只有三个房间有人住。大家都很爱干净而且修养很好很安静。欢迎来看房\r\n,联系13851668873即可。\r\n\r\n我本来是租到5月4日的,因为突然有事,4月初就要走了,所以这一个月希望有人可以租掉\r\n它哦,然后如果要续租我可以直接帮你跟房东联系,当然你也可以系自己联系啦。\r\n\r\n房东人超好的,我超级推荐租他的房子哦,房东老婆很可爱,当初我一个人睡在房子里的\r\n时候很害怕,房东老婆还来陪我睡了一阵子直到有人来住,这样的房东还有话说吗,绝对\r\n不是打广告,你看见我就知道我是不会说谎的人!\r\n\r\n\n--\n\n\x1b[1;37m※ 来源:.南京大学小百合站 http://bbs.nju.edu.cn [FROM: 114.212.118.116]\x1b[m\n\n--\n\n\x1b[1;31m※ 来源:.南京大学小百合站 http://bbs.nju.edu.cn [FROM: 172.26.67.248]\x1b[m\n </textarea></table><script>Net.Html.make(1)</script><br><br>\n[<a href=blogcon?userid=levlandau&file=1426071361>下一篇</a>]\n[<a href=blogdoc?userid=levlandau>levlandau的blog目录</a>]\n<br>[<a href=bbspstmail?userid=levlandau&title=问候>给作者写信</a>]\n[<a href=blogcomment?userid=levlandau&file=1426070997>发表评论</a>]\n(现有评论0条)\n[<a href=blog2b?userid=levlandau&file=1426070997>转载</a>]\n[<a href=blog2m?userid=levlandau&file=1426070997>转寄</a>]\n'

In [138]:
# Show the raw page text stored in the 'content' column under index label 1.
result_table.content[1]


Out[138]:
'<html><head>\n<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=gb2312">\n<link rel=stylesheet type=text/css href="/images/bbs.css?Net_3">\n</head><script src="/js/bbs.js?Net_9"></script>\n<center><br>\n<font color=red style=\'font-size:18px\'><b><a href=bbsqry?userid=levlandau>levlandau</a>的blog阅读</b></font><br><br>\n\n[<a href=blogdoc?userid=levlandau>levlandau的blog目录</a>]\n[<a href=bbspstmail?userid=levlandau&title="没主题" >给作者写信</a>]\n[<a href=blogcomment?userid=levlandau&file=1426071361>发表评论</a>]\n(<a href=blogcocon?userid=levlandau&file=1426071361>现有评论0条</a>)\n<br><br>\n<script>Net.User.init({WHEEL:0,FACE:1})</script><table width=610 class=main><tr><td id=NET_1 width=600><textarea id=NET-1>作  者: [uid]levlandau[/uid]\n标  题: [转载] 出租仙林和园朝南,朝北房间各一间\n时  间: Wed Mar 11 18:56:01 2015\n点  击: 55\n\n\x1b[37;1m【 以下文字转载自 \x1b[32mNJ_HOUSE \x1b[37m讨论区 】\n\x1b[37;1m【 原文由 \x1b[32m528528 \x1b[37m所发表 】\x1b[m\n\n\n\r\n该房离地铁口近,安静采光好,房间大,基本设施全,热水器,洗衣机,网络。房内其他\r\n租客素质很好。\r\n\r\n要求:考研学生最佳\r\n\r\n租金:朝南  550元/月\r\n\r\n      朝北  400元/月\r\n\r\n      押1付3\r\n\r\n入住时间:随时\r\n\r\n联系方式:梁老师  1385 158 1558\n--\n\n\x1b[1;37m※ 来源:.南京大学小百合站 http://bbs.nju.edu.cn [FROM: 60.12.76.108]\x1b[m\n\n--\n\n\x1b[1;33m※ 来源:.南京大学小百合站 http://bbs.nju.edu.cn [FROM: 172.26.67.248]\x1b[m\n </textarea></table><script>Net.Html.make(1)</script><br><br>\n[<a href=blogcon?userid=levlandau&file=1426070997>上一篇</a>]\n[<a href=blogcon?userid=levlandau&file=1426758181>下一篇</a>]\n[<a href=blogdoc?userid=levlandau>levlandau的blog目录</a>]\n<br>[<a href=bbspstmail?userid=levlandau&title=问候>给作者写信</a>]\n[<a href=blogcomment?userid=levlandau&file=1426071361>发表评论</a>]\n(现有评论0条)\n[<a href=blog2b?userid=levlandau&file=1426071361>转载</a>]\n[<a href=blog2m?userid=levlandau&file=1426071361>转寄</a>]\n'

In [136]:
# Write result_table to an Excel workbook (sheet "bbs", without the index).
print('save result to excel...')
# The context manager saves and closes the workbook on exit. Both
# ExcelWriter.save() and the to_excel(encoding=...) argument have been removed
# from current pandas, so neither is used here.
# NOTE(review): the output path is an absolute, machine-specific location.
with pd.ExcelWriter('/home/da/spyder_result.xlsx') as writer:
    result_table.to_excel(writer, sheet_name='bbs', index=False)
print('task done.')


save result to excel...
task done.