In [129]:
import re
import requests
import pandas as pd
import time
In [56]:
def filter_userid(text):
    # Collect every <a href=blogdoc?userid=...> anchor on the search-result page
    pattern = re.compile(r'<a href=blogdoc\?userid=.+?>')
    id_list = pattern.findall(text)
    return id_list

def trim_id(long_id):
    # Strip the surrounding markup so only the bare user id remains
    temp_id = re.sub(r'<a href=blogdoc\?userid=', '', long_id)
    result_id = re.sub(r'>$', '', temp_id)
    return result_id
def get_users(title):
    # Search the blog index for blogs whose title matches `title`
    url = 'http://bbs.nju.edu.cn/blogfind?type=1'
    headers = {
        "Host": "bbs.nju.edu.cn",
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://bbs.nju.edu.cn/blogfind",
        "Cookie": "FOOTKEY=453552640",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    data = {
        "userid": "",
        "bname": "",
        "btitle": title
    }
    try:
        content = requests.post(url, data=data, headers=headers).text
    except Exception as e:
        # Bail out early so `content` is never referenced undefined
        print(e)
        return []
    # Deduplicate the matched anchors, then trim them down to bare ids
    id_set = set(filter_userid(content))
    id_list = list(map(trim_id, id_set))
    return id_list
a = get_users('math')
a
Out[56]:
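A quick sanity check of the two helpers on a hand-written anchor tag (the sample string below is made up to mirror the board's markup, not captured from a live page):

sample = '<a href=blogdoc?userid=levlandau>'
filter_userid(sample)                 # -> ['<a href=blogdoc?userid=levlandau>']
trim_id(filter_userid(sample)[0])     # -> 'levlandau'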
In [128]:
# test_id = '7866321'
test_id = 'levlandau'

def get_data_by_id(userID):
    # Build the blog index URL for one user
    url = 'http://bbs.nju.edu.cn/blogdoc?userid={}'.format(userID)
    r = requests.get(url, verify=False)
    return r.text
test = get_data_by_id(test_id)
'''
Helpers to parse the title, link, and user from a blog index page
'''
def parse_title(text):
    # Drop the anchor markup (and all non-word characters) to leave only the title
    title = re.sub(r'<td><a href=blogcon\?userid=.+?&file=[0-9]+?>|</a>|\W', '', text)
    return title

def parse_fileIndex(text):
    # Pull the numeric file index out of the &file=... query parameter
    file_index = re.findall(r'&file=[0-9]+?>', text)[0]
    file_index = re.findall(r'[0-9]+', file_index)[0]
    return file_index

def get_content(user_id, file_index):
    # Fetch the full HTML of a single blog post
    url = 'http://bbs.nju.edu.cn/blogcon?userid={}&file={}'.format(user_id, file_index)
    response = requests.get(url, verify=False)
    return response.text
def extract_user_data(user_id):
    # Collect every post (title + file index) listed on a user's blog page
    raw_text = get_data_by_id(user_id)
    pattern = re.compile(r'<td><a href=blogcon\?userid={}&file=[0-9]+?>.+?</a>'.format(user_id))
    text_list = pattern.findall(raw_text)
    title_list = list(map(parse_title, text_list))
    fileIndex_list = list(map(parse_fileIndex, text_list))
    # content_list = [get_content(user_id, fileIndex) for fileIndex in fileIndex_list]
    result = pd.DataFrame({
        'user_id': user_id,
        'title': title_list,
        'file_index': fileIndex_list},
        # 'content': content_list},
        columns=['user_id', 'title', 'file_index'])
    return result
result_table = extract_user_data(test_id)
result_table
Out[128]:
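To make the regex helpers concrete, here is what they yield on a hand-written row (the HTML below is an invented sample shaped like the pattern in extract_user_data):

row = '<td><a href=blogcon?userid=levlandau&file=1234567890>My first post</a>'
parse_title(row)        # -> 'Myfirstpost'  (\W in the pattern strips spaces as well)
parse_fileIndex(row)    # -> '1234567890'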
In [133]:
# get_content(result_table.user_id[0], result_table.file_index[0])
result_list = []
for i in range(result_table.shape[0]):
    content = get_content(result_table.user_id[i], result_table.file_index[i])
    result_list.append(content)
    print(i)
    time.sleep(1)  # pause between requests so the board is not hammered
result_table['content'] = result_list
result_table
Out[133]:
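The loop above issues one request per row and dies on the first dropped connection. A minimal retry wrapper, assuming three attempts and a doubling back-off are acceptable (both numbers are arbitrary choices, not part of the original run):

def get_content_with_retry(user_id, file_index, attempts=3):
    # Retry transient network failures a few times before giving up
    for n in range(attempts):
        try:
            return get_content(user_id, file_index)
        except requests.RequestException as e:
            print('attempt {} failed: {}'.format(n + 1, e))
            time.sleep(2 ** n)  # back off: 1s, 2s, 4s
    return ''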
In [139]:
result_table.content[0]
Out[139]:
In [138]:
result_table.content[1]
Out[138]:
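Each content cell still holds the raw HTML of the post page. A crude way to reduce it to plain text, assuming a tag-stripping regex is good enough for this board's simple markup (a real parser such as BeautifulSoup would be more robust):

def strip_tags(html):
    # Remove anything that looks like an HTML tag
    return re.sub(r'<.+?>', '', html)

# optional: result_table['content'] = result_table['content'].map(strip_tags)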
In [136]:
print('save result to excel...')
# Let the context manager handle saving and closing the workbook
with pd.ExcelWriter('/home/da/spyder_result.xlsx') as writer:
    result_table.to_excel(writer, sheet_name='bbs', index=False)
print('task done.')
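If the machine has no Excel writer engine installed, the same table can be dumped as UTF-8 CSV instead; the path is only an example:

result_table.to_csv('/home/da/spyder_result.csv', encoding='utf-8', index=False)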