In [1]:
# -*- coding: utf-8 -*-
# chenchen19951110@sina.com
import threading
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os
import time
'''
Fetch the raw HTML text of a web page.
Parameter: url, the link of the page to open
Returns: the HTML text of the page
'''
class Spider:
    def __init__(self):
        self.headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}

    def get_url_soup(self, url, encoding='gbk'):
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        # r.encoding = 'utf-8'
        response.encoding = encoding
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

    def get_html_text(self, url, encoding='gbk'):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = encoding
            return r.text
        except requests.RequestException:
            return ""

    def parse_table(self, text):
        # Flatten an HTML <table> into a list of non-empty cell texts per row.
        data = []
        table = text.find('table')
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele])
        return data

    def text_trim(self, text):
        # Collapse tags, full-width commas and ideographic spaces into single commas.
        pattern = re.compile(',|<.+?>|\\u3000')
        text = pattern.sub(',', str(text))
        text = re.sub(',+|,+', ',', text)
        text = re.sub('^,|,$', '', text)
        return text

    def write_list_txt(self, data, file_name):
        assert isinstance(data, list)
        assert file_name.endswith('.txt')
        with open(file_name, 'w') as f:
            f.writelines('\n'.join(data))

    def write_txt(self, data, file_name):
        assert isinstance(data, str)
        assert file_name.endswith('.txt')
        with open(file_name, 'w') as f:
            f.write(data)
def getPhantomSoup(url):
    # Render the page with PhantomJS so JavaScript-generated content is available.
    browser = webdriver.PhantomJS()
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    return soup

def get_url_cookie(url, encoding='gbk'):
    # Fetch a page with a logged-in session cookie (needed for VIP chapters).
    headers = {"Accept": "text/css,*/*;q=0.1",
               "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
"Cookie": '__cfduid=d399b76e493b94ca4ea30c25e027b331f1511504324; testcookie=yes; Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1511533259; Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1511533491; clicktype=; need_auth_checked=yexuan2955@sina.com%3B-1%3B1511533299521%3Busername%3Dyexuan2955@sina.com%3Bisneed%3Dfalse%3Bip%3D202.119.46.229%3Bipcount%3D0%3Busernamecount%3D0; nicknameAndsign=2%257E%2529%2524%25E9%25A3%258E%25E8%25BF%2587; token=MzM4NDI0NHwxMzc3YzYyYTRkNmE2ZDA2Y2FiYTZiMzhmYzI5MmIwMXx8eWV4KioqKioqKkBzaW5hLmNvbXwzNzk3NzN8fDF8MzM4NDI0NHx85qyi6L%2BO5oKo77yM5pmL5rGf55So5oi3fDF8ZW1haWw%3D; ispayuser=3384244-1; foreverreader=3384244; sms_total=1'
               }
    r = requests.get(url, headers=headers)
    # r.raise_for_status()
    r.encoding = encoding
    soup = BeautifulSoup(r.text, 'lxml')
    return soup
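Note that getPhantomSoup depends on PhantomJS, whose driver was deprecated in later Selenium 3.x releases and removed in Selenium 4. If PhantomJS is not available, a headless-Chrome variant along these lines should be a workable substitute (a minimal sketch, assuming chromedriver is installed and on the PATH; get_headless_soup is a name introduced here, not part of the original notebook):

from selenium.webdriver.chrome.options import Options

def get_headless_soup(url):
    # Same idea as getPhantomSoup, but rendered with headless Chrome.
    options = Options()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        return BeautifulSoup(browser.page_source, 'html.parser')
    finally:
        browser.quit()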
In [4]:
base_url = "http://www.jjwxc.net/bookbase_slave.php?booktype=&opt=&page=1&endstr=true&orderstr=4"
spider = Spider()
base_soup = spider.get_url_soup(base_url)
In [9]:
table = base_soup.find_all('table', attrs={"class":"cytable"})[0]
In [29]:
pattern_id = re.compile('novelid=([0-9]+)"')
a = table.find_all('a', attrs={"class":"tooltip"})
[(x.string, pattern_id.findall(str(x))[0]) for x in a]
# a[0].string
Out[29]:
In [174]:
spider = Spider()
novel_id = 2368172
url = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(novel_id)
soup = getPhantomSoup(url)
In [6]:
novel_title = soup.title.get_text()
In [27]:
# Requirement 1: description (文案)
des = soup.find_all('div', attrs={'id':"novelintro", 'itemprop':"description"})[0]
result_desp = spider.text_trim(des)
if not os.path.exists('save/' + novel_title + '/task1'):
    os.makedirs('save/' + novel_title + '/task1')
spider.write_txt(result_desp, 'save/' + novel_title + '/task1/description.txt')
In [28]:
# Requirement 1: tags
# soup.find_all('div', attrs={"class":"smallreadbody"})[1]
result_tag = [x.get_text() for x in soup.find_all('font', attrs={"color":"#FF0000"})]
spider.write_list_txt(result_tag, 'save/' + novel_title + '/task1/tags.txt')
In [29]:
# Requirement 1: keywords
result_keywords = soup.find_all('span', attrs={"class":"bluetext"})[0].get_text()
spider.write_txt(result_keywords, 'save/' + novel_title + '/task1/keywords.txt')
In [30]:
# Requirement 1: basic information
def extract_info(text):
    # Strip tags and whitespace, then drop the field label (the first element).
    filter_pattern = re.compile(r'<.+?>|\s')
    text = filter_pattern.sub(',', str(text))
    text = re.sub('^,+|,+$', '', text)
    text = re.sub(',+', ',', text)
    text = re.split(',', text)
    result = ''.join(text[1:])
    return result

info = soup.find_all('ul', attrs={"class":"rightul", "name":"printright"})[0].find_all('li')
result_info = [extract_info(x) for x in info]
# for i, x in enumerate(info):
#     print(i, extract_info(x))
spider.write_list_txt(result_info, 'save/' + novel_title + '/task1/basic_info.txt')
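To make the stripping behaviour concrete: extract_info keeps everything after the field label. The <li> below is a hypothetical example, not copied from the site:

sample = BeautifulSoup('<li><span>文章类型:</span> 原创-言情</li>', 'html.parser').li
print(extract_info(sample))   # expected: 原创-言情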
In [31]:
# Requirement 1: reader tips (地雷) ranking
text = soup.find_all("div", attrs={"id":"ticketsrank_box"})[0]
result_mine = ['\t'.join(x) for x in spider.parse_table(text)]
spider.write_list_txt(result_mine, 'save/' + novel_title + '/task1/mine.txt')
# text
In [34]:
# Requirement 2: chapter information
if not os.path.exists('save/' + novel_title + '/task2'):
    os.makedirs('save/' + novel_title + '/task2')
raw_table = soup.find_all("meta", attrs={"itemprop":"dateModified"})[0]
chapter_table = spider.parse_table(raw_table)[3:]
result_chap_table = ['\t'.join(x) for x in chapter_table]
spider.write_list_txt(result_chap_table, 'save/' + novel_title + '/task2/chapter_table.txt')
In [35]:
# Requirement 2: novel statistics (成绩)
score = soup.find_all("td", attrs={"colspan":"6", "class":"sptd"})[1]
text = score.get_text().strip()
text = re.sub('\n|\u3000', '', text)
result = re.split(r'\s+', text)
spider.write_list_txt(result, 'save/' + novel_title + '/task2/score.txt')
In [123]:
# Requirement 3: comment bodies and client device
comment_url = "http://www.jjwxc.net/comment.php?novelid={}&page=1".format(novel_id)
comment_soup = getPhantomSoup(comment_url)
print(comment_url)
In [124]:
# Requirement 3: comment bodies and client device
def extract_single_reply(soup):
    try:
        info = spider.text_trim(re.findall('<div class="replybody">(.+?)<font color="#ABABAB">', str(soup))[0])
    except Exception:
        info = "error"
    try:
        content = re.findall('</font><br>(.+?)<br><br>', str(soup))[0]
    except Exception:
        content = "error"
    try:
        device = soup.find("font", attrs={"color":"#009900", "size":"2"}).get_text()
    except Exception:
        device = "error"
    return '\t'.join([info, content, device])

def extract_all_reply(soup):
    re_list = soup.find_all('div', attrs={"class":"replybody"})
    result = [extract_single_reply(x) for x in re_list]
    return '\t'.join(result)

def extract_comment(soup):
    target = soup.find("div", attrs={"class":"readbody"})
    try:
        device = target.find("font").get_text()
    except Exception:
        device = "error"
    # re.escape: the device string may contain regex metacharacters such as brackets.
    comment = re.sub(re.escape(device), "", target.get_text().strip())
    try:
        user_name = soup.find('a', attrs={"target":"_blank"}).get_text()
    except Exception:
        user_name = "error"
    try:
        user_id = re.findall('readerid=([0-9]+)$', soup.find('a', attrs={"target":"_blank"})['href'])[0]
    except Exception:
        user_id = "error"
    try:
        time_stamp = re.findall('发表时间:(.+?)<font', str(soup.find("span", attrs={"class":"coltext"})))[0].strip()
    except Exception:
        time_stamp = "error"
    main_result = '\t'.join([user_name, user_id, comment, device, time_stamp])
    if soup.find('div', attrs={"class":"replybody"}):
        main_result = main_result + '\t' + extract_all_reply(soup)
    return main_result

comments = comment_soup.find_all("div", attrs={"id":re.compile("comment_[0-9]{6}")})
In [125]:
for i, x in enumerate(comments):
    print(i, extract_comment(x))
In [178]:
# Requirement 4: attach each chapter's link to chapter_table
raw_table = soup.find_all("meta", attrs={"itemprop":"dateModified"})[0]
chapter_info = raw_table.find_all("tr", attrs={"itemprop":["chapter", "chapter newestChapter"]})
for i, x in enumerate(chapter_info):
    try:
        href = x.a['href']
    except KeyError:
        # Some rows keep the link in the rel attribute rather than href.
        href = x.a['rel'][0]
    # print(href)
    chapter_table[i].append(href)
In [183]:
chapter_table[118]
Out[183]:
In [147]:
def get_text_writer(chap_url):
    # VIP chapter pages need the logged-in cookie; free chapters do not.
    if 'vip' in chap_url:
        chap_soup = get_url_cookie(chap_url)
        chap_text = str(chap_soup.find_all("div", attrs={"class":"noveltext"})[0])
        chap_text = chap_text.split('<div id="show"></div>')[1]
        chap_text = chap_text.split('<div')[0]
        result_text = spider.text_trim(chap_text)
    else:
        chap_soup = spider.get_url_soup(chap_url)
        chap_text = str(chap_soup.find_all("div", attrs={"class":"noveltext"})[0])
        chap_text = chap_text.split('<div style="clear:both;"></div>')[1]
        chap_text = chap_text.split('<div')[0]
        result_text = spider.text_trim(chap_text)
    try:
        # Author's comment block, if present.
        writer_comment = chap_soup.find_all("div", attrs={"class":"readsmall"})[0]
        result_writer = spider.text_trim(writer_comment)
    except IndexError:
        result_writer = "None"
    return result_text, result_writer
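Since the download loop below issues one request per chapter, a thin retry wrapper can keep a long run from dying on a single transient failure. This is a sketch only; the function name, retry count, and delay are assumptions introduced here, not part of the original notebook:

def get_text_writer_retry(chap_url, retries=3, delay=5):
    # Retry get_text_writer on transient failures before giving up on a chapter.
    for attempt in range(retries):
        try:
            return get_text_writer(chap_url)
        except Exception as e:
            print('attempt {} failed for {}: {}'.format(attempt + 1, chap_url, e))
            time.sleep(delay)
    return '', 'None'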
In [195]:
if not os.path.exists('save/' + novel_title + '/main'):
    os.makedirs('save/' + novel_title + '/main')
for i, chap in enumerate(chapter_table):
    chap_title = chap[1]
    chap_main, writer_main = get_text_writer(chap[-1])
    spider.write_list_txt([chap_main.strip(), writer_main], 'save/' + novel_title + '/main/' + chap[0] + chap_title + '.txt')
    print('{} : {} saved!'.format(chap[0], chap_title))
    time.sleep(3)
In [193]:
chapter_table[0]
Out[193]:
In [135]:
# Requirement 6: reader page
user_href = 'http://www.jjwxc.net/onereader.php?readerid=26429798'
user_soup = getPhantomSoup(user_href)
In [137]:
# Requirement 6: recently subscribed works
# user_soup
recent_books = user_soup.find_all("div", attrs={"id":"load_show_vipServer"})[0]
spider.parse_table(recent_books)[1::2]
Out[137]:
In [140]:
# Requirement 6: favourited works
url = 'http://www.jjwxc.net/onereader_ajax.php?readerid=20981786&action=show_novelsa'
# favo_soup = BeautifulSoup(getHTMLText(url), 'html.parser')
favo_soup = spider.get_url_soup(url, encoding='utf-8')
In [141]:
# Flatten the favourite-list rows the same way Spider.parse_table does.
data = []
rows = favo_soup.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])
# favo_soup
data
Out[141]: