In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup, NavigableString
from pprint import pprint
from urllib.parse import urlencode, urljoin
In [2]:
QUERY_TITLE = '[新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅'
cookies = {'over18': '1'}
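The `over18` cookie is what unlocks the Gossiping board: PTT redirects anonymous requests to an age-confirmation page, and sending `over18=1` skips it. A quick way to see the difference (the exact redirect target is an assumption about PTT's current behavior):

resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html')
print(resp.url)   # without the cookie: likely the /ask/over18 interstitial
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html', cookies=cookies)
print(resp.url)   # with over18=1: the board index itself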
In [3]:
encoding_title = urlencode({'q': QUERY_TITLE})
query = 'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(encoding_title)
print(query)
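Search results are paginated; a `page` parameter selects later pages (the parameter name is taken from PTT's own search URLs, so treat it as an assumption). A sketch for building the first few page URLs:

page_urls = [
    'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(urlencode({'q': QUERY_TITLE, 'page': n}))
    for n in range(1, 4)
]
pprint(page_urls)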
In [4]:
resp_article_list = requests.get(query, cookies=cookies)
soup_article_list = BeautifulSoup(resp_article_list.text, 'lxml')
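A slightly more defensive variant of the same request, in case the search page is temporarily unavailable (a sketch; `raise_for_status` raises on any non-2xx response):

resp_article_list = requests.get(query, cookies=cookies)
resp_article_list.raise_for_status()  # abort early instead of parsing an error page
soup_article_list = BeautifulSoup(resp_article_list.text, 'lxml')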
In [5]:
def crawl_article(url):
    resp = requests.get(url, cookies={'over18': '1'})
    if resp.status_code != 200:
        return None
    soup = BeautifulSoup(resp.text, 'lxml')
    print('Start crawling', url)
    # ##############################
    # crawl article
    # ##############################
    article = {
        'author_id': '',
        'author_nickname': '',
        'title': '',
        'timestamp': '',
        'contents': '',
        'ip': ''
    }
    article_body = soup.find(id='main-content')
    # article header: each metaline pairs a tag ("作者", "標題", "時間") with its value
    article_head = article_body.findAll('div', class_='article-metaline')
    for metaline in article_head:
        meta_tag = metaline.find(class_='article-meta-tag').text
        meta_value = metaline.find(class_='article-meta-value').text
        if meta_tag == '作者':
            # the author line looks like "someid (nickname)"
            compile_nickname = re.compile(r'\((.*)\)').search(meta_value)
            article['author_id'] = meta_value.split('(')[0].strip()
            article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''
        elif meta_tag == '標題':
            article['title'] = meta_value
        elif meta_tag == '時間':
            article['timestamp'] = meta_value
    # article content: the post body is the bare text directly under #main-content,
    # i.e. the NavigableString children between the metalines and the push divs
    contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]
    contents = [re.sub('\n', '', expr) for expr in contents]
    contents = [i for i in contents if i]
    article['contents'] = '\n'.join(contents)
    # article publish ip: the first .f2 line carries the posting IP
    ip_pattern = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
    article_ip = article_body.find(class_='f2').text
    compile_ip = ip_pattern.search(article_ip)
    article['ip'] = compile_ip.group(0) if compile_ip else ''
    # ##############################
    # crawl comments
    # ##############################
    comments = []
    for comment in article_body.findAll('div', class_='push'):
        tag = comment.find(class_='push-tag').text
        guest_id = comment.find(class_='push-userid').text
        guest_content = comment.find(class_='push-content').text
        guest_ipdatetime = comment.find(class_='push-ipdatetime').text
        compile_ip = ip_pattern.search(guest_ipdatetime)
        guest_ip = compile_ip.group(0) if compile_ip else ''
        # plain str.replace here: re.sub would treat the dots in the IP as wildcards
        guest_timestamp = guest_ipdatetime.replace(guest_ip, '').strip()
        comments.append({
            'tag': tag,
            'id': guest_id,
            'content': guest_content,
            'ip': guest_ip,
            'timestamp': guest_timestamp
        })
    article['comments'] = comments
    article['url'] = url
    return article
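Before looping over the whole result list, it is worth trying `crawl_article` on a single post. The URL below is only a placeholder; substitute a real article link from the search results:

sample = crawl_article('https://www.ptt.cc/bbs/Gossiping/M.0000000000.A.000.html')  # hypothetical URL
if sample:
    pprint({key: sample[key] for key in ('author_id', 'title', 'timestamp', 'ip')})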
In [6]:
data = []
for article_line in soup_article_list.findAll('div', class_='r-ent'):
    title_tag = article_line.find('div', class_='title')
    link = title_tag.find('a')
    if link is None:  # deleted posts keep their r-ent row but lose the link
        continue
    article_url = urljoin(resp_article_list.url, link['href'])
    article_data = crawl_article(article_url)
    if article_data:
        data.append(article_data)

with open('search_api_by_title.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
print('Saved - search_api_by_title.json')
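A quick read-back confirms the dump round-trips cleanly and shows how many articles were captured:

with open('search_api_by_title.json', encoding='utf-8') as f:
    saved = json.load(f)
print(len(saved), 'articles saved')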