# URL https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html
# BACKUP https://afuntw.github.io/Test-Crawling-Website/pages/ptt/M.1537847530.A.E12.html
# In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup, NavigableString
from pprint import pprint
# In [2]:
# Target article on PTT's Gossiping board (an over-18 gated board).
ARTICLE_URL = 'https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html'
# In [3]:
# First attempt WITHOUT the age-check cookie: the Gossiping board is gated
# behind an "over 18" confirmation, so this prints the gate page, not the
# article (kept to demonstrate why the cookie below is needed).
resp = requests.get(ARTICLE_URL)
if resp.status_code == 200:
    print(resp.text)
# In [4]:
# Sending the 'over18' cookie tells PTT the age gate was already accepted,
# so this request receives the actual article HTML.
cookies = {'over18': '1'}
resp = requests.get(ARTICLE_URL, cookies=cookies)
if resp.status_code == 200:
    print(resp.text)
# In [5]:
# Parse the fetched article HTML once, using the lxml parser backend.
soup = BeautifulSoup(resp.text, 'lxml')
# In [6]:
# Collect the article's metadata, body text and posting IP into one dict.
article = {
    'author_id': '',
    'author_nickname': '',
    'title': '',
    'timestamp': '',
    'contents': '',
    'ip': ''
}
article_body = soup.find(id='main-content')

# --- article header ---
# Each metaline pairs a tag ("作者" author / "標題" title / "時間" time)
# with its value; author values look like "someid (nickname)".
article_head = article_body.findAll('div', class_='article-metaline')
for metaline in article_head:
    meta_tag = metaline.find(class_='article-meta-tag').text
    meta_value = metaline.find(class_='article-meta-value').text
    if meta_tag == '作者':
        # Raw string: '(' and ')' are regex metacharacters being escaped.
        compile_nickname = re.compile(r'\((.*)\)').search(meta_value)
        article['author_id'] = meta_value.split('(')[0].strip(' ')
        article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''
    elif meta_tag == '標題':
        article['title'] = meta_value
    elif meta_tag == '時間':
        article['timestamp'] = meta_value

# --- article content ---
# The bare text nodes directly under #main-content are the post body;
# tag children (metalines, pushes, the f2 footer) are skipped.
contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]
contents = [re.sub(r'\n', '', expr) for expr in contents]
contents = [i for i in contents if i]
contents = '\n'.join(contents)
article['contents'] = contents

# --- article publish ip ---
# The .f2 footer line contains the poster's IPv4 address; raw string so
# '\.' matches a literal dot.
article_ip = article_body.find(class_='f2').text
compile_ip = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}').search(article_ip)
article['ip'] = compile_ip.group(0) if compile_ip else ''
pprint(article)
# In [7]:
# Collect every push (comment) under the article: tag, commenter id,
# content, commenter ip and timestamp.
comments = []
# Compile once, outside the loop; raw string so '\.' is a literal dot.
ip_pattern = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
for comment in article_body.findAll('div', class_='push'):
    tag = comment.find(class_='push-tag').text
    guest_id = comment.find(class_='push-userid').text
    guest_content = comment.find(class_='push-content').text
    guest_ipdatetime = comment.find(class_='push-ipdatetime').text
    match_ip = ip_pattern.search(guest_ipdatetime)
    guest_ip = match_ip.group(0) if match_ip else ''
    # str.replace, NOT re.sub: the ip itself contains '.' which re would
    # treat as a metacharacter and could strip unintended characters.
    guest_timestamp = guest_ipdatetime.replace(guest_ip, '').strip()
    comments.append({
        'tag': tag,
        'id': guest_id,
        'content': guest_content,
        'ip': guest_ip,
        'timestamp': guest_timestamp
    })
pprint(comments)
# In [8]:
# Attach the comments to the article and dump everything to a JSON file.
# 'w' (not 'w+'): the file is only written, never read back here.
# ensure_ascii=False keeps the Chinese text readable in the output file.
article['comments'] = comments
data = [article]
with open('M.1537847530.A.E12.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)