In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime
from pprint import pprint
from urllib.parse import urljoin
In [2]:
base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
ptt_today = datetime.now()
ptt_today_str = ptt_today.strftime('%m/%d')
print(ptt_today_str)
In [3]:
# the Gossiping board shows an age-confirmation page first;
# sending the over18 cookie bypasses it
resp_base = requests.get(base_url, cookies={'over18': '1'})
assert resp_base.status_code == 200
soup_base = BeautifulSoup(resp_base.text, 'lxml')
In [4]:
# the newest page is served as plain index.html, and the '‹ 上頁' button
# links to the page just before it, so its number + 1 is the newest page
paging_tag = soup_base.find(class_='btn-group-paging')
total_page = None
for btn_tag in paging_tag.findAll('a'):
    if btn_tag.text == '‹ 上頁':
        compile_page = re.search(r'(\d+)', btn_tag['href'])
        if compile_page:
            total_page = int(compile_page.group(1)) + 1
print('total page =', total_page)
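A quick check of the extraction logic on a sample href (the page number below is made up for illustration):

In [ ]:
sample_href = '/bbs/Gossiping/index39230.html'  # hypothetical page number, for illustration only
match = re.search(r'(\d+)', sample_href)
print(int(match.group(1)) + 1)  # 39231: the number of the newest page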
In [5]:
def crawl_article(url):
    resp = requests.get(url, cookies={'over18': '1'})
    if resp.status_code != 200:
        return None
    soup = BeautifulSoup(resp.text, 'lxml')
    print('Crawling', url)
    # ##############################
    # crawl article
    # ##############################
    article = {
        'author_id': '',
        'author_nickname': '',
        'title': '',
        'timestamp': '',
        'contents': '',
        'ip': ''
    }
    article_body = soup.find(id='main-content')
    # article header
    article_head = article_body.findAll('div', class_='article-metaline')
    for metaline in article_head:
        meta_tag = metaline.find(class_='article-meta-tag').text
        meta_value = metaline.find(class_='article-meta-value').text
        if meta_tag == '作者':
            # meta_value looks like 'author_id (nickname)'
            compile_nickname = re.compile(r'\((.*)\)').search(meta_value)
            article['author_id'] = meta_value.split('(')[0].strip()
            article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''
        elif meta_tag == '標題':
            article['title'] = meta_value
        elif meta_tag == '時間':
            article['timestamp'] = meta_value
    # article content: keep only the bare text nodes directly under
    # main-content (metadata and push comments live inside child tags)
    contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]
    contents = [re.sub('\n', '', expr) for expr in contents]
    contents = [i for i in contents if i]
    article['contents'] = '\n'.join(contents)
    # article publish ip: the first f2 line holds '※ 發信站: ... 來自: <ip>'
    article_ip = article_body.find(class_='f2').text
    compile_ip = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}').search(article_ip)
    article['ip'] = compile_ip.group(0) if compile_ip else ''
    # ##############################
    # crawl comments
    # ##############################
    comments = []
    for comment in article_body.findAll('div', class_='push'):
        tag = comment.find(class_='push-tag').text
        guest_id = comment.find(class_='push-userid').text
        guest_content = comment.find(class_='push-content').text
        guest_ipdatetime = comment.find(class_='push-ipdatetime').text
        compile_ip = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}').search(guest_ipdatetime)
        guest_ip = compile_ip.group(0) if compile_ip else ''
        # the remainder of the ip/datetime field is the comment timestamp
        guest_timestamp = guest_ipdatetime.replace(guest_ip, '').strip()
        comments.append({
            'tag': tag,
            'id': guest_id,
            'content': guest_content,
            'ip': guest_ip,
            'timestamp': guest_timestamp
        })
    article['comments'] = comments
    article['url'] = url
    return article
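`crawl_article` can be smoke-tested on a single post before running the full loop; the URL below is hypothetical, so substitute any real article link from the index page:

In [ ]:
# hypothetical URL, for illustration only; replace with a real article link
test_article = crawl_article('https://www.ptt.cc/bbs/Gossiping/M.1500000000.A.ABC.html')
if test_article:
    pprint(test_article)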
In [6]:
DATE_GREATER = 1
DATE_EQUAL = 0
DATE_LESS = -1

def compare_timestamp_md(src, dest):
    """Compare two 'MM/DD' date strings.

    Returns DATE_GREATER (1) if dest is later than src,
    DATE_EQUAL (0) if they fall on the same day,
    DATE_LESS (-1) if dest is earlier than src.
    """
    date_src = datetime.strptime(src, '%m/%d')
    date_dest = datetime.strptime(dest, '%m/%d')
    if date_dest > date_src:
        return DATE_GREATER
    elif date_dest == date_src:
        return DATE_EQUAL
    else:
        return DATE_LESS
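A quick sanity check of the three return values. Note that `%m/%d` parsing pins both dates to the same placeholder year, so a comparison across a year boundary (e.g. 12/31 against 01/01) gives the wrong answer; for a same-day crawler this only matters on New Year's Day.

In [ ]:
print(compare_timestamp_md('09/28', '09/29'))  # 1  (DATE_GREATER: dest is later)
print(compare_timestamp_md('09/28', '09/28'))  # 0  (DATE_EQUAL)
print(compare_timestamp_md('09/28', '09/27'))  # -1 (DATE_LESS: dest is earlier)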
In [7]:
data = []
for page in range(total_page, 1, -1):
    current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)
    resp_page = requests.get(current_url, cookies={'over18': '1'})
    if resp_page.status_code != 200:
        continue
    soup_page = BeautifulSoup(resp_page.text, 'lxml')
    # ##############################
    # check the first article date
    # ##############################
    container_tag = soup_page.find('div', class_='r-list-container')
    first_article = container_tag.find('div', class_='r-ent')
    first_article_date = first_article.find('div', class_='date').text.strip()
    compare_datetime = compare_timestamp_md(ptt_today_str, first_article_date)
    print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))
    if compare_datetime == DATE_GREATER:
        # first article is dated after today (can happen around a year
        # boundary, since only month/day is compared); skip this page
        continue
    else:
        # only crawl today's articles, stopping at the r-list-sep line
        # that separates the normal list from the pinned posts
        for article_row_tag in container_tag.findChildren('div', recursive=False):
            if 'r-list-sep' in article_row_tag['class']:
                break
            if 'r-ent' in article_row_tag['class']:
                article_date = article_row_tag.find('div', class_='date').text.strip()
                article_date_compare = compare_timestamp_md(ptt_today_str, article_date)
                if article_date_compare != DATE_EQUAL:
                    continue
                article_tag = article_row_tag.find('a', href=True)
                if article_tag is None:
                    # deleted articles keep their row but lose the link
                    continue
                article_url = urljoin(base_url, article_tag['href'])
                article_data = crawl_article(article_url)
                if article_data:
                    data.append(article_data)
    # if the first article on this page is earlier than today,
    # older pages cannot contain today's articles, so stop here
    if compare_datetime == DATE_LESS:
        break

with open('today_articles.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
print('Save - today_articles.json')
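To verify the dump, the file can be read back and summarized:

In [ ]:
with open('today_articles.json', encoding='utf-8') as f:
    saved = json.load(f)
print('articles crawled:', len(saved))
print('comments crawled:', sum(len(article['comments']) for article in saved))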