Crawl all articles posted so far today

https://www.ptt.cc/bbs/Gossiping/index.html


In [1]:
import requests
import re
import json

from bs4 import BeautifulSoup, NavigableString
from datetime import datetime
from pprint import pprint
from urllib.parse import urljoin

In [2]:
base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
ptt_today = datetime.now()
ptt_today_str = ptt_today.strftime('%m/%d')
print(ptt_today_str)


09/27

Get the total page count

Extract page number n-1 from the ‹ 上頁 (previous page) button in the HTML, then add one to that number to get the total page count.
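For reference, the button's href looks something like /bbs/Gossiping/index39227.html (the number here is illustrative), so the extraction boils down to this minimal sketch:

import re

href = '/bbs/Gossiping/index39227.html'  # hypothetical value from the '‹ 上頁' button
match = re.search(r'(\d+)', href)
if match:
    print(int(match.group(1)) + 1)  # 39228: the newest page's number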


In [3]:
# PTT's Gossiping board sits behind an age check; the over18 cookie skips it
resp_base = requests.get(base_url, cookies={'over18': '1'})
assert resp_base.status_code == 200
soup_base = BeautifulSoup(resp_base.text, 'lxml')
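As a side note, requesting the board without the over18 cookie does not return the index; PTT redirects to its age-confirmation page instead. A quick way to observe this (a sketch; behavior as observed at the time of writing):

resp_nocookie = requests.get(base_url)
# requests follows the redirect, so the final URL exposes the age gate
print(resp_nocookie.url)  # expected to point at PTT's over-18 confirmation page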

In [4]:
paging_tag = soup_base.find(class_='btn-group-paging')
total_page = None
for btn_tag in paging_tag.find_all('a'):
    if btn_tag.text == '‹ 上頁':
        # the '‹ 上頁' button links to page n-1; add 1 to get the newest page
        compile_page = re.search(r'(\d+)', btn_tag['href'])
        if compile_page:
            total_page = int(compile_page.group(1)) + 1
print('total page =', total_page)


total page = 39228

Walk back through the pages, checking dates and crawling articles

The oldest article page has page number 1.


In [5]:
def crawl_article(url):
    resp = requests.get(url, cookies={'over18': '1'})
    if resp.status_code != 200:
        return None
    soup = BeautifulSoup(resp.text, 'lxml')
    print('Crawling', url)

    # ##############################
    # crawl article
    # ##############################
    article = {
        'author_id': '',
        'author_nickname': '',
        'title': '',
        'timestamp': '',
        'contents': '',
        'ip': ''
    }
    article_body = soup.find(id='main-content')

    # article header
    article_head = article_body.find_all('div', class_='article-metaline')
    for metaline in article_head:
        meta_tag = metaline.find(class_='article-meta-tag').text
        meta_value = metaline.find(class_='article-meta-value').text
        if meta_tag == '作者':
            # the author line looks like "user_id (nickname)"
            compile_nickname = re.compile(r'\((.*)\)').search(meta_value)
            article['author_id'] = meta_value.split('(')[0].strip()
            article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''
        elif meta_tag == '標題':
            article['title'] = meta_value
        elif meta_tag == '時間':
            article['timestamp'] = meta_value

    # article content: keep only the bare text nodes directly under
    # main-content (metadata and push comments live in child tags)
    contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]
    contents = [re.sub('\n', '', expr) for expr in contents]
    contents = [i for i in contents if i]
    article['contents'] = '\n'.join(contents)

    # article publish ip
    article_ip = article_body.find(class_='f2').text
    compile_ip = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}').search(article_ip)
    article['ip'] = compile_ip.group(0) if compile_ip else ''

    # ##############################
    # crawl comments
    # ##############################
    comments = []
    for comment in article_body.find_all('div', class_='push'):
        tag = comment.find(class_='push-tag').text
        guest_id = comment.find(class_='push-userid').text
        guest_content = comment.find(class_='push-content').text
        guest_ipdatetime = comment.find(class_='push-ipdatetime').text
        compile_ip = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}').search(guest_ipdatetime)
        guest_ip = compile_ip.group(0) if compile_ip else ''
        # drop the ip (if any) so only the timestamp remains
        guest_timestamp = guest_ipdatetime.replace(guest_ip, '').strip()
        comments.append({
            'tag': tag,
            'id': guest_id,
            'content': guest_content,
            'ip': guest_ip,
            'timestamp': guest_timestamp
        })

    article['comments'] = comments
    article['url'] = url
    return article
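As a quick sanity check, crawl_article can be run against a single article URL (the URL below is taken from the crawl log further down; any article on the board works):

sample = crawl_article('https://www.ptt.cc/bbs/Gossiping/M.1537978608.A.325.html')
if sample:
    pprint({key: sample[key] for key in ('author_id', 'title', 'timestamp', 'ip')})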

In [6]:
DATE_GRATER=1
DATE_EQUAL=0
DATE_LESS=-1

def compare_timestamp_md(src, dest):
    """
    greater: 1
    equal: 0
    less: -1
    """
    date_src = datetime.strptime(src, '%m/%d')
    date_dest = datetime.strptime(dest, '%m/%d')
    if date_dest > date_src:
        return 1
    elif date_dest == date_src:
        return 0
    else:
        return -1
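A few quick checks of the helper (dates are illustrative; '%m/%d' parsing accepts both zero-padded and bare months):

print(compare_timestamp_md('09/27', '9/28'))  # 1  (DATE_GREATER): dest is later
print(compare_timestamp_md('09/27', '9/27'))  # 0  (DATE_EQUAL): same day
print(compare_timestamp_md('09/27', '9/26'))  # -1 (DATE_LESS): dest is earlier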

In [7]:
data = []
for page in range(total_page, 1, -1):
    current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)
    resp_page = requests.get(current_url, cookies={'over18': '1'})
    if resp_page.status_code != 200:
        continue
    soup_page = BeautifulSoup(resp_page.text, 'lxml')
    
    # ##############################
    # check the first article date
    # ##############################
    container_tag = soup_page.find('div', class_='r-list-container')
    first_article = container_tag.find('div', class_='r-ent')
    first_article_date = first_article.find('div', class_='date').text.strip()
    compare_datetime = compare_timestamp_md(ptt_today_str, first_article_date)
    print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))
    
    if compare_datetime == 1:
        continue
    else:
        # only crawling today's article before r-list-sep line
        for article_row_tag in container_tag.findChildren('div', recursive=False):
            if 'r-list-sep' in article_row_tag['class']:
                break
            if 'r-ent' in article_row_tag['class']:
                article_date = article_row_tag.find('div', class_='date').text.strip()
                article_date_compare = compare_timestamp_md(ptt_today_str, article_date)
                if article_date_compare != 0:
                    continue
                article_tag = article_row_tag.find('a', href=True)
                article_url = urljoin(base_url, article_tag['href'])
                article_data = crawl_article(article_url)
                data.append(article_data)

        # if the first article date is earlier than current date, should break the iteration
        if compare_datetime == -1:
            break

with open('today_articles.json', 'w+', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
    print('Save - today_articles.json')


https://www.ptt.cc/bbs/Gossiping/index39228.html - date 9/27 result 0
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978608.A.325.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978662.A.45A.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978695.A.9A7.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978699.A.194.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978724.A.356.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978750.A.39A.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978768.A.08B.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978815.A.5B2.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978820.A.119.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978934.A.F8E.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978941.A.754.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978960.A.779.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978973.A.B90.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978993.A.F88.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537979013.A.67C.html
https://www.ptt.cc/bbs/Gossiping/index39227.html - date 9/27 result 0
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977913.A.4EE.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977930.A.01B.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977933.A.013.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977952.A.904.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977959.A.A7B.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977966.A.77C.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978043.A.03E.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978060.A.9DF.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978098.A.D36.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978140.A.C44.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978152.A.31C.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978156.A.B1A.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978179.A.844.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978195.A.D33.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978272.A.533.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978295.A.B6A.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978350.A.D02.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978378.A.746.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978494.A.B6B.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978521.A.06B.html
https://www.ptt.cc/bbs/Gossiping/index39226.html - date 9/26 result -1
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977639.A.3F8.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977693.A.A67.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977700.A.FD6.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977711.A.493.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977729.A.BE4.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977740.A.534.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977827.A.B50.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977851.A.17A.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977857.A.B1D.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977877.A.292.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977878.A.13E.html
Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977910.A.566.html
Save - today_articles.json
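To double-check the result, the saved file can be loaded back and inspected (a minimal sketch):

with open('today_articles.json', encoding='utf-8') as f:
    saved = json.load(f)
print(len(saved), 'articles crawled today')
if saved:
    print(saved[0]['title'])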