練習


In [1]:
import requests
import re
import os

from PIL import Image
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urljoin
from pprint import pprint

# Seed URL: entry point of the test crawling site; the crawl below starts here.
url = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'

In [2]:
# Fetch the seed page with a randomized User-Agent header and parse its HTML.
headers = {'User-Agent': UserAgent().random}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')

In [3]:
# Initialize the crawl frontier (wait_list) and the visited log (view_list),
# then seed the frontier with every unique absolute link on the seed page.
wait_list = []
view_list = []
# a.get('href') instead of a['href']: anchors without an href attribute
# (e.g. named anchors) would otherwise raise KeyError; falsy hrefs are skipped.
hrefs = [a.get('href') for a in soup.find_all('a')]
links = [urljoin(resp.url, href) for href in hrefs if href]
links = list(set(links))  # deduplicate before enqueueing
wait_list += links

In [4]:
all_h1_text = []

# Iterative crawl: pop URLs off the frontier until it is empty, recording
# each page's <h1> texts and enqueueing newly discovered, unvisited links.
while wait_list:

    link = wait_list.pop()
    if link in view_list:
        continue

    print(link)
    view_list.append(link)

    page_resp = requests.get(link, headers=headers)
    page_soup = BeautifulSoup(page_resp.text, 'lxml')

    # get h1 tag on current page
    h1s = [h1.text for h1 in page_soup.find_all('h1')]
    all_h1_text += h1s

    # search new links in current page; a.get('href') instead of a['href']
    # so anchors without an href attribute don't raise KeyError.
    hrefs = [a.get('href') for a in page_soup.find_all('a')]
    new_links = [urljoin(page_resp.url, h) for h in hrefs if h]
    new_links = [u for u in new_links if u not in view_list]
    wait_list += new_links
    wait_list = list(set(wait_list))  # deduplicate the frontier
    print('wait list:')
    pprint(wait_list)
    print('view list:')
    pprint(view_list)
    print('all text:')
    pprint(all_h1_text)
    print('='*87)


https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html
wait list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']
view list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html']
all text:
['Man must explore, and this is exploration at its greatest']
=======================================================================================
https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html
wait list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']
view list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']
all text:
['Man must explore, and this is exploration at its greatest', 'About Me']
=======================================================================================
https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html
wait list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']
view list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']
all text:
['Man must explore, and this is exploration at its greatest',
 'About Me',
 'Contact Me']
=======================================================================================
https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html
wait list:
[]
view list:
['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',
 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']
all text:
['Man must explore, and this is exploration at its greatest',
 'About Me',
 'Contact Me',
 'Clean Blog']
=======================================================================================