The images a static-site crawler sees:
The images a dynamic-site crawler sees:
In [1]:
import os
import requests
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent
from pprint import pprint
url = 'https://afuntw.github.io/Test-Crawling-Website/pages/gallery/index.html'
fu = UserAgent()
In [2]:
# static crawl: parse only the HTML returned by the server (no JavaScript executed)
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')

# collect the src of every <img> whose class contains "img-change"
imgs = soup.find_all('img', class_=re.compile('.*img-change'))
imgs = [i['src'] for i in imgs]
imgs = list(set(imgs))  # deduplicate
pprint(imgs)
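To actually look at what the static crawler retrieves, the same requests-based download pattern used later in the Selenium cell can be reused here. This is a minimal sketch, not part of the original notebook: the src values found in the raw HTML may be relative, so they are resolved against the page URL with urllib.parse.urljoin, and the output folder ../results/static is a hypothetical choice.

from urllib.parse import urljoin

static_dir = os.path.abspath('../results/static')  # hypothetical output folder
if not os.path.exists(static_dir):
    os.makedirs(static_dir)

for src in imgs:
    img_url = urljoin(url, src)  # resolve a possibly relative src against the page URL
    img_resp = requests.get(img_url, stream=True, headers={'User-Agent': fu.random})
    filename = os.path.join(static_dir, os.path.basename(img_url))
    with open(filename, 'wb') as f:
        for chunk in img_resp.iter_content(2048):
            f.write(chunk)
    print('save - {}'.format(filename))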
In [3]:
driver = webdriver.Chrome()
results = os.path.abspath('../results')
if not os.path.exists(results):
    os.makedirs(results)

try:
    # webdriver setting
    driver.get(url)
    driver.maximize_window()
    driver.implicitly_wait(10)

    # xpath
    imgs = driver.find_elements(By.XPATH, '/html/body/div/div/div/a/img')
    imgs = [i.get_attribute('src') for i in imgs]
    imgs = list(set(imgs))
    print(imgs)

    # download
    for img in imgs:
        headers = {'User-Agent': fu.random}
        img_resp = requests.get(img, stream=True, headers=headers)
        filename = os.path.basename(img)
        print('catch - {}'.format(filename))
        filename = os.path.join(results, filename)
        with open(filename, 'wb') as f:
            for chunk in img_resp.iter_content(2048):
                f.write(chunk)
        print('save - {}'.format(filename))
except Exception as e:
    print(e)
finally:
    driver.quit()
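To make the difference between the two crawls explicit, the two URL lists can be compared as sets. A minimal sketch, assuming the static result was kept in a separate variable (hypothetically static_imgs, the src list from the requests/BeautifulSoup cell) and the Selenium result in dynamic_imgs; Selenium's get_attribute('src') already returns absolute URLs, while the static list is resolved with urljoin first.

from urllib.parse import urljoin

static_urls = {urljoin(url, src) for src in static_imgs}   # static crawl, made absolute
dynamic_urls = set(dynamic_imgs)                           # dynamic crawl (Selenium)
print('only in static crawl :', static_urls - dynamic_urls)
print('only in dynamic crawl:', dynamic_urls - static_urls)
print('in both              :', static_urls & dynamic_urls)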