練習


In [1]:
import os
from pprint import pprint
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PIL import Image

# Address of the page that is downloaded and searched for <img> tags below.
url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'

In [2]:
# Download the page. requests has no default timeout, so one is set
# explicitly to keep the cell from hanging on an unresponsive server.
resp = requests.get(url, timeout=10)
# Stop on 4xx/5xx instead of silently parsing an error page.
resp.raise_for_status()
# Parse the HTML with the lxml backend.
soup = BeautifulSoup(resp.text, 'lxml')

In [3]:
# Collect the URL of every <img> tag on the page.
# - src=True skips tags that have no src attribute, which would otherwise
#   raise KeyError on tag['src'].
# - urljoin resolves relative paths against the page URL so each entry can
#   be requested directly; absolute URLs pass through as-is.
imgs = [urljoin(url, tag['src']) for tag in soup.find_all('img', src=True)]

In [4]:
# Directory the images are written to. exist_ok=True replaces the separate
# os.path.exists() check, which could race with another process creating
# the directory between the check and the makedirs call.
results = os.path.abspath('../results')
os.makedirs(results, exist_ok=True)

for i in imgs:
    # Use the response as a context manager so the streamed connection is
    # released even if Pillow fails to parse the payload.
    with requests.get(i, stream=True, timeout=10) as img_resp:
        # Fail on HTTP errors rather than handing an error page to Pillow.
        img_resp.raise_for_status()
        # Response.raw bypasses requests' content decoding; ask urllib3 to
        # undo gzip/deflate so Pillow receives the actual image bytes.
        img_resp.raw.decode_content = True

        image = Image.open(img_resp.raw)
        filename = os.path.basename(i)
        print('catch the filename {} and the real format is {}'.format(filename, image.format))

        # The extension in the URL is not trusted: the name is rebuilt from
        # the format Pillow detected. splitext strips only the last
        # extension, so a name such as "a.b.png" keeps the stem "a.b".
        real_filename = '{}.{}'.format(
            os.path.splitext(filename)[0],
            image.format.lower()
        )
        save_filename = os.path.join(results, real_filename)
        print('catch the real filename {}'.format(real_filename))

        # Save while the response is still open, since Pillow may read the
        # pixel data lazily from the underlying stream.
        image.save(save_filename)
        print('save image at {}'.format(save_filename))


catch the filename XgXT3Va.png and the real format is JPEG
catch the real filename XgXT3Va.jpeg
save image at /home/dirl/github/Python-Crawling-Tutorial/results/XgXT3Va.jpeg
catch the filename Q3bkStv.png and the real format is PNG
catch the real filename Q3bkStv.png
save image at /home/dirl/github/Python-Crawling-Tutorial/results/Q3bkStv.png
catch the filename IDPxvSl.jpg and the real format is PNG
catch the real filename IDPxvSl.png
save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png
catch the filename ZEhBDs6.png and the real format is PNG
catch the real filename ZEhBDs6.png
save image at /home/dirl/github/Python-Crawling-Tutorial/results/ZEhBDs6.png
catch the filename UKxK6FZ.gif and the real format is PNG
catch the real filename UKxK6FZ.png
save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png