In [1]:
import requests
import os
from PIL import Image
from bs4 import BeautifulSoup
from datetime import datetime
from time import ctime
from pprint import pprint
url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'

# Timestamp of the previous crawl; images not modified since then will be skipped
last_modified = datetime(2018, 1, 29, 14, 39, 10)
In [2]:
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')
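
Nothing here checks whether the request actually succeeded; if the page can be missing or the network flaky, a status check before parsing is a cheap safeguard (a minimal variant of the same cell, not part of the original notebook):

resp = requests.get(url)
resp.raise_for_status()                  # fail fast on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(resp.text, 'lxml')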
In [3]:
# Collect the src attribute of every <img> tag on the page
imgs = soup.find_all('img')
imgs = [i['src'] for i in imgs]
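
The src values are used as-is below, which works only if they are absolute URLs. If the page uses relative paths, urllib.parse.urljoin can resolve them against the page URL; a small sketch, assuming the url and imgs variables from the cells above:

from urllib.parse import urljoin

# Resolve relative paths such as 'img/foo.jpg' against the page URL
imgs = [urljoin(url, src) for src in imgs]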
In [4]:
# Make sure the output directory exists
results = os.path.abspath('../results')
if not os.path.exists(results):
    os.makedirs(results)

for i in imgs:
    # check header only: a HEAD request returns the headers without the image body
    check_resp = requests.head(i)
    check_head = dict(check_resp.headers)

    # Skip images whose Last-Modified time is older than the previous crawl
    if 'Last-Modified' in check_head:
        check_modified = check_head['Last-Modified']
        check_modified = datetime.strptime(check_modified, '%a, %d %b %Y %H:%M:%S GMT')
        check_not_modified = check_modified < last_modified
        if check_not_modified:
            continue

    # Download the image and let Pillow detect its real format from the bytes
    img_resp = requests.get(i, stream=True)
    image = Image.open(img_resp.raw)
    filename = os.path.basename(i)
    print('catch the filename {} and the real format is {}'.format(filename, image.format))

    # Rename the file with the extension that matches the detected format
    real_filename = '{}.{}'.format(
        filename.split('.')[0],
        image.format.lower()
    )
    save_filename = os.path.join(results, real_filename)
    print('catch the real filename {}'.format(real_filename))

    image.save(save_filename)
    print('save image at {}'.format(save_filename))
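
The HEAD-then-compare loop above does the freshness check on the client side. HTTP also supports conditional requests, so as an alternative sketch (not part of the original notebook, and only useful when the server honors If-Modified-Since), the comparison can be delegated to the server: a 304 response means the image has not changed since last_modified and no body is transferred. This assumes last_modified is already in GMT, as the comparison above does.

for i in imgs:
    # Conditional GET: the server answers 304 Not Modified if the image is older
    headers = {'If-Modified-Since': last_modified.strftime('%a, %d %b %Y %H:%M:%S GMT')}
    img_resp = requests.get(i, headers=headers, stream=True)
    if img_resp.status_code == 304:
        continue

    # Same format detection and save as above
    image = Image.open(img_resp.raw)
    real_filename = '{}.{}'.format(
        os.path.basename(i).split('.')[0],
        image.format.lower()
    )
    image.save(os.path.join(results, real_filename))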