In [1]:
import requests
import os
from PIL import Image
from bs4 import BeautifulSoup
from datetime import datetime
from time import ctime
from pprint import pprint
url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'

# Timestamp of the previous crawl; images not modified since then will be skipped
last_modified = datetime(2018, 1, 29, 14, 39, 10)
In [2]:
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')
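
Nothing here checks whether the request actually succeeded; if the page can be missing or the network flaky, a status check before parsing is a cheap safeguard (a minimal variant of the same cell, not part of the original notebook):

resp = requests.get(url)
resp.raise_for_status()                  # fail fast on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(resp.text, 'lxml')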
In [3]:
# Collect the src attribute of every <img> tag on the page
imgs = soup.find_all('img')
imgs = [i['src'] for i in imgs]
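
The src values are used as-is below, which works only if they are absolute URLs. If the page uses relative paths, urllib.parse.urljoin can resolve them against the page URL; a small sketch, assuming the url and imgs variables from the cells above:

from urllib.parse import urljoin

# Resolve relative paths such as 'img/foo.jpg' against the page URL
imgs = [urljoin(url, src) for src in imgs]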
In [4]:
# Make sure the output directory exists
results = os.path.abspath('../results')
if not os.path.exists(results):
    os.makedirs(results)

for i in imgs:
    # check header only: a HEAD request returns the headers without the image body
    check_resp = requests.head(i)
    check_head = dict(check_resp.headers)

    # Skip images whose Last-Modified time is older than the previous crawl
    if 'Last-Modified' in check_head:
        check_modified = check_head['Last-Modified']
        check_modified = datetime.strptime(check_modified, '%a, %d %b %Y %H:%M:%S GMT')
        check_not_modified = check_modified < last_modified
        if check_not_modified:
            continue

    # Download the image and let Pillow detect its real format from the bytes
    img_resp = requests.get(i, stream=True)
    image = Image.open(img_resp.raw)
    filename = os.path.basename(i)
    print('catch the filename {} and the real format is {}'.format(filename, image.format))

    # Rename the file with the extension that matches the detected format
    real_filename = '{}.{}'.format(
        filename.split('.')[0],
        image.format.lower()
    )
    save_filename = os.path.join(results, real_filename)
    print('catch the real filename {}'.format(real_filename))

    image.save(save_filename)
    print('save image at {}'.format(save_filename))
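
The HEAD-then-compare loop above does the freshness check on the client side. HTTP also supports conditional requests, so as an alternative sketch (not part of the original notebook, and only useful when the server honors If-Modified-Since), the comparison can be delegated to the server: a 304 response means the image has not changed since last_modified and no body is transferred. This assumes last_modified is already in GMT, as the comparison above does.

for i in imgs:
    # Conditional GET: the server answers 304 Not Modified if the image is older
    headers = {'If-Modified-Since': last_modified.strftime('%a, %d %b %Y %H:%M:%S GMT')}
    img_resp = requests.get(i, headers=headers, stream=True)
    if img_resp.status_code == 304:
        continue

    # Same format detection and save as above
    image = Image.open(img_resp.raw)
    real_filename = '{}.{}'.format(
        os.path.basename(i).split('.')[0],
        image.format.lower()
    )
    image.save(os.path.join(results, real_filename))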