練習

觀察 https://www.pexels.com/ 並撰寫爬蟲程式
下載 5 張桌布圖



In [1]:

    
import requests
import re
import os

from bs4 import BeautifulSoup
from pprint import pprint

# Landing page of the site to crawl; the later cells fetch and parse this URL.
url = 'https://www.pexels.com/'



In [2]:

    
# Fetch the landing page and parse it into a BeautifulSoup tree.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# letting the following cells parse an error page.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'lxml')



In [3]:

    
# Collect the image URL of every photo card inside <div class="photos">.
photo_container = soup.find('div', class_='photos')
if photo_container is None:
    # Fail with an explicit message instead of an opaque AttributeError on None.
    raise RuntimeError("no <div class='photos'> found; the page layout may have changed")

article = photo_container.find_all('article', class_='photo-item')

# For each card take the first <a>, then the first <img> inside it.
# Cards missing either tag, or whose <img> has no src, are skipped rather
# than aborting the whole cell.
card_links = (a.find('a') for a in article)
img_tags = (link.find('img') for link in card_links if link is not None)
imgs = [tag['src'] for tag in img_tags if tag is not None and tag.get('src')]

# Only the first five images are downloaded.
target = imgs[:5]

pprint(target)









    



['https://images.pexels.com/photos/106606/pexels-photo-106606.jpeg?h=350&auto=compress&cs=tinysrgb',
 'https://images.pexels.com/photos/405041/pexels-photo-405041.jpeg?h=350&auto=compress&cs=tinysrgb',
 'https://images.pexels.com/photos/102170/pexels-photo-102170.jpeg?h=350&auto=compress&cs=tinysrgb',
 'https://images.pexels.com/photos/583399/pexels-photo-583399.jpeg?h=350&auto=compress&cs=tinysrgb',
 'https://images.pexels.com/photos/398533/pexels-photo-398533.jpeg?h=350&auto=compress&cs=tinysrgb']



In [4]:

    
# Download every URL in `target` into ../results (resolved against the
# current working directory).
results = os.path.abspath('../results')

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(results, exist_ok=True)

# Photo ids are not guaranteed to be exactly six digits, so accept any
# run of digits when extracting "pexels-photo-<id>.jpeg" from the URL.
name_pattern = re.compile(r".*(pexels-photo-([0-9]+)\.jpeg).*")

for i in target:
    matched = name_pattern.match(i)
    if matched is None:
        # Without a recognisable file name there is nothing sensible to save as.
        print('Skip {}: cannot derive a file name'.format(i))
        continue

    filename = matched.group(1)
    print('regex catch the name {}'.format(filename))

    filename = os.path.join(results, filename)

    # The context manager releases the streamed connection even if the
    # download fails half-way; raise_for_status() prevents saving an HTTP
    # error body as if it were an image.
    with requests.get(i, stream=True, timeout=30) as img_resp:
        img_resp.raise_for_status()
        with open(filename, 'wb') as f:
            # Stream in 2 KiB chunks so the whole image is never held in memory.
            for chunk in img_resp.iter_content(2048):
                f.write(chunk)
    print('Save the img at {}'.format(filename))









    



regex catch the name pexels-photo-106606.jpeg
Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-106606.jpeg
regex catch the name pexels-photo-405041.jpeg
Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-405041.jpeg
regex catch the name pexels-photo-102170.jpeg
Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-102170.jpeg
regex catch the name pexels-photo-583399.jpeg
Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-583399.jpeg
regex catch the name pexels-photo-398533.jpeg
Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-398533.jpeg