練習


In [1]:
import requests
import re
import os

from PIL import Image
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urljoin
from pprint import pprint

url = 'http://exam.lib.ntu.edu.tw/graduate'

In [2]:
fu = UserAgent()
headers = {'User-Agent': fu.random}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')

In [3]:
results = os.path.abspath('../results')
if not os.path.exists(results):
    os.makedirs(results)

pdfs = soup.find_all('img', class_=re.compile('.*field-icon-application-pdf$'))
for i, pdf in enumerate(pdfs):
    href = pdf.parent['href']
    abs_href = urljoin(resp.url, href)
    file_resp = requests.get(abs_href, headers=headers, stream=True)
    
    filename = os.path.basename(abs_href)
    filename = filename.split('&')[0]
    print('({}/{}) catch the filename {}'.format(i+1, len(pdfs), filename))
    filename = os.path.join(results, filename)

    with open(filename, 'wb') as f:
        for chunk in file_resp.iter_content(2048):
            f.write(chunk)
        print('({}/{}) save file {}'.format(i+1, len(pdfs),filename))


(1/30) catch the filename 106_graduate_4.pdf
(1/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_4.pdf
(2/30) catch the filename 106_graduate_6.pdf
(2/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_6.pdf
(3/30) catch the filename 106_graduate_3.pdf
(3/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_3.pdf
(4/30) catch the filename 106_graduate_1.pdf
(4/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_1.pdf
(5/30) catch the filename 106_graduate_2.pdf
(5/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_2.pdf
(6/30) catch the filename 106_graduate_8.pdf
(6/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf
(7/30) catch the filename 106_graduate_5.pdf
(7/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf
(8/30) catch the filename 106_graduate_10.pdf
(8/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_10.pdf
(9/30) catch the filename 106_graduate_7.pdf
(9/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_7.pdf
(10/30) catch the filename 106_graduate_11.pdf
(10/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_11.pdf
(11/30) catch the filename 106_graduate_13.pdf
(11/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_13.pdf
(12/30) catch the filename 106_graduate_15.pdf
(12/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_15.pdf
(13/30) catch the filename 106_graduate_14.pdf
(13/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_14.pdf
(14/30) catch the filename 106_graduate_8.pdf
(14/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf
(15/30) catch the filename 106_graduate_5.pdf
(15/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf
(16/30) catch the filename 106_graduate_16.pdf
(16/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_16.pdf
(17/30) catch the filename 106_graduate_17.pdf
(17/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf
(18/30) catch the filename 106_graduate_18.pdf
(18/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_18.pdf
(19/30) catch the filename 106_graduate_19.pdf
(19/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_19.pdf
(20/30) catch the filename 106_graduate_17.pdf
(20/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf
(21/30) catch the filename 106_graduate_20.pdf
(21/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_20.pdf
(22/30) catch the filename 106_graduate_22.pdf
(22/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_22.pdf
(23/30) catch the filename 106_graduate_21.pdf
(23/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_21.pdf
(24/30) catch the filename 106_graduate_8.pdf
(24/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf
(25/30) catch the filename 106_graduate_25.pdf
(25/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_25.pdf
(26/30) catch the filename 106_graduate_23.pdf
(26/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_23.pdf
(27/30) catch the filename 106_graduate_24.pdf
(27/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_24.pdf
(28/30) catch the filename 106_graduate_8.pdf
(28/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf
(29/30) catch the filename 106_graduate_26.pdf
(29/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_26.pdf
(30/30) catch the filename 106_graduate_28.pdf
(30/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_28.pdf