In [1]:
import os
import shutil
import time
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
In [2]:
# Address of the page that gets requested and parsed below.
url = 'https://news.yahoo.co.jp/list/'

# User-Agent header value; adjacent string literals are joined by the parser,
# so the result is one single string (note the trailing space is preserved).
ua = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/55.0.2883.95 Safari/537.36 '
)
In [3]:
# Build the request with an explicit User-Agent header.
req = urllib.request.Request(url, headers={'User-Agent': ua})

# Read the body inside a context manager so the HTTP response is always
# closed, and bound the wait so a stalled server cannot hang the notebook.
# `html` therefore holds the raw response bytes rather than an open response.
with urllib.request.urlopen(req, timeout=10) as response:
    html = response.read()
In [4]:
# Parse the fetched markup using the "html.parser" backend.
soup = BeautifulSoup(html, features="html.parser")
In [5]:
# Find the first element with class "list"; `find` returns None when there is
# no match, so fail with a readable message instead of an AttributeError.
list_container = soup.find(class_='list')
if list_container is None:
    raise RuntimeError(
        "no element with class 'list' found in the page; "
        "the page layout may have changed"
    )

# Collect the 'data-src' value of every <img> inside that element. Images
# without a (non-empty) 'data-src' are skipped: `img.get` would return None
# for them, and a None entry cannot be turned into a file name later on.
url_list = [
    img['data-src']
    for img in list_container.find_all('img')
    if img.get('data-src')
]
In [6]:
def download_file(url, dst_path):
    """Download ``url`` and write the response body to ``dst_path``.

    Args:
        url: URL to fetch; passed straight to ``urllib.request.urlopen``.
        dst_path: Path of the file to write. Missing parent directories are
            created before the file is opened.

    Returns:
        None.

    Errors raised as ``urllib.error.URLError`` (which includes ``HTTPError``)
    are printed and swallowed, so one failing URL does not abort a batch.
    When opening the URL fails, no directory or file is created. Any other
    exception propagates to the caller.
    """
    try:
        with urllib.request.urlopen(url) as web_file:
            # Create the target directory only after the URL opened
            # successfully, so a failed request leaves nothing behind.
            parent_dir = os.path.dirname(dst_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(dst_path, 'wb') as local_file:
                # Stream in chunks instead of loading the whole body in memory.
                shutil.copyfileobj(web_file, local_file)
    except urllib.error.URLError as e:
        print(e)
In [7]:
def download_file_to_dir(url, dst_dir):
    """Download ``url`` into ``dst_dir``, naming the file after the URL path.

    The file name is the last component of the URL's *path*, so a query
    string or fragment (``...?w=200#x``) does not end up in the name.

    Args:
        url: URL string to download.
        dst_dir: Directory the file is saved into.

    Raises:
        ValueError: If the URL path has no final component (for example it
            ends with ``/``), so no file name can be derived.
    """
    filename = os.path.basename(urllib.parse.urlsplit(url).path)
    if not filename:
        raise ValueError('cannot derive a file name from URL: {!r}'.format(url))
    download_file(url, os.path.join(dst_dir, filename))
In [8]:
# Directory the images are meant to be saved into.
download_dir = 'data/temp'
# Pause between iterations, in seconds.
sleep_time_sec = 1

# Use a dedicated loop variable: iterating with `url` would overwrite the page
# URL defined earlier in the notebook, so re-running the request cell after
# this one would fetch the last image URL instead of the listing page.
for image_url in url_list:
    # The download call is currently commented out, so the loop only sleeps.
    # print(image_url)
    # download_file_to_dir(image_url, download_dir)
    time.sleep(sleep_time_sec)