In [5]:
import requests
from bs4 import BeautifulSoup
import pathlib
import os
In [6]:
# Create a BeautifulSoup object from a website
def load_soup(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup
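As a quick sanity check, the gallery page can be loaded and its title inspected. This is a minimal sketch, not part of the scraper itself; it only assumes the gallery URL used later in the notebook is reachable.
In [ ]:
# Smoke test: load the 2-corner gallery page and confirm it parses.
soup = load_soup('http://cr31.co.uk/stagecast/wang/tiles_c.html')
print(soup.title.text if soup.title else 'no <title> found')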
In [7]:
# Site galleries contain tables for individual tilesets
def get_tileset_tables(soup):
    # 2-corner
    tables = get_two_corner_tileset_tables(soup)
    for t in tables:
        yield t

def get_two_corner_tileset_tables(soup):
    return (soup
            .find_all('table')[3]
            .find('table')
            .find_all('table'))
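The hard-coded nested-table indexing above is tied to the current page layout, so a quick check of how many tileset tables it picks up can catch layout changes early. This is an optional sketch, assuming the same gallery URL as the scraper below.
In [ ]:
# Count the tileset tables the selector finds on the gallery page.
soup = load_soup('http://cr31.co.uk/stagecast/wang/tiles_c.html')
tables = list(get_tileset_tables(soup))
print('Found', len(tables), 'tileset tables')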
In [8]:
def get_set_name(table):
    name = table.find('caption').text.strip().replace('*', '')
    return name

def get_image_addresses(table):
    imgs = table.find_all('img')
    for i in imgs:
        address = fixRelativeUrl(i.attrs['src'])
        yield address

def fixRelativeUrl(url):
    # HACK: for these we go up a level
    return url.replace('..', 'http://cr31.co.uk/stagecast')
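The string replacement in fixRelativeUrl assumes every image src begins with '..' relative to the gallery page. A small illustration with a made-up path (the directory name is hypothetical, not taken from the site) shows the transformation:
In [ ]:
# Illustrative only: a made-up relative src and the absolute URL it becomes.
example_src = '../art/example_set/0.gif'
print(fixRelativeUrl(example_src))
# -> http://cr31.co.uk/stagecast/art/example_set/0.gif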
In [10]:
def get_formatted_tilename(url, set_dir):
    # Find just the filename at the end of the URL
    s = url.split('/')[-1]
    num = int(s.split('.')[0])
    # Keep leading zeros for 3 digits
    formatted = set_dir + '/' + '{:0>3d}.gif'.format(num)
    return formatted

def download_tile(url, set_dir, verbose=False):
    response = requests.get(url)
    if response.status_code == 200:
        filename = get_formatted_tilename(url, set_dir)
        save_image(response.content, filename, verbose)
    else:
        print('Bad response code:', response.status_code, 'for', url)

def save_image(content, filename, verbose=False):
    with open(filename, 'wb') as f:
        f.write(content)
    if verbose:
        print('Saved', filename)
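Since the zero-padding determines the on-disk names, here is a quick example of the mapping; the tile URL and directory are made up for illustration.
In [ ]:
# Illustrative only: a hypothetical tile URL mapped to its padded local filename.
print(get_formatted_tilename('http://cr31.co.uk/stagecast/art/example_set/7.gif', './wang/example_set'))
# -> ./wang/example_set/007.gif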
In [12]:
def runTileScraper():
    print('Scraping tiles to', os.getcwd())
    # Create work folder
    tile_dir = './wang'
    pathlib.Path(tile_dir).mkdir(parents=True, exist_ok=True)
    urls = ['http://cr31.co.uk/stagecast/wang/tiles_c.html']
    for url in urls:
        soup = load_soup(url)
        # Loop over all the tile sets
        for tileset in get_tileset_tables(soup):
            name = get_set_name(tileset)
            set_dir = tile_dir + '/' + name
            pathlib.Path(set_dir).mkdir(parents=True, exist_ok=True)
            # Loop over all of the tile images
            for img in get_image_addresses(tileset):
                download_tile(img, set_dir, verbose=True)
            print('Downloaded', name)
        print('Tile scrape complete for', url)
    print('All tile scrapers complete!')
In [ ]:
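A minimal sketch of kicking off the scrape, assuming network access to cr31.co.uk and write permission in the working directory:
In [ ]:
# Run the scraper; tiles land under ./wang/<set name>/NNN.gif
runTileScraper()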