In [5]:
import requests
from bs4 import BeautifulSoup
import pathlib
import os

Downloading and Parsing


In [6]:
# Create BeautifulSoup object from website
def load_soup(url, timeout=30):
    """Fetch ``url`` and return its HTML parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server (default 30).  Without a timeout,
        ``requests.get`` can block indefinitely on an unresponsive host.

    Returns
    -------
    BeautifulSoup
        Parsed document (html.parser backend).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

Working with the Soup


In [7]:
# site galleries contain tables for individual tilesets
def get_tileset_tables(soup):
    """Yield each per-tileset <table> element found in a gallery page.

    Currently only the 2-corner gallery layout is handled; delegate
    straight to its extractor.  ``yield from`` replaces the original
    manual re-yield loop.
    """
    # 2-corner
    yield from get_two_corner_tileset_tables(soup)
    
def get_two_corner_tileset_tables(soup):
    """Return the tileset tables nested inside the 2-corner gallery page.

    The layout puts the gallery in the 4th top-level table; each
    individual tileset is a table nested one level further down.
    (Index 3 is layout-specific — NOTE(review): will break if the site
    markup changes.)
    """
    gallery = soup.find_all('table')[3]
    inner = gallery.find('table')
    return inner.find_all('table')

Working with Tileset Table


In [8]:
def get_set_name(table):
    """Extract a tileset's display name from its <caption> element.

    Surrounding whitespace is trimmed and any '*' markers (used by the
    site to flag certain sets) are removed.
    """
    caption_text = table.find('caption').text
    return caption_text.strip().replace('*', '')

def get_image_addresses(table):
    """Yield an absolute URL for every <img> inside a tileset table.

    Relative 'src' attributes are rewritten to absolute addresses via
    fixRelativeUrl before being yielded.
    """
    for img in table.find_all('img'):
        yield fixRelativeUrl(img.attrs['src'])
    
def fixRelativeUrl(url):
    """Convert the gallery's relative image URLs to absolute ones.

    The site links tile images as '../<path>' relative to the gallery
    page, so a leading '..' is rewritten to the site root.  Only a
    LEADING '..' is rewritten — the original replaced every '..'
    occurrence anywhere in the string, which would corrupt a URL that
    happened to contain '..' elsewhere.  URLs without a leading '..'
    pass through unchanged.
    """
    # HACK: for these we go up a level
    if url.startswith('..'):
        return 'http://cr31.co.uk/stagecast' + url[2:]
    return url

Downloading and Saving Tile Images


In [10]:
def get_formatted_tilename(url, set_dir='.'):
    """Build the local save path for a tile image URL.

    Bug fix: the original body referenced undefined names ``src`` and
    ``set_dir`` (neither was a parameter), so every call raised
    NameError.  It now uses the ``url`` argument and takes ``set_dir``
    explicitly (defaulting to the current directory for backward
    compatibility with single-argument calls).

    Parameters
    ----------
    url : str
        Image address whose final path segment is '<number>.<ext>'.
    set_dir : str, optional
        Directory the tile belongs in (default '.').

    Returns
    -------
    str
        '<set_dir>/NNN.gif' with the tile number zero-padded to 3 digits.
    """
    # find just the filename
    filename = url.split('/')[-1]
    num = int(filename.split('.')[0])
    # keep leading zeros for 3 digits
    formatted = set_dir + '/' + '{:0>3d}.gif'.format(num)
    return formatted

def download_tile(url, set_dir, verbose=False):
    """Download one tile image and save it under ``set_dir``.

    Non-200 responses are reported and skipped (best effort) rather
    than raised, so one bad tile doesn't abort the whole scrape.
    """
    response = requests.get(url)
    if response.status_code == 200:
        # pass set_dir through so the save path is built correctly
        filename = get_formatted_tilename(url, set_dir)
        save_image(response.content, filename, verbose)
    else:
        print ('Bad response code:', response.status_code, 'for', url)

def save_image(content, filename, verbose=False):
    """Write raw ``content`` bytes to ``filename``; optionally log the save."""
    with open(filename, 'wb') as f:
        f.write(content)
        if verbose:
            print ('Saved', filename)

Scraper Algorithm


In [12]:
def runTileScraper():
    """Scrape all Wang tilesets from the cr31 gallery pages into ./wang.

    For each gallery URL, parse the page, create one sub-directory per
    tileset under ./wang, and download every tile image into it,
    printing progress along the way.
    """
    print ('Scraping tiles to', os.getcwd())
    # work folder that holds one sub-directory per tileset
    tile_dir = './wang'
    pathlib.Path(tile_dir).mkdir(parents=True, exist_ok=True)

    gallery_urls = ['http://cr31.co.uk/stagecast/wang/tiles_c.html']
    for gallery_url in gallery_urls:
        soup = load_soup(gallery_url)

        # one table per tileset on the gallery page
        for tileset in get_tileset_tables(soup):
            set_name = get_set_name(tileset)
            set_dir = tile_dir + '/' + set_name
            pathlib.Path(set_dir).mkdir(parents=True, exist_ok=True)

            # fetch every tile image belonging to this set
            for image_url in get_image_addresses(tileset):
                download_tile(image_url, set_dir, verbose=True)

            print ('Downloaded', set_name)

        print ('Tile scrape complete for', gallery_url)
    print ('All tile scrapers complete!')

In [ ]: