notebook.community

Edit and run



In [ ]:

    
import requests
import lxml, lxml.html
from urllib.parse import urlparse, urlunparse, urljoin
from pathlib import Path
import time
import os


def url_join_path(url, path=None):
    """Resolve ``path`` (typically an ``href`` value) against ``url``.

    Args:
        url: Base URL the reference was found on.
        path: Reference to resolve. ``None`` returns ``url`` re-assembled
            from its parsed components.

    Returns:
        The resulting URL as a string.

    A relative ``path`` replaces only the path component of ``url``; the
    other components of ``url`` are carried over unchanged. A reference that
    names its own scheme or host (``https://other/x``, ``//cdn/x``,
    ``mailto:...``) is resolved as a complete URL instead of being squeezed
    into the path component of ``url``.
    """
    if path is None:
        return urlunparse(urlparse(url))

    reference = urlparse(path)
    if reference.scheme or reference.netloc:
        # The reference points somewhere of its own: let urljoin resolve it
        # against the whole base URL rather than against the path alone.
        return urljoin(url, path)

    parts = list(urlparse(url))
    parts[2] = urljoin(parts[2], path)  # index 2 is the path component
    return urlunparse(parts)

def is_subpath(url1, url2):
    """Return True when ``url2`` lies at or below ``url1`` on the same host.

    Args:
        url1: The base (containing) URL.
        url2: The candidate URL.

    Returns:
        ``False`` if the two URLs have different network locations.
        Otherwise ``True`` when the path of ``url2`` equals the path of
        ``url1`` or sits underneath it. Scheme, query and fragment are not
        compared.
    """
    base = urlparse(url1)
    candidate = urlparse(url2)

    if base.netloc != candidate.netloc:
        return False

    if base.path.endswith('/'):
        # The trailing slash already pins the match to a segment boundary.
        return candidate.path.startswith(base.path)

    # Without a trailing slash a bare prefix test would let '/static' match
    # '/staticfiles/...'; require an exact match or a '/' right after it.
    return (candidate.path == base.path
            or candidate.path.startswith(base.path + '/'))
    
# Sanity checks for is_subpath against the site root used below: each entry
# pairs a candidate URL with the answer expected for it.
assert all(
    is_subpath('https://opencontext.org/static/', candidate) is expected
    for candidate, expected in (
        ('https://opencontext.org/static/', True),
        ('https://nytimes.com', False),
        ('https://opencontext.org/', False),
        ('https://opencontext.org', False),
    )
)


# Top of the remote tree that the crawl starts from.
root_url = "https://opencontext.org/static/"



In [ ]:

    
def crawl(url, local_root, base_url, download=True, level=0, maxlevel=None,
          visited=None):
    """Walk a web directory tree depth-first and yield every file found in it.

    A URL is treated as a directory listing when it ends with ``/`` and the
    response declares an HTML content type. Every ``<a href>`` on such a page
    is followed, with that page as the new ``base_url``, so links that leave
    the directory are rejected. Any other URL is treated as a file and
    yielded.

    Args:
        url: URL to fetch.
        local_root: Local directory that the yielded ``local_path`` values
            are built under.
        base_url: Prefix that ``url`` must fall under (checked with
            ``is_subpath``); URLs outside it are skipped without a request.
        download: Not consulted here; it is only passed on to the recursive
            calls. Kept so existing callers keep working.
        level: Depth of this call.
        maxlevel: Deepest ``level`` that is still fetched, or ``None`` for no
            limit.
        visited: Set of URLs already fetched during this crawl. Callers
            normally leave it as ``None``; a fresh set is then created and
            shared with the recursive calls.

    Yields:
        dict with the keys ``relative_path`` (the URL path relative to
        ``/static``), ``local_path`` (``local_root`` joined with that
        relative path) and ``content`` (the raw response body as bytes).

    Raises:
        ValueError: If a file URL's path is not under ``/static`` (that
            prefix is hard-coded below).
    """
    if visited is None:
        visited = set()

    # reject urls that are not in the subroot
    if not is_subpath(base_url, url):
        return

    if maxlevel is not None and level > maxlevel:
        return

    # A listing that links back to itself would otherwise recurse without
    # end when maxlevel is None; it also keeps a URL that is linked more
    # than once from being fetched and yielded repeatedly.
    if url in visited:
        return
    visited.add(url)

    r = requests.get(url)
    if r.status_code != 200:
        return

    # leaf or directory?  A response without a Content-Type header is
    # handled as a leaf instead of raising KeyError.
    content_type = r.headers.get('content-type', '')
    if url.endswith('/') and content_type.startswith('text/html'):
        html = lxml.html.fromstring(r.text)
        for a in html.cssselect('a'):
            href = a.get('href')
            if href is None:
                # Named anchors (<a name="...">) carry no link to follow.
                continue
            yield from crawl(url=url_join_path(url, href),
                             local_root=local_root,
                             base_url=url, download=download,
                             level=level + 1, maxlevel=maxlevel,
                             visited=visited)
    else:
        # NOTE: the remote root is hard-coded to '/static' here.
        relative_path = Path(urlparse(url).path).relative_to('/static')
        local_path = Path(local_root) / relative_path
        yield {
            'relative_path': relative_path,
            'local_path': local_path,
            'content': r.content,
        }



In [ ]:

    
from itertools import islice

# Mirror the remote tree to disk, one file at a time.
# islice(..., 300, None) discards the first 300 items the crawl produces
# (they are still generated by crawl) and keeps everything after them --
# presumably to resume an interrupted run; TODO confirm.
mirror_root = '/Users/raymondyee/C/src/open-context-py/sysadmin/static'
all_files = crawl(root_url, mirror_root, root_url,
                  download=False, level=0, maxlevel=None)
pending_files = islice(all_files, 300, None)

for i, k in enumerate(pending_files):
    # Pause between saved files.
    time.sleep(0.5)
    filename = k['local_path']
    print(i, filename, len(k['content']))
    # Make sure the target directory exists before writing the file.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as f:
        f.write(k['content'])



In [ ]: