In [ ]:
import requests
import lxml, lxml.html
from urllib.parse import urlparse, urlunparse, urljoin
from pathlib import Path
import time
import os
def url_join_path(url, path=None):
    """Resolve the link *path* against *url* and return the resulting URL.

    Args:
        url: Base URL, typically the page a link was found on.
        path: Link target (for example an ``href`` value). When ``None``
            the base URL is returned after a parse/unparse round trip.

    Returns:
        The resolved URL as a string.

    Relative links are joined onto the path component of *url* only, so
    the base URL's scheme, host, params, query and fragment are kept.
    Links that carry their own scheme or host (``https://...``,
    ``//host/...``, ``mailto:...``) are resolved against the whole base
    URL instead; joining them onto the path alone would embed the foreign
    URL inside the base path (``https://a.org/https://b.org/x``).
    """
    parts = list(urlparse(url))
    if path is None:
        return urlunparse(parts)

    link = urlparse(path)
    if link.scheme or link.netloc:
        # Absolute or protocol-relative link: it replaces the base host.
        return urljoin(url, path)

    # Index 2 is the path component of the 6-tuple returned by urlparse.
    parts[2] = urljoin(parts[2], path)
    return urlunparse(parts)
def is_subpath(url1, url2):
    """Return True when *url2* lies at or below *url1* on the same host.

    Args:
        url1: The base (containing) URL.
        url2: The candidate URL to test.

    Returns:
        ``True`` if both URLs share the same network location and the path
        of *url2* is the path of *url1* or a descendant of it, else
        ``False``. The scheme, query and fragment are not compared.

    The prefix match must end on a path-segment boundary, so a base of
    ``/static`` matches ``/static`` and ``/static/css`` but not the
    sibling ``/static-old``.
    """
    base = urlparse(url1)
    candidate = urlparse(url2)

    if base.netloc != candidate.netloc:
        return False

    base_path = base.path
    if not candidate.path.startswith(base_path):
        return False

    # A base that is empty or already ends in '/' can only be matched on a
    # segment boundary, so the plain prefix test is sufficient.
    if base_path == '' or base_path.endswith('/'):
        return True

    # Otherwise whatever follows the prefix must start a new segment.
    remainder = candidate.path[len(base_path):]
    return remainder == '' or remainder.startswith('/')
# Sanity checks for is_subpath: each candidate URL is paired with the
# result expected when it is tested against the static root.
assert all(
    is_subpath('https://opencontext.org/static/', candidate) is expected
    for candidate, expected in (
        ('https://opencontext.org/static/', True),
        ('https://nytimes.com', False),
        ('https://opencontext.org/', False),
        ('https://opencontext.org', False),
    )
)

# Root of the remote tree passed to crawl() below.
root_url = "https://opencontext.org/static/"
In [ ]:
def crawl(url, local_root, base_url, download=True, level=0, maxlevel=None,
          _visited=None):
    """Recursively walk an HTML directory listing and yield the files found.

    Args:
        url: URL to fetch.
        local_root: Local directory under which yielded paths are placed.
        base_url: Only URLs for which ``is_subpath(base_url, url)`` holds
            are fetched. Recursive calls pass the current directory URL,
            so links pointing back up the tree are not followed.
        download: Accepted and forwarded to recursive calls; this function
            does not otherwise use it.
        level: Current recursion depth (0 for the initial call).
        maxlevel: Deepest level to fetch, or ``None`` for no limit.
        _visited: Internal set of URLs already requested during this
            crawl. Callers should leave it unset.

    Yields:
        One dict per non-directory URL with the keys ``relative_path``
        (path relative to ``/static``), ``local_path`` (that path joined
        onto *local_root*) and ``content`` (the response body as bytes).

    Raises:
        ValueError: If a leaf URL's path is not located under ``/static``.

    URLs that do not answer with HTTP 200 are skipped silently.
    """
    if _visited is None:
        _visited = set()

    # Reject URLs that are not inside the current sub-root.
    if not is_subpath(base_url, url):
        return
    if maxlevel is not None and level > maxlevel:
        return

    # A listing that links to itself (href="" or "./") resolves to the same
    # URL and would otherwise recurse without bound when maxlevel is None.
    if url in _visited:
        return
    _visited.add(url)

    r = requests.get(url)
    if r.status_code != 200:
        return

    # Leaf or directory? A response without a content-type header is
    # treated as a leaf.
    content_type = r.headers.get('content-type', '')
    if url.endswith('/') and content_type.startswith('text/html'):
        html = lxml.html.fromstring(r.text)
        for a in html.cssselect('a'):
            href = a.get('href')
            if href is None:
                # Named anchors (<a name="...">) carry no link to follow.
                continue
            yield from crawl(url=url_join_path(url, href),
                             local_root=local_root,
                             base_url=url, download=download,
                             level=level + 1, maxlevel=maxlevel,
                             _visited=_visited)
    else:
        # NOTE(review): the '/static' prefix is hard-coded, so this only
        # works for URLs below that directory.
        p = Path(urlparse(url).path).relative_to('/static')
        local_path = Path(local_root) / p
        yield {
            'relative_path': p,
            'local_path': local_path,
            'content': r.content,
        }
In [ ]:
from itertools import islice

# Walk the remote tree lazily, skip the first 300 yielded files, and write
# every remaining one to its local path.
found = crawl(root_url, '/Users/raymondyee/C/src/open-context-py/sysadmin/static', root_url, download=False, level=0, maxlevel=None)
for i, k in enumerate(islice(found, 300, None)):
    # Pause between files to avoid hammering the server.
    time.sleep(0.5)
    print(i, k['local_path'], len(k['content']))
    filename = k['local_path']
    # Create any missing parent directories before writing the file.
    filename.parent.mkdir(parents=True, exist_ok=True)
    filename.write_bytes(k['content'])
In [ ]: