In [ ]:
'''
Crawler to download all the books from All It Ebooks site.
It fetches all the ebook links from the sitemap. A new directory
will be created and then all the files will be downloaded there,
and then saves them in a JSON file.
'''
from lxml import etree, html
from requests.adapters import HTTPAdapter
import requests
import os
import json
import re
aite_sm = 'http://www.allitebooks.com/post-sitemap{0}.xml'
urls = []
with open('list-of-post-urls-allitbooks.md', 'w') as fp:
for i in range(1, 8):
fp.write('\n\n## Post Sitemap {0}\n\n'.format(i))
req = requests.get(aite_sm.format(i))
root = etree.fromstring(req.content)
for sitemap in root:
url = sitemap.getchildren()[0].text
fp.write('\n[{0}]({1})\n'.format(url, url))
urls.append(url)
In [ ]:
down_dict = {}
if not os.path.exists('all-it-ebooks'):
os.mkdir('all-it-ebooks')
save_dir = os.path.abspath(os.path.join(os.curdir, 'all-it-ebooks'))
print('Crawling in Progress ......\n')
for i, url in enumerate(urls[1:]):
page = requests.Session()
page.mount(url, HTTPAdapter(max_retries=5))
page = page.get(url)
tree = html.fromstring(page.content)
down_link = tree.xpath("//*[@class=\"download-links\"]/a/@href")
file_name = down_link[0].split('/')[-1]
title = re.sub('[^A-Za-z0-9]+', '-', file_name.split('.')[0])
down_dict[title] = down_link[0]
save_loc = os.path.join(save_dir, file_name)
#Starting to download the files
data = requests.get(down_link[0], stream=True)
if not os.path.exists(save_loc):
print('\nNow writing {0} - {1}'.format(i + 1, file_name))
with open(save_loc, 'wb') as f:
print(url)
for chunk in data.iter_content(chunk_size=4096):
if chunk:
f.write(chunk)
else:
print('\nFile Exists. Skipped {0} - {1}'.format(i + 1, file_name))
print('All Urls have been crawled and saved.')
print('\nWriting links to JSON file.\n')
with open('all-it-ebooks-download-links.json', 'w') as fp:
json.dump(down_dict, fp, indent=4)
print('\nWriting Complete')
In [ ]: