In [1]:
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher  # removed in newer Scrapy; use `from pydispatch import dispatcher` there
from multiprocessing import Queue
import scrapy
import multiprocessing
import urllib
from scrapy import Request
In [2]:
class CrawlerWorker(multiprocessing.Process):
    """Run a Scrapy crawl in a child process and hand the scraped items
    back to the parent through a multiprocessing queue."""

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(get_project_settings())
        # leftovers from the old Scrapy 0.x Crawler API, not needed with CrawlerProcess:
        #if not hasattr(project, 'crawler'):
        #    self.crawler.install()
        #self.crawler.configure()
        self.items = []
        self.spider = spider
        # collect every scraped item (item_passed is an old alias of item_scraped)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        # runs in the child process: crawl, block until the crawl finishes,
        # then return the collected items to the parent via the queue
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
In [3]:
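# FreeozLinkSpider is referenced below but not defined in this notebook.
# A minimal sketch of what such a link-collecting spider might look like;
# the class body, start URL and 'link' field are assumptions, kept only so
# the later cells read coherently (the real spider writes canberra.link).
class FreeozLinkSpider(scrapy.Spider):
    name = 'freeoz_link'
    start_urls = ['http://example.com/forum/']  # placeholder for the real forum index

    def parse(self, response):
        # append every absolute link on the page to the local file and
        # also yield it as an item so CrawlerWorker can collect it
        for href in response.css('a::attr(href)').extract():
            url = response.urljoin(href)
            with open('canberra.link', 'a') as f:
                f.write(url + '\n')
            yield {'link': url}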
In [4]:
def main():
    result_queue = Queue()
    crawler = CrawlerWorker(FreeozLinkSpider(), result_queue)
    crawler.start()
    # the spider already writes its output to a local file, so the queue is not
    # drained here; see the sketch below for collecting the items in the parent
    #for item in result_queue.get():
    #    print(item)
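# If the scraped items are wanted back in the parent process rather than only
# in the file the spider writes, drain the queue before joining the worker;
# getting before joining avoids a deadlock while the child still holds
# buffered queue items. A sketch of an alternative main under that assumption:
def main_collect_items():
    result_queue = Queue()
    crawler = CrawlerWorker(FreeozLinkSpider(), result_queue)
    crawler.start()
    items = result_queue.get()   # blocks until the worker puts its item list
    crawler.join()               # then wait for the child process to exit
    for item in items:
        print(item)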
In [5]:
if __name__ == '__main__':
    main()
In [6]:
# Once all the links have been written to the local file, the downloads can be
# run from the shell with GNU parallel and wget:
#   time cat canberra.link | parallel wget -P ./canberra {}
# which is fast, roughly 200 files per minute. A pure-Python alternative is
# sketched below.
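# A pure-Python alternative to the parallel/wget pipeline above (in the spirit
# of the otherwise unused urllib import in the first cell): fetch every URL
# listed in canberra.link into ./canberra with a thread pool. The pool size is
# an assumption; tune it to the available bandwidth. Python 3 shown; on
# Python 2 use urllib.urlretrieve instead of urllib.request.urlretrieve.
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve

def fetch(url):
    # derive a file name from the last path segment, falling back to index.html
    name = url.rstrip('/').split('/')[-1] or 'index.html'
    try:
        urlretrieve(url, os.path.join('canberra', name))
    except Exception as exc:
        print('failed %s: %s' % (url, exc))

os.makedirs('canberra', exist_ok=True)
with open('canberra.link') as f:
    urls = [line.strip() for line in f if line.strip()]
with ThreadPoolExecutor(max_workers=16) as pool:
    pool.map(fetch, urls)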