In [1]:
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
from scrapy.xlib.pydispatch import dispatcher  # removed in newer Scrapy; use `from pydispatch import dispatcher` there
from multiprocessing import Queue
import scrapy
import multiprocessing
import urllib
from scrapy import Request
In [2]:
class CrawlerWorker(multiprocessing.Process):
    """Run a Scrapy crawl in a child process and hand the scraped items
    back to the parent through a multiprocessing queue."""

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(get_project_settings())
        # leftovers from the old Scrapy 0.x Crawler API, not needed with CrawlerProcess:
        #if not hasattr(project, 'crawler'):
        #    self.crawler.install()
        #self.crawler.configure()
        self.items = []
        self.spider = spider
        # collect every scraped item (item_passed is an old alias of item_scraped)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        # runs in the child process: crawl, block until the crawl finishes,
        # then return the collected items to the parent via the queue
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
In [3]:
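# FreeozLinkSpider is referenced below but not defined in this notebook.
# A minimal sketch of what such a link-collecting spider might look like;
# the class body, start URL and 'link' field are assumptions, kept only so
# the later cells read coherently (the real spider writes canberra.link).
class FreeozLinkSpider(scrapy.Spider):
    name = 'freeoz_link'
    start_urls = ['http://example.com/forum/']  # placeholder for the real forum index

    def parse(self, response):
        # append every absolute link on the page to the local file and
        # also yield it as an item so CrawlerWorker can collect it
        for href in response.css('a::attr(href)').extract():
            url = response.urljoin(href)
            with open('canberra.link', 'a') as f:
                f.write(url + '\n')
            yield {'link': url}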
In [4]:
def main():
    result_queue = Queue()
    crawler = CrawlerWorker(FreeozLinkSpider(), result_queue)
    crawler.start()
    # the spider already writes its output to a local file, so the queue is not
    # drained here; see the sketch below for collecting the items in the parent
    #for item in result_queue.get():
    #    print(item)
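# If the scraped items are wanted back in the parent process rather than only
# in the file the spider writes, drain the queue before joining the worker;
# getting before joining avoids a deadlock while the child still holds
# buffered queue items. A sketch of an alternative main under that assumption:
def main_collect_items():
    result_queue = Queue()
    crawler = CrawlerWorker(FreeozLinkSpider(), result_queue)
    crawler.start()
    items = result_queue.get()   # blocks until the worker puts its item list
    crawler.join()               # then wait for the child process to exit
    for item in items:
        print(item)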
In [5]:
if __name__ == '__main__':
    main()
In [6]:
# Once all the links have been written to the local file, the downloads can be
# run from the shell with GNU parallel and wget:
#   time cat canberra.link | parallel wget -P ./canberra {}
# which is fast, roughly 200 files per minute. A pure-Python alternative is
# sketched below.
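# A pure-Python alternative to the parallel/wget pipeline above (in the spirit
# of the otherwise unused urllib import in the first cell): fetch every URL
# listed in canberra.link into ./canberra with a thread pool. The pool size is
# an assumption; tune it to the available bandwidth. Python 3 shown; on
# Python 2 use urllib.urlretrieve instead of urllib.request.urlretrieve.
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve

def fetch(url):
    # derive a file name from the last path segment, falling back to index.html
    name = url.rstrip('/').split('/')[-1] or 'index.html'
    try:
        urlretrieve(url, os.path.join('canberra', name))
    except Exception as exc:
        print('failed %s: %s' % (url, exc))

os.makedirs('canberra', exist_ok=True)
with open('canberra.link') as f:
    urls = [line.strip() for line in f if line.strip()]
with ThreadPoolExecutor(max_workers=16) as pool:
    pool.map(fetch, urls)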