In [23]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
# Display the result of EVERY top-level expression in a cell,
# not just the last one (default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

# Show Python version
import platform
# Bare last expression -> rendered as the cell's Out[] value.
platform.python_version()


Out[23]:
'3.6.2'

In [24]:
import scrapy

In [25]:
import json

class JsonWriterPipeline(object):
    """Item pipeline that appends each scraped item to a JSON Lines file.

    Registered through the spider's ITEM_PIPELINES setting; Scrapy calls
    open_spider/close_spider around the crawl and process_item once per
    scraped item.
    """

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.file = open('quote-results.jl', 'w')

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        self.file.close()

    def process_item(self, item, spider):
        # Serialize the item as one JSON object per line (JSON Lines).
        line = json.dumps(dict(item)) + '\n'
        self.file.write(line)
        # BUG FIX: return the item itself, not the serialized string.
        # Scrapy's pipeline contract passes this return value to the next
        # pipeline in ITEM_PIPELINES; returning `line` would hand later
        # stages (e.g. the JSON feed export) a str instead of the item.
        return item

In [26]:
import logging

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quote text, author and tags from the first
    two pages of quotes.toscrape.com."""

    name = 'quotes'

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/'
    ]

    # Per-spider settings override the project/process-level ones.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json', # Used for pipeline 2
        'FEED_URI': 'quote-results.json' # Used for pipeline 2
    }

    def parse(self, response):
        # Emit one dict per quote block found on the page.
        yield from (
            {
                'text': block.css('span.text::text').extract_first(),
                'author': block.css('span small::text').extract_first(),
                'tags': block.css('div.tags a.tag::text').extract(),
            }
            for block in response.css('div.quote')
        )

In [27]:
from scrapy.crawler import CrawlerProcess

# Run the spider in-process with a custom User-Agent.
# NOTE(review): Twisted's reactor cannot be restarted once stopped, so
# re-executing this cell in the same kernel raises ReactorNotRestartable
# (visible in the traceback below). Restart the kernel (Restart & Run All)
# before running this cell again.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(QuotesSpider)
# Blocking call: returns only after the crawl has finished.
process.start()


2017-09-19 14:25:53 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-09-19 14:25:53 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
Out[27]:
<Deferred at 0x10950b2e8>
---------------------------------------------------------------------------
ReactorNotRestartable                     Traceback (most recent call last)
<ipython-input-27-2c3320ffd620> in <module>()
      6 
      7 process.crawl(QuotesSpider)
----> 8 process.start()

/usr/local/lib/python3.6/site-packages/scrapy/crawler.py in start(self, stop_after_crawl)
    283         tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    284         reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
--> 285         reactor.run(installSignalHandlers=False)  # blocking call
    286 
    287     def _get_dns_resolver(self):

/usr/local/lib/python3.6/site-packages/twisted/internet/base.py in run(self, installSignalHandlers)
   1240 
   1241     def run(self, installSignalHandlers=True):
-> 1242         self.startRunning(installSignalHandlers=installSignalHandlers)
   1243         self.mainLoop()
   1244 

/usr/local/lib/python3.6/site-packages/twisted/internet/base.py in startRunning(self, installSignalHandlers)
   1220         """
   1221         self._installSignalHandlers = installSignalHandlers
-> 1222         ReactorBase.startRunning(self)
   1223 
   1224 

/usr/local/lib/python3.6/site-packages/twisted/internet/base.py in startRunning(self)
    728             raise error.ReactorAlreadyRunning()
    729         if self._startedBefore:
--> 730             raise error.ReactorNotRestartable()
    731         self._started = True
    732         self._stopped = False

ReactorNotRestartable: 

In [28]:
!ls quote-results.*


quote-results.jl   quote-results.json

In [29]:
!tail -n 2 quote-results.jl

In [30]:
!tail -n 2 quote-results.json

In [ ]: