In [3]:
import pandas as pd
In [1]:
# Import in each cell because the kernel has to be restarted between crawler runs
# (the Twisted reactor that Scrapy starts cannot be restarted in the same process).
import scrapy
import re
from scrapy.crawler import CrawlerProcess
class ADSSpider(scrapy.Spider):
    # Naming the spider is important if you are running more than one spider
    # of this class simultaneously.
    name = "ADS"

    # URL(s) to start with.
    start_urls = [
        'https://www.gumtree.com/phones',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        # Iterate over every <article> element on the page.
        for article in response.xpath('//article'):
            # Yield a dictionary with the values we want. The XPaths are
            # relative to each article (they start with './/'), so each
            # listing yields only its own name, price, and description.
            yield {
                'name': article.xpath('.//a/div/h2/text()').extract_first(),
                'price': article.xpath('.//a/div/span/strong/text()').extract_first(),
                'description': article.xpath('.//a/div/p/text()').extract_first()
            }

        # Get the (relative) URL of the next page.
        next_page = response.xpath('//li[@class="pagination-page"]/a/@href').extract_first()

        # There are a LOT of pages here. For our example, we'll just scrape the
        # first 9. The block below finds the page number in the link and stops
        # the crawl from going beyond page 9.
        if next_page is not None:
            pagenum = int(re.findall(r'\d+', next_page)[0])
            print(pagenum)
            if pagenum < 10:
                # Request the next page and recursively parse it the same way
                # we did above.
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
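# Optional: a minimal sketch, assuming the 'gumtree.json' feed name used below.
# Older Scrapy versions append to an existing feed file rather than overwriting
# it, which can leave the JSON malformed after repeated runs, so remove any
# previous output first.
import os
if os.path.exists('gumtree.json'):
    os.remove('gumtree.json')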
# Tell the script how to run the crawler by passing in settings. Beyond the
# output format, these settings cover scraping etiquette: respecting
# robots.txt, throttling requests, and caching responses.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',       # Store data in JSON format.
    'FEED_URI': 'gumtree.json',  # Name our storage file.
    'LOG_ENABLED': False,        # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,      # Respect the site's robots.txt rules.
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,  # Slow requests down politely based on server load.
    'HTTPCACHE_ENABLED': True      # Cache responses locally to avoid re-downloading.
})
# Start the crawler with our spider.
process.crawl(ADSSpider)
process.start()
print('Success!')
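In [ ]:
# A quick sanity check (a minimal sketch, assuming the crawl above wrote a
# single valid JSON array to 'gumtree.json'): load the feed and count the
# listings collected.
import json
with open('gumtree.json') as f:
    items = json.load(f)
print(len(items), 'listings scraped')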
In [4]:
# Read the JSON feed produced by the crawler into a dataframe
# (one row per scraped listing).
gumtree = pd.read_json('gumtree.json', orient='records')

# Rename and reorder the columns for readability.
items_df = gumtree.rename(columns={'name': 'Names',
                                   'description': 'Descriptions',
                                   'price': 'Prices (£)'})
items_df = items_df[['Names', 'Descriptions', 'Prices (£)']]

# Clean the data: strip newlines and the currency symbol.
items_df = items_df.replace('\n', '', regex=True)
items_df = items_df.replace('£', '', regex=True)

# Print the shape and the first five rows of the dataframe.
print(items_df.shape)
items_df.head()
Out[4]:
In [ ]:
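# A possible next step (a minimal sketch, assuming the cleaned prices are
# plain digit strings, possibly with thousands separators such as '1,200'):
# strip the separators and coerce the price column to numbers so it can be
# summarised.
items_df['Prices (£)'] = pd.to_numeric(
    items_df['Prices (£)'].str.replace(',', '', regex=False),
    errors='coerce')
print(items_df['Prices (£)'].describe())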