In [3]:
import pandas as pd

In [1]:
# Importing in each cell because the kernel has to be restarted between
# crawls (the Twisted reactor that CrawlerProcess starts can't be restarted).
import scrapy
import re
from scrapy.crawler import CrawlerProcess


class ADSSpider(scrapy.Spider):
    # Every spider needs a name; it matters especially if you run more
    # than one spider in the same process.
    name = "ADS"
    
    # URL(s) to start with.
    start_urls = [
        'https://www.gumtree.com/phones',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):

        # These XPaths are absolute, so a single yield collects the name,
        # price, and description of every listing on the results page as
        # parallel lists.
        yield {
            'name': response.xpath('//*[@id="srp-results"]/div/div/div/ul/li/article/a/div/h2/text()').extract(),
            'price': response.xpath('//*[@id="srp-results"]/div/div/div/ul/li/article/a/div/span/strong/text()').extract(),
            'description': response.xpath('//*[@id="srp-results"]/div/div/div/ul/li/article/a/div/p/text()').extract()
        }

        # Get the URL of the next page. extract_first() returns a single
        # string (or None), where extract() would return a list.
        next_page = response.xpath('//li[@class="pagination-page"]/a/@href').extract_first()

        # There are a LOT of pages here. For our example, we'll just scrape
        # the first 9, so pull the page number out of the link and stop
        # once we reach it.
        if next_page is not None:
            pagenum = int(re.findall(r'\d+', next_page)[0])
            if pagenum < 10:
                # Request the next page and recursively parse it the same way.
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
            
# Tell the script how to run the crawler by passing in settings. Most of
# these settings are about scraping etiquette: identify ourselves, respect
# robots.txt, throttle our requests, and cache pages locally.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',         # Store data in JSON format.
    'FEED_URI': 'gumtree.json',       # Name our storage file.
    'LOG_ENABLED': False,          # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,        # Respect the site's robots.txt rules.
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',  # Identify ourselves.
    'AUTOTHROTTLE_ENABLED': True,  # Throttle requests based on server load.
    'HTTPCACHE_ENABLED': True      # Cache pages so reruns don't re-download them.
})

# Start the crawler with our spider.
process.crawl(ADSSpider)
process.start()
print('Success!')


Success!
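
Because the spider's XPaths are absolute, each scraped record holds one whole page of listings as parallel lists. An alternative is to yield one item per listing by looping over the article elements with XPaths relative to each one. A minimal sketch of that variant, assuming Gumtree's markup matches the absolute selectors above (the `.//` paths are untested assumptions derived from them):

In [ ]:
import scrapy


class ADSItemSpider(scrapy.Spider):
    # Hypothetical variant of ADSSpider that yields one item per listing.
    name = "ADS_items"
    start_urls = ['https://www.gumtree.com/phones']

    def parse(self, response):
        # XPaths starting with ".//" are evaluated relative to each
        # article element, so each yield is a single listing.
        for article in response.xpath('//*[@id="srp-results"]//article'):
            yield {
                'name': article.xpath('.//a/div/h2/text()').extract_first(),
                'price': article.xpath('.//a/div/span/strong/text()').extract_first(),
                'description': article.xpath('.//a/div/p/text()').extract_first()
            }

With one record per listing, the JSON output would already be tidy, so the pd.Series reshaping in the next cell wouldn't be needed.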

In [4]:
# Read the JSON file the crawler produced.
gumtree = pd.read_json('gumtree.json', orient='records')

# Each record holds one page's listings as parallel lists, so take the
# first page and turn each field into a series.
names = pd.Series(gumtree['name'][0], name='Names')
descriptions = pd.Series(gumtree['description'][0], name='Descriptions')
prices = pd.Series(gumtree['price'][0], name='Prices (£)')

# Concatenate the series into a dataframe.
items_df = pd.concat([names, descriptions, prices], axis=1)

# Clean the data: strip newlines and pound signs.
items_df = items_df.replace('\n', '', regex=True)
items_df = items_df.replace('£', '', regex=True)

# Print the shape and the first five rows of the dataframe.
print(items_df.shape)
items_df.head()


(30, 3)
Out[4]:
Names Descriptions Prices (£)
0 IPHONE X 64GB SILVER EE - SEALED* FACTORY SEALED BRAND NEW APPLE IPHONE X 64GB S... 800
1 IPHONE 8. BRAND NEW iPhone 8 - 256gb - Space Grey. Brand new in se... 600
2 Apple iphone 6 16GB GOLD OR BLACK unlocked phone GRADE A , MINT CONDITION 160,\rGRADE B SOME MI... 150
3 NEW Sealed Samsung Galaxy S8 64gb & Plus Midni... NO FRAUD - No PayPal/Bank Transfers or Postage... 490
4 Samsung S8 plus -- Read the description before... .++++++++++++++++++++++++++ Fraud Alert! +++++... 310
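
The Prices column still holds strings after the cleaning above. To compute with the prices, a short follow-up (a sketch, using the items_df built above) can coerce them to numbers:

In [ ]:
# Convert the cleaned price strings to numbers; anything that doesn't
# parse (e.g. free text in the price field) becomes NaN instead of
# raising an error.
items_df['Prices (£)'] = pd.to_numeric(items_df['Prices (£)'], errors='coerce')
print(items_df['Prices (£)'].describe())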
