In [3]:
import pandas as pd
In [1]:
# Import in each cell because the kernel has to be restarted between crawler runs
# (the Twisted reactor that Scrapy starts cannot be restarted in the same process).
import scrapy
import re
from scrapy.crawler import CrawlerProcess
class ADSSpider(scrapy.Spider):
    # Naming the spider is important if you are running more than one spider
    # of this class simultaneously.
    name = "ADS"

    # URL(s) to start with.
    start_urls = [
        'https://www.gumtree.com/phones',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        # Iterate over every <article> element on the page.
        for article in response.xpath('//article'):
            # Yield a dictionary with the values we want. The XPaths are
            # relative to each article (they start with './/'), so each
            # listing yields only its own name, price, and description.
            yield {
                'name': article.xpath('.//a/div/h2/text()').extract_first(),
                'price': article.xpath('.//a/div/span/strong/text()').extract_first(),
                'description': article.xpath('.//a/div/p/text()').extract_first()
            }

        # Get the (relative) URL of the next page.
        next_page = response.xpath('//li[@class="pagination-page"]/a/@href').extract_first()

        # There are a LOT of pages here. For our example, we'll just scrape the
        # first 9. The block below finds the page number in the link and stops
        # the crawl from going beyond page 9.
        if next_page is not None:
            pagenum = int(re.findall(r'\d+', next_page)[0])
            print(pagenum)
            if pagenum < 10:
                # Request the next page and recursively parse it the same way
                # we did above.
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
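# Optional: a minimal sketch, assuming the 'gumtree.json' feed name used below.
# Older Scrapy versions append to an existing feed file rather than overwriting
# it, which can leave the JSON malformed after repeated runs, so remove any
# previous output first.
import os
if os.path.exists('gumtree.json'):
    os.remove('gumtree.json')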
# Tell the script how to run the crawler by passing in settings. Beyond the
# output format, these settings cover scraping etiquette: respecting
# robots.txt, throttling requests, and caching responses.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',       # Store data in JSON format.
    'FEED_URI': 'gumtree.json',  # Name our storage file.
    'LOG_ENABLED': False,        # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,      # Respect the site's robots.txt rules.
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,  # Slow requests down politely based on server load.
    'HTTPCACHE_ENABLED': True      # Cache responses locally to avoid re-downloading.
})
# Start the crawler with our spider.
process.crawl(ADSSpider)
process.start()
print('Success!')
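In [ ]:
# A quick sanity check (a minimal sketch, assuming the crawl above wrote a
# single valid JSON array to 'gumtree.json'): load the feed and count the
# listings collected.
import json
with open('gumtree.json') as f:
    items = json.load(f)
print(len(items), 'listings scraped')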
In [4]:
# Read the JSON feed produced by the crawler into a dataframe
# (one row per scraped listing).
gumtree = pd.read_json('gumtree.json', orient='records')

# Rename and reorder the columns for readability.
items_df = gumtree.rename(columns={'name': 'Names',
                                   'description': 'Descriptions',
                                   'price': 'Prices (£)'})
items_df = items_df[['Names', 'Descriptions', 'Prices (£)']]

# Clean the data: strip newlines and the currency symbol.
items_df = items_df.replace('\n', '', regex=True)
items_df = items_df.replace('£', '', regex=True)

# Print the shape and the first five rows of the dataframe.
print(items_df.shape)
items_df.head()
Out[4]:
In [ ]:
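# A possible next step (a minimal sketch, assuming the cleaned prices are
# plain digit strings, possibly with thousands separators such as '1,200'):
# strip the separators and coerce the price column to numbers so it can be
# summarised.
items_df['Prices (£)'] = pd.to_numeric(
    items_df['Prices (£)'].str.replace(',', '', regex=False),
    errors='coerce')
print(items_df['Prices (£)'].describe())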