In [2]:
from scrapy.spiders import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import re

In [ ]:
# url string components for reviewer pages
URL_BASE = 'http://www.bringfido.com/'

# yelp unique url endings for each restaurant
CITIES = ['new_haven_ct_us',
         ]

def createRestaurantPageLinks(self, response):
   reviewsPerPage = 40
   hxs = HtmlXPathSelector(response)
   totalReviews = int(hxs.select('//h2[@id="total_reviews"]/text()').extract()[0].strip().split(' ')[0])
   pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), \
                    callback=self.parse) \
            for n in range(totalReviews/reviewsPerPage)]
   return pages

def createReviewerPageLinks(self, response):
   reviewsPerPage = 10
   hxs = HtmlXPathSelector(response)
   totalReviews = int(hxs.select('//div[@id="review_lister_header"]/em/text()').extract()[0].split(' ')[0])
   pages = [Request(url=response.url + '&rec_pagestart=' + str(reviewsPerPage*(n+1)), \
                    callback=self.parseReviewer) \
            for n in range(totalReviews/reviewsPerPage)]
   return pages

class HotelSpider(BaseSpider):
   name = 'crawlHotels'
   allowed_domains = ['bringfido.com']
   start_urls = [ 'http://http://www.bringfido.com/lodging/city/{}'.format(s) for s in CITIES]

   # default parse used for the landing page for each start_url
   def parse(self, response):
      requests = []

      # extract all reviews from the page and return a list of requests for the 5 star reviewers' profiles
      hxs = HtmlXPathSelector(response)
      userIDs = [userUrl.split('?userid=')[1] for \
                 userUrl in hxs.select('//li[@class="user-name"]/a/@href').extract()]
      ratings = hxs.select('//div[@id="reviews-other"]//meta[@itemprop="ratingValue"]/@content').extract()
   
      for i in range(len(ratings)):
         if float(ratings[i]) == 5:
            requests.append(Request(url=URL_BASE + userIDs[i] + FILTER_SETTINGS, \
                                    callback=self.parseReviewer))
   
      # request additional pages if we are on page 1 of the restaurant
      if response.url.find('?start=') == -1:
         requests += createRestaurantPageLinks(self, response)

      return requests
      
   # parse a given reviewer
   def parseReviewer(self, response):
      hxs = HtmlXPathSelector(response)
      restaurantUrls = hxs.select('//div[@class="review clearfix"]/ \
                                  div[@class="biz_info"]/h4/a/@href').extract()
      restaurants = [re.search(r'(?<=/biz/)[^#]*', rest).group() for rest in restaurantUrls]
      reviewerName = hxs.select('//title/text()').extract()[0].split('|')[0].replace('\'s Profile','').strip()
      reviewerUserID = re.search(r'(?<=userid=)[^&]*', response.url).group()
      ratingText = hxs.select('//div[@class="rating"]/i/@title').extract()
      ratings = [s.replace(' star rating','') for s in ratingText]

      reviews = []
      for i in range(len(restaurants)):
         review = Review()
         review['restaurant'] = restaurants[i]
         review['reviewerName'] = reviewerName
         review['reviewerUserID'] = reviewerUserID
         review['rating'] = float(ratings[i])
         reviews.append(review)

      # request additional pages if we are on page 1 of the reviewer
      additionalPages = []
      if response.url.find('&rec_pagestart=') == -1:
         additionalPages = createReviewerPageLinks(self, response)

      return reviews + additionalPages