drive_bf_qt_scraping

A notebook to describing the code used to drive the scraping.



In [1]:

    
import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *  
from lxml import html
import pandas as pd
import numpy as np
import time



In [2]:

    
import sys
from PyQt4 import QtGui, QtCore, QtWebKit 

class MainForm(QtGui.QMainWindow):
    def __init__(self, parent=None):
        super(MainForm, self).__init__(parent)

        self.pages = ['http://www.google.com', 'http://www.citrix.com', 'http://yahoo.com', 'http://reddit.com']
        self.index = 0

        self.view = QtWebKit.QWebView()  
        self.view.connect(self.view, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
        self.setCentralWidget(self.view)

        self.view.load(QtCore.QUrl(self.pages[self.index]))

    def loadFinished(self, ok):
        self.index += 1
        if self.index < len(self.pages) :
            self.view.load(QtCore.QUrl(self.pages[self.index]))
        else:
            print 'done'

def main():
    app = QtGui.QApplication(sys.argv)
    form = MainForm()
    form.show()
    app.exec_()



In [3]:

    
class Render(QWebPage):  
    def __init__(self, url):  
        self.app = QApplication(sys.argv)  
        QWebPage.__init__(self)  
        self.loadFinished.connect(self._loadFinished)  
        self.mainFrame().load(QUrl(url))  
        self.app.exec_()  

    def _loadFinished(self, result):  
        self.frame = self.mainFrame()  
        self.app.quit() 
    
    def update_url(self, url):
        self.mainFrame().load(QUrl(url))  
        self.app.exec_()



In [11]:

    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0



In [ ]:

First get the hotel data links and text summaries



In [4]:

    
url = 'http://www.bringfido.com/lodging/city/phoenix_az_us/'



In [16]:

    
ff = webdriver.Firefox()
ff.get(url)
ff.find_element_by_xpath("//div[@class='photo']")

try:
    element = WebDriverWait(ff, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "photo_inner")))
finally:
    archive_links = ff.find_element_by_xpath('//*[@id="results_list"]/div')
    ff.quit()



In [5]:

    
#This does the magic.Loads everything
r = Render(url)  
#result is a QString.
result = r.frame.toHtml()



In [5]:

    
r.update_url(url)



In [6]:

    
#QString should be converted to string before processed by lxml
formatted_result = str(result.toAscii())



In [7]:

    
#formatted_result



In [7]:

    
#Next build lxml tree from formatted_result
tree = html.fromstring(formatted_result)



In [8]:

    
#Now using correct Xpath we are fetching URL of archives
archive_links = tree.xpath('//*[@id="results_list"]/div')



In [9]:

    
hotel_names = []
text_summaries = []
links = []

for lnk in archive_links:
    hotel_names.append(lnk.xpath('div[2]/h1/a/text()')[0])
    text_summaries.append(lnk.text_content())
    links.append(lnk.xpath('div/h1/a/@href')[0])
    # print(lnk.text_content())
    # print('*'*25)



In [10]:

    
tree.xpath('//div[contains(@id, "results_list")]')









    Out[10]:





[<Element div at 0x10ca10f18>, <Element div at 0x10ca10f70>]



In [14]:

    
res = browser.find_element_by_xpath('//div[contains(@id, "results_list")]/div')



In [14]:

    
hotel_names









    Out[14]:





['Pointe Hilton Squaw Peak Resort',
 'La Quinta Inn Phoenix Arcadia',
 'La Quinta Inn Phoenix North',
 'La Quinta Inn Phoenix Thomas Road',
 'Crossland Economy Studios Phoenix Metro',
 'House, 1 Bedroom, Sleeps 4, 1 Bathroom',
 'Arizona Biltmore Waldorf Astoria',
 '2-Blocks from South Mountain Park',
 'Sheraton Crescent Hotel',
 'Best Western InnSuites Phoenix Hotel & Suites',
 'Sheraton Phoenix Downtown Hotel',
 'Embassy Suites Phoenix Airport West',
 'Condo, 1 Bedroom, Sleeps 2, 1 Bathroom',
 'Pointe Hilton Tapatio Cliffs Resort',
 'Royal Palms Resort & Spa']



In [13]:

    
hotel_names = ['Pointe Hilton Squaw Peak Resort',
 'La Quinta Inn Phoenix Arcadia',
 'La Quinta Inn Phoenix North',
 'La Quinta Inn Phoenix Thomas Road',
 'Crossland Economy Studios Phoenix Metro',
 'House, 1 Bedroom, Sleeps 4, 1 Bathroom',
 'Arizona Biltmore Waldorf Astoria',
 '2-Blocks from South Mountain Park',
 'Sheraton Crescent Hotel',
 'Best Western InnSuites Phoenix Hotel & Suites',
 'Sheraton Phoenix Downtown Hotel',
 'Embassy Suites Phoenix Airport West',
 'Condo, 1 Bedroom, Sleeps 2, 1 Bathroom',
 'Pointe Hilton Tapatio Cliffs Resort',
 'Royal Palms Resort & Spa']



In [11]:

    
links









    Out[11]:





['/lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
 '/lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
 '/lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62',
 '/lodging/64836/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
 '/lodging/64824/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=44.99',
 '/lodging/132841/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=53',
 '/lodging/64763/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=329',
 '/lodging/130968/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=66',
 '/lodging/64862/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=159',
 '/lodging/64769/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.1',
 '/lodging/64863/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=319',
 '/lodging/64794/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.2',
 '/lodging/132840/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=30',
 '/lodging/64847/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
 '/lodging/64861/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=134.78']



In [15]:

    
links = ['/lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
         '/lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
         '/lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62',
         '/lodging/64836/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
         '/lodging/64824/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=44.99',
         '/lodging/132841/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=53',
         '/lodging/64763/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=329',
         '/lodging/130968/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=66',
         '/lodging/64862/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=159',
         '/lodging/64769/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.1',
         '/lodging/64863/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=319',
         '/lodging/64794/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.2',
         '/lodging/132840/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=30',
         '/lodging/64847/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
         '/lodging/64861/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=134.78']



In [11]:

    
text_summaries









    Out[11]:





['Pointe Hilton Squaw Peak ResortPhoenix, AZ, USPointe Hilton Squaw Peak Resort welcomes two pets, under 75 lbs, for an additional $75 per stay.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$199 + pet feeCHECK RATES',
 'La Quinta Inn Phoenix ArcadiaPhoenix, AZ, USLa Quinta Inn Phoenix Arcadia is dog friendly. Two pets of any size are welcome for no additional fee. All rooms are pet friendly.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$64 (no pet fee)VISIT WEBSITE',
 'La Quinta Inn Phoenix NorthPhoenix, AZ, USLa Quinta Inn Phoenix North is dog friendly. Two pets of any size are welcome for no additional fee. All rooms are pet friendly.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$62 (no pet fee)VISIT WEBSITE',
 'La Quinta Inn Phoenix Thomas RoadPhoenix, AZ, USLa Quinta Inn Phoenix Thomas Road is dog friendly. Two pets of any size are welcome for no additional fee. All rooms are pet friendly.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$64 (no pet fee)VISIT WEBSITE',
 'Crossland Economy Studios Phoenix MetroPhoenix, AZ, USCrossland Economy Studios Phoenix Metro is pet friendly! Two cats or dogs of any size are welcome for an additional fee of $25 per pet per night, not to exceed $150 per room. Well-behaved pets are ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$45 + pet feeCHECK RATES',
 'House, 1 Bedroom, Sleeps 4, 1 BathroomPhoenix, AZ, USPets are allowed with prior approval from the homeowner only. Please inquire for information on pet fee and other restrictions.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$53 + pet feeVISIT WEBSITE',
 'Arizona Biltmore Waldorf AstoriaPhoenix, AZ, USArizona Biltmore Resort & Spa is pet friendly! Up to two pets under 100 lbs are allowed per room for a $100 flat fee per stay. Please note that there is a limited number of pet ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$329 + pet feeCHECK RATES',
 '2-Blocks from South Mountain ParkPhoenix, AZ, USPets are allowed with prior approval from the homeowner only. Please inquire for information on pet fee and other restrictions. Large, fenced-in lot available for your pet to run and enjoy.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$66 + pet feeVISIT WEBSITE',
 'Sheraton Crescent HotelPhoenix, AZ, USThe Sheraton Crescent Hotel is pet friendly! Up to two pets (50 lbs or less) are welcome for no additional fee. Guests traveling with pets are required to sign a pet waiver upon check-in.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$159 (no pet fee)CHECK RATES',
 'Best Western InnSuites Phoenix Hotel & SuitesPhoenix, AZ, USBest Western Innsuites Hotel Phoenix is pet friendly! Two dogs (up to 80 lbs) are welcome in select rooms WITH TWO QUEEN BEDS for an additional fee of $20 per stay. Pet friendly rooms are limited. ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$71 + pet feeVISIT WEBSITE',
 'Sheraton Phoenix Downtown HotelPhoenix, AZ, USThe Sheraton is pet friendly. They welcome up to two DOGS only, (max weight 80 lbs) They do not charge a pet fee, but dogs may not be left alone in the room.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$319 + pet feeCHECK RATES',
 'Embassy Suites Phoenix Airport WestPhoenix, AZ, USThe Embassy Suites Phoenix - Airport West welcomes multiple pets, under 50 lbs, for an additional $25 per room, per night.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$71 + pet feeCHECK RATES',
 'Condo, 1 Bedroom, Sleeps 2, 1 BathroomPhoenix, AZ, USPets are allowed with prior approval from the homeowner only. Please inquire for information on pet fee and other restrictions.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$30 + pet feeVISIT WEBSITE',
 'Pointe Hilton Tapatio Cliffs ResortPhoenix, AZ, USPointe Hilton Tapatio Cliffs Resort welcomes a maximum of two pets, 75 lbs or less, per guest room for an additional $75 per stay. Additional pets may be accepted with approval prior to booking.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$199 + pet feeCHECK RATES',
 'Royal Palms Resort & SpaPhoenix, AZ, USThe Royal Palms Resort and Spa is pet friendly! Up to two dogs (with a combined weight of 30 lbs or less) are welcome for an additional fee of $100 per stay along with a $150 ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$135 + pet feeCHECK RATES']



In [9]:

    
url_base = 'http://www.bringfido.com'  
r.update_url(url_base+links[1])



In [10]:

    
columns = ['hotel_id',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'hotel_address',
           'business_id',
           'review_id',
           'user_id',
           'review_text',
           'review_rating',
           'review_date']



In [11]:

    
df = pd.DataFrame(columns=columns)



In [12]:

    
df.head(1)









    Out[12]:






  
    
      
      hotel_id
      hotel_name
      hotel_address
      hotel_city
      hotel_state
      hotel_rating
      hotel_latitude
      hotel_longitude
      review_count
      hotel_address
      business_id
      review_id
      user_id
      review_text
      review_rating
      review_date



In [14]:

    
result = r.frame.toHtml()

#QString should be converted to string before processed by lxml
formatted_result = str(result.toAscii())

tree = html.fromstring(formatted_result) 

hotel_description = tree.xpath('//*[@class="body"]/text()')

# scrape the address details section of the page
details = tree.xpath('//*[@class="address"]/text()')

# now get just the address:
address = details[0]

# and just the city, state, country, and zip code:
csczip = details[1]

# and just the phone number
phone = details[2]

# outer_addresses.append(address)
# outer_csczip.append(csczip)
# outer_phones.append(phone)

#Now using correct Xpath we are fetching URL of archives
reviews = tree.xpath('//*[@class="review_container"]')

texts = []
titles = []
authors = []
ratings = []

print(reviews)
print('')
for rev in reviews:
    titles.append(rev.xpath('div/div[1]/text()')[0])
    authors.append(rev.xpath('div/div[2]/text()')[0])
    texts.append(rev.xpath('div/div[3]/text()')[0])
    ratings.append(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
    print(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])









    



[<Element div at 0x117b471b0>, <Element div at 0x117b47208>, <Element div at 0x117b47260>, <Element div at 0x117b472b8>, <Element div at 0x117b47310>, <Element div at 0x117b47368>]

5
5
5
4
3
1



In [15]:

    
city, state, zipcode = csczip.strip().split(',')
zipcode = zipcode[3:]



In [16]:

    
print(city)
print(state)
print(zipcode)









    



Phoenix
 AZ
 85018



In [17]:

    
texts









    Out[17]:





["We've stayed at this La Quinta several times with our two mini schnauzers. The staff is all very friendly. Rooms were clean but not overly fancy - we felt like we didn't need to worry leaving our dogs there for short periods while we went out to eat, etc. In a great location. Great breakfast too!",
 'This place was awesome!! The entire staff was extremely friendly and accommodating to us and our English Mastiff, Kadence. Dog area was well lit, clean and provided clean up bags. Continental Breakfast had a great selection. There was plenty of space in the room for our kennel and all of our luggage. I would recommend this hotel to dog owners and non- dog owners alike. If we ever pass through this way again this will definitely be the place we stay!! ',
 "We've stayed at this hotel two times with our three mini dachshunds. Nice grass area with picnic tables and doggie bags everywhere. Great place and very dog friendly.",
 "My room while a bit small was very clean and the beds were super comfy. The free breakfast was pretty standard for hotel breakfast with the exception of the biscuits and gravy which were delicious. The staff was extremely helpful and friendly. The location was great too. Within walking distance to a shopping mall that had practically everything in it. A median strip between 47/48th st that's built like a park/walkway was the perfect spot to take my dogs on a walk. There's also another park a few blocks south that's nice if you need to change things up a bit.\nThere was multiple wifi access points too so you can pick the one with the best signal strength. In the few other hotels I've been staying at on my trip they will have only one wifi router so if you have a crappy signal you'd have to move to another room or deal with it.\nFor the price I think this is one of the better places to stay.\n",
 'The big gentleman who manages the front desk is very creepy. He is rude and has very young girls working the front desk that seem to be very submissive towards him, eek! The thought of the rest!',
 "I was driving from So Cal to Houston, TX for the holidays and stayed at this La Quinta because I wanted an inexpensive, dog friendly hotel with a reputation for cleanliness and reliability. This place did not meet my standards at all. The front desk person was rude, unattentive and unhelful. Our room was a wreck. We ended up in a handicap room-which was dirty and the shower wand didn't work. I do NOT recommend this place at all. Yuck!"]



In [18]:

    
df = pd.DataFrame(columns=columns)



In [19]:

    
df['review_text'] = texts



In [21]:

    
df['hotel_id'] = 1



In [24]:

    
df









    Out[24]:






  
    
      
      hotel_id
      hotel_name
      hotel_address
      hotel_city
      hotel_state
      hotel_rating
      hotel_latitude
      hotel_longitude
      review_count
      hotel_address
      business_id
      review_id
      user_id
      review_text
      review_rating
      review_date
    
  
  
    
      0
      1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      We've stayed at this La Quinta several times w...
      NaN
      NaN
    
    
      1
      1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      This place was awesome!! The entire staff was ...
      NaN
      NaN
    
    
      2
      1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      We've stayed at this hotel two times with our ...
      NaN
      NaN
    
    
      3
      1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      My room while a bit small was very clean and t...
      NaN
      NaN
    
    
      4
      1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      The big gentleman who manages the front desk i...
      NaN
      NaN
    
    
      5
      1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      I was driving from So Cal to Houston, TX for t...
      NaN
      NaN

Now cycle through each hotel, and scrape the reviews, ratings, etc.



In [16]:

    
url_base = 'http://www.bringfido.com'  

columns = ['hotel_id',
           'hotel_url',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'hotel_address',
           'business_id',
           'review_id',
           'user_id',
           'username',
           'review_title',
           'review_text',
           'review_rating',
           'review_date']

bigdf = pd.DataFrame(columns=columns)


for hotel_id, link in enumerate(links[:3]):
    print('*'*50)
    print('Now on {}'.format(link))
    print('*'*50)
    r.update_url(url_base+link)  
    result = r.frame.toHtml()
    
    df = pd.DataFrame(columns=columns)

    #QString should be converted to string before processed by lxml
    formatted_result = str(result.toAscii())

    tree = html.fromstring(formatted_result) 

    hotel_description = tree.xpath('//*[@class="body"]/text()')

    # scrape the address details section of the page
    details = tree.xpath('//*[@class="address"]/text()')

    # now get just the address:
    address = details[0]
    
    # and just the city, state, country, and zip code:
    csczip = details[1]
    
    # and just the phone number
    phone = details[2]
    
    # now separate the city, state, and zip:
    city, state, zipcode = csczip.strip().split(',')
    zipcode = zipcode[3:]
    
    #Now using correct Xpath we are fetching URL of archives
    reviews = tree.xpath('//*[@class="review_container"]')

    texts = []
    titles = []
    authors = []
    ratings = []

    print(reviews)
    print('')
    for rev in reviews:
        titles.append(rev.xpath('div/div[1]/text()')[0])
        authors.append(rev.xpath('div/div[2]/text()')[0])
        texts.append(rev.xpath('div/div[3]/text()')[0])
        ratings.append(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
        print(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
    
    df['review_title'] = titles
    df['username'] = authors
    df['review_text'] = texts
    df['review_rating'] = ratings
    df['hotel_id'] = hotel_id
    df['hotel_name'] = hotel_names[hotel_id]
    df['hotel_url'] = url
    df['hotel_address'] = address
    df['hotel_city'] = city
    df['hotel_state'] = state
    df['hotel_rating'] = np.mean([int(rat) for rat in ratings])
    df['hotel_latitude'] = ''
    df['hotel_longitude'] = ''
    df['review_count'] = len(texts)
    df['review_id'] = 0
    df['user_id'] = 0
    
    bigdf = bigdf.append(df)









    



**************************************************
Now on /lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199
**************************************************
[<Element div at 0x11903ed08>]

5
**************************************************
Now on /lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64
**************************************************
[<Element div at 0x119009a48>, <Element div at 0x119009aa0>, <Element div at 0x119009af8>, <Element div at 0x119009b50>, <Element div at 0x119009ba8>, <Element div at 0x119009c00>]

5
5
5
4
3
1
**************************************************
Now on /lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62
**************************************************
[<Element div at 0x10ca10d60>]

2



In [13]:

    
df









    Out[13]:






  
    
      
      hotel_id
      hotel_url
      hotel_name
      hotel_address
      hotel_city
      hotel_state
      hotel_rating
      hotel_latitude
      hotel_longitude
      review_count
      hotel_address
      business_id
      review_id
      user_id
      username
      review_title
      review_text
      review_rating
      review_date
    
  
  
    
      0
      2
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix North
      \n2510 WGreenway Rd
      Phoenix
      AZ
      2
      
      
      1
      \n2510 WGreenway Rd
      NaN
      0
      0
      \nemma\nin park city\n
      no dog area
      the hotel was fine but it is all hard scape ar...
      2
      NaN



In [15]:

    
bigdf = bigdf.append(df)



In [17]:

    
bigdf









    Out[17]:






  
    
      
      hotel_id
      hotel_url
      hotel_name
      hotel_address
      hotel_city
      hotel_state
      hotel_rating
      hotel_latitude
      hotel_longitude
      review_count
      hotel_address
      business_id
      review_id
      user_id
      username
      review_title
      review_text
      review_rating
      review_date
    
  
  
    
      0
      0
      http://www.bringfido.com/lodging/city/phoenix_...
      Pointe Hilton Squaw Peak Resort
      \n7677 North 16th Street
      Phoenix
      AZ
      5.000000
      
      
      1
      \n7677 North 16th Street
      NaN
      0
      0
      \nMarti Ackermann\nin Tucson, AZ\n
      Pointe Hilton Squaw Peak
      Really nice property. Great walking areas for ...
      5
      NaN
    
    
      0
      1
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix Arcadia
      \n4727 E Thomas Rd
      Phoenix
      AZ
      3.833333
      
      
      6
      \n4727 E Thomas Rd
      NaN
      0
      0
      \nJen\nin Vancouver\n
      Great place to stay!
      We've stayed at this La Quinta several times w...
      5
      NaN
    
    
      1
      1
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix Arcadia
      \n4727 E Thomas Rd
      Phoenix
      AZ
      3.833333
      
      
      6
      \n4727 E Thomas Rd
      NaN
      0
      0
      \nJody\nin Sierra Vista, AZ\n
      Awesome Staff!!
      This place was awesome!! The entire staff was ...
      5
      NaN
    
    
      2
      1
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix Arcadia
      \n4727 E Thomas Rd
      Phoenix
      AZ
      3.833333
      
      
      6
      \n4727 E Thomas Rd
      NaN
      0
      0
      \nKristi\nin Henderson NV\n
      Very dog friendly
      We've stayed at this hotel two times with our ...
      5
      NaN
    
    
      3
      1
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix Arcadia
      \n4727 E Thomas Rd
      Phoenix
      AZ
      3.833333
      
      
      6
      \n4727 E Thomas Rd
      NaN
      0
      0
      \nChristopher\nin Los Angeles\n
      All around great place to stay.
      My room while a bit small was very clean and t...
      4
      NaN
    
    
      4
      1
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix Arcadia
      \n4727 E Thomas Rd
      Phoenix
      AZ
      3.833333
      
      
      6
      \n4727 E Thomas Rd
      NaN
      0
      0
      \nDave\nin Sedona\n
      Management and customer service
      The big gentleman who manages the front desk i...
      3
      NaN
    
    
      5
      1
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix Arcadia
      \n4727 E Thomas Rd
      Phoenix
      AZ
      3.833333
      
      
      6
      \n4727 E Thomas Rd
      NaN
      0
      0
      \nTeri\nin Southern California\n
      Bad Experience.
      I was driving from So Cal to Houston, TX for t...
      1
      NaN
    
    
      0
      2
      http://www.bringfido.com/lodging/city/phoenix_...
      La Quinta Inn Phoenix North
      \n2510 WGreenway Rd
      Phoenix
      AZ
      2.000000
      
      
      1
      \n2510 WGreenway Rd
      NaN
      0
      0
      \nemma\nin park city\n
      no dog area
      the hotel was fine but it is all hard scape ar...
      2
      NaN



In [27]:

    
ratings









    Out[27]:





['5']



In [8]:

    
# for idx, hotel_nm in enumerate(hotel_names):
#     print('*'*50)
#     print(hotel_nm)
#     print(outer_addresses[idx])
#     print(outer_csczip[idx])
#     print(outer_phones[idx])
#     print('Number of reviews: {}'.format(len(outer_texts[idx])))
#     print(outer_texts[idx])

Write data to SQL



In [24]:

    
bigdf.columns









    Out[24]:





Index([u'hotel_id', u'hotel_url', u'hotel_name', u'hotel_address', u'hotel_city', u'hotel_state', u'hotel_rating', u'hotel_latitude', u'hotel_longitude', u'review_count', u'hotel_address', u'business_id', u'review_id', u'user_id', u'username', u'review_title', u'review_text', u'review_rating', u'review_date'], dtype='object')



In [18]:

    
import sqlalchemy
import connect_aws_db as cadb



In [19]:

    
engine = cadb.connect_aws_db(write_unicode=True)



In [20]:

    
conn = engine.connect()

First the reviews



In [28]:

    
cmd = "DROP TABLE bf_reviews"
result = conn.execute(cmd)



In [29]:

    
cmd = """
        CREATE TABLE bf_reviews
        (
        review_id MEDIUMINT AUTO_INCREMENT,
        hotel_id VARCHAR(256),
        business_id VARCHAR(256),
        user_id MEDIUMINT,
        username VARCHAR(128),
        review_title VARCHAR(256),
        review_rating INT,
        review_text VARCHAR(5000),
        PRIMARY KEY (review_id)
        )
        """



In [30]:

    
result = conn.execute(cmd)



In [31]:

    
bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                      'username', 'review_title', 'review_text', 'review_rating']].copy()



In [32]:

    
bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)

Next the hotels



In [33]:

    
bigdf.columns









    Out[33]:





Index([u'hotel_id', u'hotel_url', u'hotel_name', u'hotel_address', u'hotel_city', u'hotel_state', u'hotel_rating', u'hotel_latitude', u'hotel_longitude', u'review_count', u'hotel_address', u'business_id', u'review_id', u'user_id', u'username', u'review_title', u'review_text', u'review_rating', u'review_date'], dtype='object')



In [63]:

    
cmd = "DROP TABLE bf_hotels"
result = conn.execute(cmd)



In [64]:

    
cmd = """
        CREATE TABLE bf_hotels
        (
        hotel_id MEDIUMINT AUTO_INCREMENT,
        hotel_url VARCHAR(512),
        hotel_name VARCHAR(512),
        hotel_address VARCHAR(1024),
        hotel_city VARCHAR(512),
        hotel_state VARCHAR(32),
        hotel_rating INT,
        hotel_latitude FLOAT,
        hotel_longitude FLOAT,
        business_id VARCHAR(256),
        review_count MEDIUMINT,
        PRIMARY KEY (hotel_id)
        )
        """



In [65]:

    
result = conn.execute(cmd)



In [66]:

    
bigdf_hotels = bigdf[['hotel_id', 'hotel_url', 'hotel_name', 'hotel_address',
                      'hotel_city', 'hotel_state', 'hotel_rating', 'hotel_latitude',
                      'hotel_longitude', 'review_count']].copy()



In [67]:

    
bigdf_hotels['hotel_id'] = 0
bigdf_hotels['hotel_latitude'] = 0.
bigdf_hotels['hotel_longitude'] = 0.



In [68]:

    
bigdf_hotels.to_sql('bf_hotels', engine, if_exists='append', index=False)



In [ ]:

	hotel_id	hotel_name	hotel_address	hotel_city	hotel_state	hotel_rating	hotel_latitude	hotel_longitude	review_count	hotel_address	business_id	review_id	user_id	review_text	review_rating	review_date
0	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	We've stayed at this La Quinta several times w...	NaN	NaN
1	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	This place was awesome!! The entire staff was ...	NaN	NaN
2	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	We've stayed at this hotel two times with our ...	NaN	NaN
3	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	My room while a bit small was very clean and t...	NaN	NaN
4	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	The big gentleman who manages the front desk i...	NaN	NaN
5	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	I was driving from So Cal to Houston, TX for t...	NaN	NaN

	hotel_id	hotel_url	hotel_name	hotel_address	hotel_city	hotel_state	hotel_rating	review_count	hotel_address	business_id	username	review_title	review_text	review_rating	review_date
0	0	http://www.bringfido.com/lodging/city/phoenix_...	Pointe Hilton Squaw Peak Resort	\n7677 North 16th Street	Phoenix	AZ	5.000000	1	\n7677 North 16th Street	NaN	\nMarti Ackermann\nin Tucson, AZ\n	Pointe Hilton Squaw Peak	Really nice property. Great walking areas for ...	5	NaN
0	1	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix Arcadia	\n4727 E Thomas Rd	Phoenix	AZ	3.833333	6	\n4727 E Thomas Rd	NaN	\nJen\nin Vancouver\n	Great place to stay!	We've stayed at this La Quinta several times w...	5	NaN
1	1	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix Arcadia	\n4727 E Thomas Rd	Phoenix	AZ	3.833333	6	\n4727 E Thomas Rd	NaN	\nJody\nin Sierra Vista, AZ\n	Awesome Staff!!	This place was awesome!! The entire staff was ...	5	NaN
2	1	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix Arcadia	\n4727 E Thomas Rd	Phoenix	AZ	3.833333	6	\n4727 E Thomas Rd	NaN	\nKristi\nin Henderson NV\n	Very dog friendly	We've stayed at this hotel two times with our ...	5	NaN
3	1	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix Arcadia	\n4727 E Thomas Rd	Phoenix	AZ	3.833333	6	\n4727 E Thomas Rd	NaN	\nChristopher\nin Los Angeles\n	All around great place to stay.	My room while a bit small was very clean and t...	4	NaN
4	1	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix Arcadia	\n4727 E Thomas Rd	Phoenix	AZ	3.833333	6	\n4727 E Thomas Rd	NaN	\nDave\nin Sedona\n	Management and customer service	The big gentleman who manages the front desk i...	3	NaN
5	1	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix Arcadia	\n4727 E Thomas Rd	Phoenix	AZ	3.833333	6	\n4727 E Thomas Rd	NaN	\nTeri\nin Southern California\n	Bad Experience.	I was driving from So Cal to Houston, TX for t...	1	NaN
0	2	http://www.bringfido.com/lodging/city/phoenix_...	La Quinta Inn Phoenix North	\n2510 WGreenway Rd	Phoenix	AZ	2.000000	1	\n2510 WGreenway Rd	NaN	\nemma\nin park city\n	no dog area	the hotel was fine but it is all hard scape ar...	2	NaN