drive_bf_qt_scraping

A notebook to describing the code used to drive the scraping.


In [1]:
import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *  
from lxml import html
import pandas as pd
import numpy as np
import time

In [2]:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit 

class MainForm(QtGui.QMainWindow):
    def __init__(self, parent=None):
        super(MainForm, self).__init__(parent)

        self.pages = ['http://www.google.com', 'http://www.citrix.com', 'http://yahoo.com', 'http://reddit.com']
        self.index = 0

        self.view = QtWebKit.QWebView()  
        self.view.connect(self.view, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
        self.setCentralWidget(self.view)

        self.view.load(QtCore.QUrl(self.pages[self.index]))

    def loadFinished(self, ok):
        self.index += 1
        if self.index < len(self.pages) :
            self.view.load(QtCore.QUrl(self.pages[self.index]))
        else:
            print 'done'

def main():
    app = QtGui.QApplication(sys.argv)
    form = MainForm()
    form.show()
    app.exec_()

In [3]:
class Render(QWebPage):  
    def __init__(self, url):  
        self.app = QApplication(sys.argv)  
        QWebPage.__init__(self)  
        self.loadFinished.connect(self._loadFinished)  
        self.mainFrame().load(QUrl(url))  
        self.app.exec_()  

    def _loadFinished(self, result):  
        self.frame = self.mainFrame()  
        self.app.quit() 
    
    def update_url(self, url):
        self.mainFrame().load(QUrl(url))  
        self.app.exec_()

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0

In [ ]:


In [4]:
url = 'http://www.bringfido.com/lodging/city/phoenix_az_us/'

In [16]:
ff = webdriver.Firefox()
ff.get(url)
ff.find_element_by_xpath("//div[@class='photo']")

try:
    element = WebDriverWait(ff, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "photo_inner")))
finally:
    archive_links = ff.find_element_by_xpath('//*[@id="results_list"]/div')
    ff.quit()

In [5]:
#This does the magic.Loads everything
r = Render(url)  
#result is a QString.
result = r.frame.toHtml()

In [5]:
r.update_url(url)

In [6]:
#QString should be converted to string before processed by lxml
formatted_result = str(result.toAscii())

In [7]:
#formatted_result

In [7]:
#Next build lxml tree from formatted_result
tree = html.fromstring(formatted_result)

In [8]:
#Now using correct Xpath we are fetching URL of archives
archive_links = tree.xpath('//*[@id="results_list"]/div')

In [9]:
hotel_names = []
text_summaries = []
links = []

for lnk in archive_links:
    hotel_names.append(lnk.xpath('div[2]/h1/a/text()')[0])
    text_summaries.append(lnk.text_content())
    links.append(lnk.xpath('div/h1/a/@href')[0])
    # print(lnk.text_content())
    # print('*'*25)

In [10]:
tree.xpath('//div[contains(@id, "results_list")]')


Out[10]:
[<Element div at 0x10ca10f18>, <Element div at 0x10ca10f70>]

In [14]:
res = browser.find_element_by_xpath('//div[contains(@id, "results_list")]/div')

In [14]:
hotel_names


Out[14]:
['Pointe Hilton Squaw Peak Resort',
 'La Quinta Inn Phoenix Arcadia',
 'La Quinta Inn Phoenix North',
 'La Quinta Inn Phoenix Thomas Road',
 'Crossland Economy Studios Phoenix Metro',
 'House, 1 Bedroom, Sleeps 4, 1 Bathroom',
 'Arizona Biltmore Waldorf Astoria',
 '2-Blocks from South Mountain Park',
 'Sheraton Crescent Hotel',
 'Best Western InnSuites Phoenix Hotel & Suites',
 'Sheraton Phoenix Downtown Hotel',
 'Embassy Suites Phoenix Airport West',
 'Condo, 1 Bedroom, Sleeps 2, 1 Bathroom',
 'Pointe Hilton Tapatio Cliffs Resort',
 'Royal Palms Resort & Spa']

In [13]:
hotel_names = ['Pointe Hilton Squaw Peak Resort',
 'La Quinta Inn Phoenix Arcadia',
 'La Quinta Inn Phoenix North',
 'La Quinta Inn Phoenix Thomas Road',
 'Crossland Economy Studios Phoenix Metro',
 'House, 1 Bedroom, Sleeps 4, 1 Bathroom',
 'Arizona Biltmore Waldorf Astoria',
 '2-Blocks from South Mountain Park',
 'Sheraton Crescent Hotel',
 'Best Western InnSuites Phoenix Hotel & Suites',
 'Sheraton Phoenix Downtown Hotel',
 'Embassy Suites Phoenix Airport West',
 'Condo, 1 Bedroom, Sleeps 2, 1 Bathroom',
 'Pointe Hilton Tapatio Cliffs Resort',
 'Royal Palms Resort & Spa']

In [11]:
links


Out[11]:
['/lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
 '/lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
 '/lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62',
 '/lodging/64836/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
 '/lodging/64824/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=44.99',
 '/lodging/132841/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=53',
 '/lodging/64763/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=329',
 '/lodging/130968/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=66',
 '/lodging/64862/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=159',
 '/lodging/64769/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.1',
 '/lodging/64863/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=319',
 '/lodging/64794/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.2',
 '/lodging/132840/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=30',
 '/lodging/64847/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
 '/lodging/64861/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=134.78']

In [15]:
links = ['/lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
         '/lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
         '/lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62',
         '/lodging/64836/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
         '/lodging/64824/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=44.99',
         '/lodging/132841/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=53',
         '/lodging/64763/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=329',
         '/lodging/130968/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=66',
         '/lodging/64862/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=159',
         '/lodging/64769/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.1',
         '/lodging/64863/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=319',
         '/lodging/64794/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.2',
         '/lodging/132840/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=30',
         '/lodging/64847/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
         '/lodging/64861/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=134.78']

In [11]:
text_summaries


Out[11]:
['Pointe Hilton Squaw Peak ResortPhoenix, AZ, USPointe Hilton Squaw Peak Resort welcomes two pets, under 75 lbs, for an additional $75 per stay.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$199 + pet feeCHECK RATES',
 'La Quinta Inn Phoenix ArcadiaPhoenix, AZ, USLa Quinta Inn Phoenix Arcadia is dog friendly. Two pets of any size are welcome for no additional fee. All rooms are pet friendly.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$64 (no pet fee)VISIT WEBSITE',
 'La Quinta Inn Phoenix NorthPhoenix, AZ, USLa Quinta Inn Phoenix North is dog friendly. Two pets of any size are welcome for no additional fee. All rooms are pet friendly.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$62 (no pet fee)VISIT WEBSITE',
 'La Quinta Inn Phoenix Thomas RoadPhoenix, AZ, USLa Quinta Inn Phoenix Thomas Road is dog friendly. Two pets of any size are welcome for no additional fee. All rooms are pet friendly.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$64 (no pet fee)VISIT WEBSITE',
 'Crossland Economy Studios Phoenix MetroPhoenix, AZ, USCrossland Economy Studios Phoenix Metro is pet friendly! Two cats or dogs of any size are welcome for an additional fee of $25 per pet per night, not to exceed $150 per room. Well-behaved pets are ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$45 + pet feeCHECK RATES',
 'House, 1 Bedroom, Sleeps 4, 1 BathroomPhoenix, AZ, USPets are allowed with prior approval from the homeowner only. Please inquire for information on pet fee and other restrictions.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$53 + pet feeVISIT WEBSITE',
 'Arizona Biltmore Waldorf AstoriaPhoenix, AZ, USArizona Biltmore Resort & Spa is pet friendly! Up to two pets under 100 lbs are allowed per room for a $100 flat fee per stay. Please note that there is a limited number of pet ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$329 + pet feeCHECK RATES',
 '2-Blocks from South Mountain ParkPhoenix, AZ, USPets are allowed with prior approval from the homeowner only. Please inquire for information on pet fee and other restrictions. Large, fenced-in lot available for your pet to run and enjoy.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$66 + pet feeVISIT WEBSITE',
 'Sheraton Crescent HotelPhoenix, AZ, USThe Sheraton Crescent Hotel is pet friendly! Up to two pets (50 lbs or less) are welcome for no additional fee. Guests traveling with pets are required to sign a pet waiver upon check-in.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$159 (no pet fee)CHECK RATES',
 'Best Western InnSuites Phoenix Hotel & SuitesPhoenix, AZ, USBest Western Innsuites Hotel Phoenix is pet friendly! Two dogs (up to 80 lbs) are welcome in select rooms WITH TWO QUEEN BEDS for an additional fee of $20 per stay. Pet friendly rooms are limited. ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$71 + pet feeVISIT WEBSITE',
 'Sheraton Phoenix Downtown HotelPhoenix, AZ, USThe Sheraton is pet friendly. They welcome up to two DOGS only, (max weight 80 lbs) They do not charge a pet fee, but dogs may not be left alone in the room.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$319 + pet feeCHECK RATES',
 'Embassy Suites Phoenix Airport WestPhoenix, AZ, USThe Embassy Suites Phoenix - Airport West welcomes multiple pets, under 50 lbs, for an additional $25 per room, per night.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$71 + pet feeCHECK RATES',
 'Condo, 1 Bedroom, Sleeps 2, 1 BathroomPhoenix, AZ, USPets are allowed with prior approval from the homeowner only. Please inquire for information on pet fee and other restrictions.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$30 + pet feeVISIT WEBSITE',
 'Pointe Hilton Tapatio Cliffs ResortPhoenix, AZ, USPointe Hilton Tapatio Cliffs Resort welcomes a maximum of two pets, 75 lbs or less, per guest room for an additional $75 per stay. Additional pets may be accepted with approval prior to booking.Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$199 + pet feeCHECK RATES',
 'Royal Palms Resort & SpaPhoenix, AZ, USThe Royal Palms Resort and Spa is pet friendly! Up to two dogs (with a combined weight of 30 lbs or less) are welcome for an additional fee of $100 per stay along with a $150 ...Hotel Overview | Map | Photos | Guest ReviewsLow Rates from$135 + pet feeCHECK RATES']

In [9]:
url_base = 'http://www.bringfido.com'  
r.update_url(url_base+links[1])

In [10]:
columns = ['hotel_id',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'hotel_address',
           'business_id',
           'review_id',
           'user_id',
           'review_text',
           'review_rating',
           'review_date']

In [11]:
df = pd.DataFrame(columns=columns)

In [12]:
df.head(1)


Out[12]:
hotel_id hotel_name hotel_address hotel_city hotel_state hotel_rating hotel_latitude hotel_longitude review_count hotel_address business_id review_id user_id review_text review_rating review_date

In [14]:
result = r.frame.toHtml()

#QString should be converted to string before processed by lxml
formatted_result = str(result.toAscii())

tree = html.fromstring(formatted_result) 

hotel_description = tree.xpath('//*[@class="body"]/text()')

# scrape the address details section of the page
details = tree.xpath('//*[@class="address"]/text()')

# now get just the address:
address = details[0]

# and just the city, state, country, and zip code:
csczip = details[1]

# and just the phone number
phone = details[2]

# outer_addresses.append(address)
# outer_csczip.append(csczip)
# outer_phones.append(phone)

#Now using correct Xpath we are fetching URL of archives
reviews = tree.xpath('//*[@class="review_container"]')

texts = []
titles = []
authors = []
ratings = []

print(reviews)
print('')
for rev in reviews:
    titles.append(rev.xpath('div/div[1]/text()')[0])
    authors.append(rev.xpath('div/div[2]/text()')[0])
    texts.append(rev.xpath('div/div[3]/text()')[0])
    ratings.append(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
    print(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])


[<Element div at 0x117b471b0>, <Element div at 0x117b47208>, <Element div at 0x117b47260>, <Element div at 0x117b472b8>, <Element div at 0x117b47310>, <Element div at 0x117b47368>]

5
5
5
4
3
1

In [15]:
city, state, zipcode = csczip.strip().split(',')
zipcode = zipcode[3:]

In [16]:
print(city)
print(state)
print(zipcode)


Phoenix
 AZ
 85018

In [17]:
texts


Out[17]:
["We've stayed at this La Quinta several times with our two mini schnauzers. The staff is all very friendly. Rooms were clean but not overly fancy - we felt like we didn't need to worry leaving our dogs there for short periods while we went out to eat, etc. In a great location. Great breakfast too!",
 'This place was awesome!! The entire staff was extremely friendly and accommodating to us and our English Mastiff, Kadence. Dog area was well lit, clean and provided clean up bags. Continental Breakfast had a great selection. There was plenty of space in the room for our kennel and all of our luggage. I would recommend this hotel to dog owners and non- dog owners alike. If we ever pass through this way again this will definitely be the place we stay!! ',
 "We've stayed at this hotel two times with our three mini dachshunds. Nice grass area with picnic tables and doggie bags everywhere. Great place and very dog friendly.",
 "My room while a bit small was very clean and the beds were super comfy. The free breakfast was pretty standard for hotel breakfast with the exception of the biscuits and gravy which were delicious. The staff was extremely helpful and friendly. The location was great too. Within walking distance to a shopping mall that had practically everything in it. A median strip between 47/48th st that's built like a park/walkway was the perfect spot to take my dogs on a walk. There's also another park a few blocks south that's nice if you need to change things up a bit.\nThere was multiple wifi access points too so you can pick the one with the best signal strength. In the few other hotels I've been staying at on my trip they will have only one wifi router so if you have a crappy signal you'd have to move to another room or deal with it.\nFor the price I think this is one of the better places to stay.\n",
 'The big gentleman who manages the front desk is very creepy. He is rude and has very young girls working the front desk that seem to be very submissive towards him, eek! The thought of the rest!',
 "I was driving from So Cal to Houston, TX for the holidays and stayed at this La Quinta because I wanted an inexpensive, dog friendly hotel with a reputation for cleanliness and reliability. This place did not meet my standards at all. The front desk person was rude, unattentive and unhelful. Our room was a wreck. We ended up in a handicap room-which was dirty and the shower wand didn't work. I do NOT recommend this place at all. Yuck!"]

In [18]:
df = pd.DataFrame(columns=columns)

In [19]:
df['review_text'] = texts

In [21]:
df['hotel_id'] = 1

In [24]:
df


Out[24]:
hotel_id hotel_name hotel_address hotel_city hotel_state hotel_rating hotel_latitude hotel_longitude review_count hotel_address business_id review_id user_id review_text review_rating review_date
0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN We've stayed at this La Quinta several times w... NaN NaN
1 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN This place was awesome!! The entire staff was ... NaN NaN
2 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN We've stayed at this hotel two times with our ... NaN NaN
3 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN My room while a bit small was very clean and t... NaN NaN
4 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN The big gentleman who manages the front desk i... NaN NaN
5 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN I was driving from So Cal to Houston, TX for t... NaN NaN

Now cycle through each hotel, and scrape the reviews, ratings, etc.


In [16]:
url_base = 'http://www.bringfido.com'  

columns = ['hotel_id',
           'hotel_url',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'hotel_address',
           'business_id',
           'review_id',
           'user_id',
           'username',
           'review_title',
           'review_text',
           'review_rating',
           'review_date']

bigdf = pd.DataFrame(columns=columns)


for hotel_id, link in enumerate(links[:3]):
    print('*'*50)
    print('Now on {}'.format(link))
    print('*'*50)
    r.update_url(url_base+link)  
    result = r.frame.toHtml()
    
    df = pd.DataFrame(columns=columns)

    #QString should be converted to string before processed by lxml
    formatted_result = str(result.toAscii())

    tree = html.fromstring(formatted_result) 

    hotel_description = tree.xpath('//*[@class="body"]/text()')

    # scrape the address details section of the page
    details = tree.xpath('//*[@class="address"]/text()')

    # now get just the address:
    address = details[0]
    
    # and just the city, state, country, and zip code:
    csczip = details[1]
    
    # and just the phone number
    phone = details[2]
    
    # now separate the city, state, and zip:
    city, state, zipcode = csczip.strip().split(',')
    zipcode = zipcode[3:]
    
    #Now using correct Xpath we are fetching URL of archives
    reviews = tree.xpath('//*[@class="review_container"]')

    texts = []
    titles = []
    authors = []
    ratings = []

    print(reviews)
    print('')
    for rev in reviews:
        titles.append(rev.xpath('div/div[1]/text()')[0])
        authors.append(rev.xpath('div/div[2]/text()')[0])
        texts.append(rev.xpath('div/div[3]/text()')[0])
        ratings.append(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
        print(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
    
    df['review_title'] = titles
    df['username'] = authors
    df['review_text'] = texts
    df['review_rating'] = ratings
    df['hotel_id'] = hotel_id
    df['hotel_name'] = hotel_names[hotel_id]
    df['hotel_url'] = url
    df['hotel_address'] = address
    df['hotel_city'] = city
    df['hotel_state'] = state
    df['hotel_rating'] = np.mean([int(rat) for rat in ratings])
    df['hotel_latitude'] = ''
    df['hotel_longitude'] = ''
    df['review_count'] = len(texts)
    df['review_id'] = 0
    df['user_id'] = 0
    
    bigdf = bigdf.append(df)


**************************************************
Now on /lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199
**************************************************
[<Element div at 0x11903ed08>]

5
**************************************************
Now on /lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64
**************************************************
[<Element div at 0x119009a48>, <Element div at 0x119009aa0>, <Element div at 0x119009af8>, <Element div at 0x119009b50>, <Element div at 0x119009ba8>, <Element div at 0x119009c00>]

5
5
5
4
3
1
**************************************************
Now on /lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62
**************************************************
[<Element div at 0x10ca10d60>]

2

In [13]:
df


Out[13]:
hotel_id hotel_url hotel_name hotel_address hotel_city hotel_state hotel_rating hotel_latitude hotel_longitude review_count hotel_address business_id review_id user_id username review_title review_text review_rating review_date
0 2 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix North \n2510 WGreenway Rd Phoenix AZ 2 1 \n2510 WGreenway Rd NaN 0 0 \nemma\nin park city\n no dog area the hotel was fine but it is all hard scape ar... 2 NaN

In [15]:
bigdf = bigdf.append(df)

In [17]:
bigdf


Out[17]:
hotel_id hotel_url hotel_name hotel_address hotel_city hotel_state hotel_rating hotel_latitude hotel_longitude review_count hotel_address business_id review_id user_id username review_title review_text review_rating review_date
0 0 http://www.bringfido.com/lodging/city/phoenix_... Pointe Hilton Squaw Peak Resort \n7677 North 16th Street Phoenix AZ 5.000000 1 \n7677 North 16th Street NaN 0 0 \nMarti Ackermann\nin Tucson, AZ\n Pointe Hilton Squaw Peak Really nice property. Great walking areas for ... 5 NaN
0 1 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix Arcadia \n4727 E Thomas Rd Phoenix AZ 3.833333 6 \n4727 E Thomas Rd NaN 0 0 \nJen\nin Vancouver\n Great place to stay! We've stayed at this La Quinta several times w... 5 NaN
1 1 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix Arcadia \n4727 E Thomas Rd Phoenix AZ 3.833333 6 \n4727 E Thomas Rd NaN 0 0 \nJody\nin Sierra Vista, AZ\n Awesome Staff!! This place was awesome!! The entire staff was ... 5 NaN
2 1 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix Arcadia \n4727 E Thomas Rd Phoenix AZ 3.833333 6 \n4727 E Thomas Rd NaN 0 0 \nKristi\nin Henderson NV\n Very dog friendly We've stayed at this hotel two times with our ... 5 NaN
3 1 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix Arcadia \n4727 E Thomas Rd Phoenix AZ 3.833333 6 \n4727 E Thomas Rd NaN 0 0 \nChristopher\nin Los Angeles\n All around great place to stay. My room while a bit small was very clean and t... 4 NaN
4 1 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix Arcadia \n4727 E Thomas Rd Phoenix AZ 3.833333 6 \n4727 E Thomas Rd NaN 0 0 \nDave\nin Sedona\n Management and customer service The big gentleman who manages the front desk i... 3 NaN
5 1 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix Arcadia \n4727 E Thomas Rd Phoenix AZ 3.833333 6 \n4727 E Thomas Rd NaN 0 0 \nTeri\nin Southern California\n Bad Experience. I was driving from So Cal to Houston, TX for t... 1 NaN
0 2 http://www.bringfido.com/lodging/city/phoenix_... La Quinta Inn Phoenix North \n2510 WGreenway Rd Phoenix AZ 2.000000 1 \n2510 WGreenway Rd NaN 0 0 \nemma\nin park city\n no dog area the hotel was fine but it is all hard scape ar... 2 NaN

In [27]:
ratings


Out[27]:
['5']

In [8]:
# for idx, hotel_nm in enumerate(hotel_names):
#     print('*'*50)
#     print(hotel_nm)
#     print(outer_addresses[idx])
#     print(outer_csczip[idx])
#     print(outer_phones[idx])
#     print('Number of reviews: {}'.format(len(outer_texts[idx])))
#     print(outer_texts[idx])

Write data to SQL


In [24]:
bigdf.columns


Out[24]:
Index([u'hotel_id', u'hotel_url', u'hotel_name', u'hotel_address', u'hotel_city', u'hotel_state', u'hotel_rating', u'hotel_latitude', u'hotel_longitude', u'review_count', u'hotel_address', u'business_id', u'review_id', u'user_id', u'username', u'review_title', u'review_text', u'review_rating', u'review_date'], dtype='object')

In [18]:
import sqlalchemy
import connect_aws_db as cadb

In [19]:
engine = cadb.connect_aws_db(write_unicode=True)

In [20]:
conn = engine.connect()

First the reviews


In [28]:
cmd = "DROP TABLE bf_reviews"
result = conn.execute(cmd)

In [29]:
cmd = """
        CREATE TABLE bf_reviews
        (
        review_id MEDIUMINT AUTO_INCREMENT,
        hotel_id VARCHAR(256),
        business_id VARCHAR(256),
        user_id MEDIUMINT,
        username VARCHAR(128),
        review_title VARCHAR(256),
        review_rating INT,
        review_text VARCHAR(5000),
        PRIMARY KEY (review_id)
        )
        """

In [30]:
result = conn.execute(cmd)

In [31]:
bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                      'username', 'review_title', 'review_text', 'review_rating']].copy()

In [32]:
bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)

Next the hotels


In [33]:
bigdf.columns


Out[33]:
Index([u'hotel_id', u'hotel_url', u'hotel_name', u'hotel_address', u'hotel_city', u'hotel_state', u'hotel_rating', u'hotel_latitude', u'hotel_longitude', u'review_count', u'hotel_address', u'business_id', u'review_id', u'user_id', u'username', u'review_title', u'review_text', u'review_rating', u'review_date'], dtype='object')

In [63]:
cmd = "DROP TABLE bf_hotels"
result = conn.execute(cmd)

In [64]:
cmd = """
        CREATE TABLE bf_hotels
        (
        hotel_id MEDIUMINT AUTO_INCREMENT,
        hotel_url VARCHAR(512),
        hotel_name VARCHAR(512),
        hotel_address VARCHAR(1024),
        hotel_city VARCHAR(512),
        hotel_state VARCHAR(32),
        hotel_rating INT,
        hotel_latitude FLOAT,
        hotel_longitude FLOAT,
        business_id VARCHAR(256),
        review_count MEDIUMINT,
        PRIMARY KEY (hotel_id)
        )
        """

In [65]:
result = conn.execute(cmd)

In [66]:
bigdf_hotels = bigdf[['hotel_id', 'hotel_url', 'hotel_name', 'hotel_address',
                      'hotel_city', 'hotel_state', 'hotel_rating', 'hotel_latitude',
                      'hotel_longitude', 'review_count']].copy()

In [67]:
bigdf_hotels['hotel_id'] = 0
bigdf_hotels['hotel_latitude'] = 0.
bigdf_hotels['hotel_longitude'] = 0.

In [68]:
bigdf_hotels.to_sql('bf_hotels', engine, if_exists='append', index=False)

In [ ]: