In [1]:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
import pandas as pd
import numpy as np
import time
In [2]:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class MainForm(QtGui.QMainWindow):
    """Demo browser window that loads a list of pages one after another.

    Each time a page finishes loading, the loadFinished slot advances to
    the next URL; after the last page it reports completion.
    """

    def __init__(self, parent=None):
        super(MainForm, self).__init__(parent)
        self.pages = ['http://www.google.com', 'http://www.citrix.com', 'http://yahoo.com', 'http://reddit.com']
        self.index = 0
        self.view = QtWebKit.QWebView()
        # Old-style signal connection: fires loadFinished(ok) per page load.
        self.view.connect(self.view, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
        self.setCentralWidget(self.view)
        self.view.load(QtCore.QUrl(self.pages[self.index]))

    def loadFinished(self, ok):
        """Advance to the next page, or report completion after the last one."""
        self.index += 1
        if self.index < len(self.pages):
            self.view.load(QtCore.QUrl(self.pages[self.index]))
        else:
            # Use the call form of print (works on Python 2 and 3 for a
            # single argument); the bare Python 2 print statement used here
            # was inconsistent with every other print() in this notebook.
            print('done')
def main():
    """Create the Qt application, show the sequential-loader window,
    and run the event loop until the window is closed."""
    qt_app = QtGui.QApplication(sys.argv)
    window = MainForm()
    window.show()
    qt_app.exec_()
In [3]:
class Render(QWebPage):
    """Headless WebKit renderer: loads a URL, lets its JavaScript run,
    and blocks until the page finishes loading so the final DOM can be
    scraped via ``self.frame.toHtml()``.

    NOTE(review): constructs its own QApplication, so only one Render
    (and no other Qt application) may exist per process.
    """

    def __init__(self, url):
        # QApplication must exist before any Qt page/widget is created.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished calls app.quit().
        self.app.exec_()

    def _loadFinished(self, result):
        # Keep a handle on the rendered frame, then unblock exec_().
        self.frame = self.mainFrame()
        self.app.quit()

    def update_url(self, url):
        """Load a new URL in the same page, blocking until it finishes."""
        self.mainFrame().load(QUrl(url))
        self.app.exec_()
In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
In [ ]:
In [4]:
# City listing page to scrape: dog-friendly lodging in Phoenix, AZ.
url = 'http://www.bringfido.com/lodging/city/phoenix_az_us/'
In [16]:
# Alternative scrape path using Selenium + Firefox instead of Qt WebKit.
ff = webdriver.Firefox()
ff.get(url)
try:
    # Wait up to 10s for the JS-built results to appear on the page.
    element = WebDriverWait(ff, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "photo_inner")))
    # find_elements (plural): downstream code iterates one element per
    # listing row. The original used find_element, returning a single node,
    # and also made a dead find_element_by_xpath call whose result was
    # discarded; both fixed here. Scraping now happens only after the wait
    # succeeds, rather than in `finally` even on timeout.
    archive_links = ff.find_elements_by_xpath('//*[@id="results_list"]/div')
finally:
    ff.quit()
In [5]:
#This does the magic.Loads everything
r = Render(url)
#result is a QString.
result = r.frame.toHtml()
In [5]:
# Re-load the listing page in the existing renderer (blocks until done).
r.update_url(url)
In [6]:
#QString should be converted to string before processed by lxml
# QString should be converted to a plain str before lxml can parse it.
formatted_result = str(result.toAscii())
In [7]:
#formatted_result
In [7]:
#Next build lxml tree from formatted_result
# Build an lxml element tree from the rendered HTML.
tree = html.fromstring(formatted_result)
In [8]:
#Now using correct Xpath we are fetching URL of archives
# One element per listing row inside the results_list container.
archive_links = tree.xpath('//*[@id="results_list"]/div')
In [9]:
# Pull the hotel name, text blurb, and detail-page href out of each result
# row; XPaths are relative to each results_list <div>.
hotel_names = [lnk.xpath('div[2]/h1/a/text()')[0] for lnk in archive_links]
text_summaries = [lnk.text_content() for lnk in archive_links]
links = [lnk.xpath('div/h1/a/@href')[0] for lnk in archive_links]
In [10]:
# Sanity check: does the rendered tree contain the results container at all?
tree.xpath('//div[contains(@id, "results_list")]')
Out[10]:
In [14]:
# BUGFIX: `browser` was never defined in this notebook (NameError on a
# fresh kernel); the Selenium driver created earlier is named `ff`.
res = ff.find_element_by_xpath('//div[contains(@id, "results_list")]/div')
In [14]:
# Display the scraped hotel names (rich notebook output).
hotel_names
Out[14]:
In [13]:
# Hardcoded snapshot of the hotel names scraped on an earlier run, so the
# rest of the notebook can be exercised without re-rendering the page.
hotel_names = ['Pointe Hilton Squaw Peak Resort',
 'La Quinta Inn Phoenix Arcadia',
 'La Quinta Inn Phoenix North',
 'La Quinta Inn Phoenix Thomas Road',
 'Crossland Economy Studios Phoenix Metro',
 'House, 1 Bedroom, Sleeps 4, 1 Bathroom',
 'Arizona Biltmore Waldorf Astoria',
 '2-Blocks from South Mountain Park',
 'Sheraton Crescent Hotel',
 'Best Western InnSuites Phoenix Hotel & Suites',
 'Sheraton Phoenix Downtown Hotel',
 'Embassy Suites Phoenix Airport West',
 'Condo, 1 Bedroom, Sleeps 2, 1 Bathroom',
 'Pointe Hilton Tapatio Cliffs Resort',
 'Royal Palms Resort & Spa']
In [11]:
# Display the scraped detail-page links.
links
Out[11]:
In [15]:
# Hardcoded snapshot of the relative detail-page links from an earlier run
# (paired index-for-index with hotel_names above).
links = ['/lodging/64846/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
 '/lodging/64835/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
 '/lodging/64834/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=62',
 '/lodging/64836/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=64',
 '/lodging/64824/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=44.99',
 '/lodging/132841/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=53',
 '/lodging/64763/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=329',
 '/lodging/130968/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=66',
 '/lodging/64862/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=159',
 '/lodging/64769/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.1',
 '/lodging/64863/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=319',
 '/lodging/64794/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=71.2',
 '/lodging/132840/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=30',
 '/lodging/64847/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=199',
 '/lodging/64861/?cid=14020&ar=&dt=&rm=1&ad=1&ch=0&dg=1&rt=134.78']
In [11]:
# Display the full text blurbs scraped for each listing.
text_summaries
Out[11]:
In [9]:
# Site root used to turn the relative detail links into absolute URLs.
url_base = 'http://www.bringfido.com'
# Render one hotel detail page as a smoke test before the full loop.
r.update_url(url_base+links[1])
In [10]:
# Target schema for one scraped review row. 'hotel_address' is listed once:
# the original draft repeated it, which produces duplicate columns in the
# resulting DataFrame.
columns = ['hotel_id',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'business_id',
           'review_id',
           'user_id',
           'review_text',
           'review_rating',
           'review_date']
In [11]:
# Empty frame with the target schema; rows are filled in per hotel below.
df = pd.DataFrame(columns=columns)
In [12]:
# Peek at the (still empty) schema.
df.head(1)
Out[12]:
In [14]:
# Scrape one hotel detail page already loaded in the renderer `r`.
result = r.frame.toHtml()
#QString should be converted to string before processed by lxml
formatted_result = str(result.toAscii())
tree = html.fromstring(formatted_result)
hotel_description = tree.xpath('//*[@class="body"]/text()')
# scrape the address details section of the page
details = tree.xpath('//*[@class="address"]/text()')
# now get just the address:
address = details[0]
# and just the city, state, country, and zip code:
csczip = details[1]
# and just the phone number
phone = details[2]
# outer_addresses.append(address)
# outer_csczip.append(csczip)
# outer_phones.append(phone)
# Each review lives in its own "review_container" div.
reviews = tree.xpath('//*[@class="review_container"]')
texts = []
titles = []
authors = []
ratings = []
print(reviews)
print('')
for rev in reviews:
    titles.append(rev.xpath('div/div[1]/text()')[0])
    authors.append(rev.xpath('div/div[2]/text()')[0])
    texts.append(rev.xpath('div/div[3]/text()')[0])
    # The rating digit is taken from the star-image filename (e.g.
    # ".../4.png"); assumes single-digit ratings — TODO confirm.
    ratings.append(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
    print(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
In [15]:
# Split the "city, state, country+zip" line; assumes exactly two commas —
# TODO confirm against the live page format (raises ValueError otherwise).
city, state, zipcode = csczip.strip().split(',')
# Drop the first 3 characters — presumably a leading country code like
# " US" before the zip; verify against the scraped address block.
zipcode = zipcode[3:]
In [16]:
# Show the parsed address components, one per line.
for part in (city, state, zipcode):
    print(part)
In [17]:
# Display the scraped review texts.
texts
Out[17]:
In [18]:
# Fresh empty frame before filling in the scraped columns.
df = pd.DataFrame(columns=columns)
In [19]:
# One row per scraped review; the list sets the frame's length.
df['review_text'] = texts
In [21]:
# Broadcast a single hotel id over all review rows.
df['hotel_id'] = 1
In [24]:
# Display the assembled frame.
df
Out[24]:
In [16]:
url_base = 'http://www.bringfido.com'
# Output schema. 'hotel_address' is listed once: the original draft
# repeated it, which yields duplicate DataFrame columns.
columns = ['hotel_id',
           'hotel_url',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'business_id',
           'review_id',
           'user_id',
           'username',
           'review_title',
           'review_text',
           'review_rating',
           'review_date']

# Scrape each hotel's detail page and collect one DataFrame per hotel;
# concatenated once at the end instead of growing bigdf inside the loop
# (repeated .append() is quadratic and deprecated in modern pandas).
hotel_dfs = []
for hotel_id, link in enumerate(links[:3]):
    print('*'*50)
    print('Now on {}'.format(link))
    print('*'*50)
    # BUGFIX: record this hotel's own page URL; the original stored the
    # city listing page (`url`) for every hotel.
    hotel_url = url_base + link
    r.update_url(hotel_url)
    result = r.frame.toHtml()
    # QString should be converted to string before processing with lxml.
    formatted_result = str(result.toAscii())
    tree = html.fromstring(formatted_result)
    hotel_description = tree.xpath('//*[@class="body"]/text()')
    # Address block: street / "city, state, country+zip" / phone lines.
    details = tree.xpath('//*[@class="address"]/text()')
    address = details[0]
    csczip = details[1]
    phone = details[2]
    # Assumes exactly two commas; [3:] drops the leading country code.
    city, state, zipcode = csczip.strip().split(',')
    zipcode = zipcode[3:]
    # One "review_container" div per review.
    reviews = tree.xpath('//*[@class="review_container"]')
    texts = []
    titles = []
    authors = []
    ratings = []
    print('{} review(s) found'.format(len(reviews)))
    for rev in reviews:
        titles.append(rev.xpath('div/div[1]/text()')[0])
        authors.append(rev.xpath('div/div[2]/text()')[0])
        texts.append(rev.xpath('div/div[3]/text()')[0])
        # Rating digit comes from the star-image filename (e.g. ".../4.png").
        ratings.append(rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1])
    df = pd.DataFrame(columns=columns)
    df['review_title'] = titles
    df['username'] = authors
    df['review_text'] = texts
    df['review_rating'] = ratings
    df['hotel_id'] = hotel_id
    df['hotel_name'] = hotel_names[hotel_id]
    df['hotel_url'] = hotel_url
    df['hotel_address'] = address
    df['hotel_city'] = city
    df['hotel_state'] = state
    # Guard: np.mean([]) is nan with a RuntimeWarning when a hotel has
    # no reviews yet.
    df['hotel_rating'] = np.mean([int(rat) for rat in ratings]) if ratings else np.nan
    df['hotel_latitude'] = ''
    df['hotel_longitude'] = ''
    df['review_count'] = len(texts)
    df['review_id'] = 0
    df['user_id'] = 0
    hotel_dfs.append(df)

bigdf = pd.concat([pd.DataFrame(columns=columns)] + hotel_dfs)
In [13]:
# Display the last hotel's per-review frame.
df
Out[13]:
In [15]:
# NOTE(review): `df` was already appended to bigdf inside the scrape loop;
# running this cell duplicates the last hotel's review rows.
bigdf = bigdf.append(df)
In [17]:
# Display the combined frame for all scraped hotels.
bigdf
Out[17]:
In [27]:
# Display the last hotel's raw rating digits.
ratings
Out[27]:
In [8]:
# for idx, hotel_nm in enumerate(hotel_names):
# print('*'*50)
# print(hotel_nm)
# print(outer_addresses[idx])
# print(outer_csczip[idx])
# print(outer_phones[idx])
# print('Number of reviews: {}'.format(len(outer_texts[idx])))
# print(outer_texts[idx])
In [24]:
# Inspect the column index before writing to the database.
bigdf.columns
Out[24]:
In [18]:
import sqlalchemy
import connect_aws_db as cadb
In [19]:
# Build the SQLAlchemy engine via the project helper — presumably points
# at the AWS MySQL instance; see connect_aws_db for connection details.
engine = cadb.connect_aws_db(write_unicode=True)
In [20]:
# Open a connection for the raw DDL statements below.
conn = engine.connect()
In [28]:
# Drop any existing reviews table so it can be recreated from scratch.
# IF EXISTS makes this cell idempotent: a plain DROP TABLE errors out
# when the table is absent (e.g. on a first run).
cmd = "DROP TABLE IF EXISTS bf_reviews"
result = conn.execute(cmd)
In [29]:
# DDL for the reviews table. hotel_id is MEDIUMINT for consistency with
# bf_hotels.hotel_id (the original declared it VARCHAR(256) even though
# integer ids are inserted).
cmd = """
CREATE TABLE bf_reviews
(
review_id MEDIUMINT AUTO_INCREMENT,
hotel_id MEDIUMINT,
business_id VARCHAR(256),
user_id MEDIUMINT,
username VARCHAR(128),
review_title VARCHAR(256),
review_rating INT,
review_text VARCHAR(5000),
PRIMARY KEY (review_id)
)
"""
In [30]:
# Create the bf_reviews table.
result = conn.execute(cmd)
In [31]:
# Project just the review-level columns for the bf_reviews table.
bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                       'username', 'review_title', 'review_text', 'review_rating']].copy()
In [32]:
# Bulk-insert the review rows; append so AUTO_INCREMENT ids are assigned.
bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)
In [33]:
bigdf.columns
Out[33]:
In [63]:
# Drop any existing hotels table before recreating it. IF EXISTS makes
# this cell idempotent (plain DROP TABLE errors when the table is absent).
cmd = "DROP TABLE IF EXISTS bf_hotels"
result = conn.execute(cmd)
In [64]:
# DDL for the hotels table. hotel_rating is FLOAT: the value written is
# np.mean over per-review ratings (a float); the original INT column
# would silently truncate it.
cmd = """
CREATE TABLE bf_hotels
(
hotel_id MEDIUMINT AUTO_INCREMENT,
hotel_url VARCHAR(512),
hotel_name VARCHAR(512),
hotel_address VARCHAR(1024),
hotel_city VARCHAR(512),
hotel_state VARCHAR(32),
hotel_rating FLOAT,
hotel_latitude FLOAT,
hotel_longitude FLOAT,
business_id VARCHAR(256),
review_count MEDIUMINT,
PRIMARY KEY (hotel_id)
)
"""
In [65]:
# Create the bf_hotels table.
result = conn.execute(cmd)
In [66]:
# Project just the hotel-level columns for the bf_hotels table.
bigdf_hotels = bigdf[['hotel_id', 'hotel_url', 'hotel_name', 'hotel_address',
                      'hotel_city', 'hotel_state', 'hotel_rating', 'hotel_latitude',
                      'hotel_longitude', 'review_count']].copy()
In [67]:
# Zero out hotel_id — presumably so MySQL's AUTO_INCREMENT assigns fresh
# primary keys on insert; TODO confirm sql_mode allows 0 to trigger it.
bigdf_hotels['hotel_id'] = 0
# Coordinates were scraped as empty strings; use 0.0 placeholders so the
# FLOAT columns accept them. NOTE(review): real geocoding still TODO.
bigdf_hotels['hotel_latitude'] = 0.
bigdf_hotels['hotel_longitude'] = 0.
In [68]:
# Bulk-insert the hotel rows.
bigdf_hotels.to_sql('bf_hotels', engine, if_exists='append', index=False)
In [ ]: