This notebook describes how hotel data can be scraped using PyQt.
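The items we want to extract for each hotel are: its name and link (from the listing page), and its description, address, and phone number (from the hotel's own page).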
Once the link for each hotel has been determined, we then extract the following items for each review: title, author, review text, and rating.
In [1]:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
In [2]:
class Render(QWebPage):
    """Load a URL in a QtWebKit page and block until loading has finished.

    Constructing the object starts the load and runs the Qt event loop until
    the page emits ``loadFinished``. Afterwards ``self.frame`` holds the
    page's main frame (e.g. ``self.frame.toHtml()`` returns the markup) and
    ``self.load_ok`` holds the success flag reported by ``loadFinished``.

    Parameters
    ----------
    url : str
        Address of the page to load.
    """

    def __init__(self, url):
        # Qt permits only one QApplication per process. Reuse an existing one
        # so that Render can be instantiated more than once in the same kernel.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the frame and stop the event loop.

        ``result`` is the flag passed by the signal; it is stored on
        ``self.load_ok`` so callers can check whether the load succeeded.
        """
        self.load_ok = bool(result)
        self.frame = self.mainFrame()
        self.app.quit()

    def update_url(self, url):
        """Load a different ``url`` in the same page and block until it finishes."""
        self.mainFrame().load(QUrl(url))
        self.app.exec_()
In [68]:
# Listing page to scrape (the path suggests lodging results for New Haven, CT).
url = 'http://www.bringfido.com/lodging/city/new_haven_ct_us'
# Constructing Render loads the page and returns only once loading has finished.
r = Render(url)
# toHtml() returns the page markup; the result is a QString.
result = r.frame.toHtml()
In [65]:
# result
In [ ]:
# The QString must be converted to a plain Python string before lxml can parse it.
# NOTE(review): toAscii() presumably degrades non-ASCII characters - confirm this
# is acceptable for the scraped text.
formatted_result = str(result.toAscii())
In [16]:
# Parse the HTML string into an lxml element tree so it can be queried with XPath.
tree = html.fromstring(formatted_result)
In [24]:
# text_content is a method: call it to display the page's text instead of the
# bound-method object.
tree.text_content()
Out[24]:
In [19]:
# Select the child <div> elements of the node with id "results_list"
# (presumably one per lodging listing - verify against the page markup).
archive_links = tree.xpath('//*[@id="results_list"]/div')
# Function-call form of print, matching the other cells in this notebook.
print(archive_links)
In [2]:
# Load and parse a second page: the pycoders.com archive.
url = 'http://pycoders.com/archive/'
# Constructing Render loads the page and returns only once loading has finished.
r = Render(url)
result = r.frame.toHtml()
# The QString must be converted to a plain Python string before lxml can parse it.
formatted_result = str(result.toAscii())
# Note: this rebinds `tree` to the parsed archive page.
tree = html.fromstring(formatted_result)
In [5]:
# Select the href of every <a> directly inside an element with class "campaign";
# these are the archive URLs.
archive_links = tree.xpath('//*[@class="campaign"]/a/@href')
# Uncomment to list the extracted URLs:
# for lnk in archive_links:
#     print(lnk)
In [3]:
# Load and parse the bringfido.com listing page (same URL as used earlier in
# this notebook).
url = 'http://www.bringfido.com/lodging/city/new_haven_ct_us'
# Constructing Render loads the page and returns only once loading has finished.
r = Render(url)
result = r.frame.toHtml()
# The QString must be converted to a plain Python string before lxml can parse it.
formatted_result = str(result.toAscii())
# Note: this rebinds `tree` to the parsed listing page.
tree = html.fromstring(formatted_result)
In [4]:
# Select the child <div> elements of the node with id "results_list" and print
# each one's heading text followed by its full text content.
archive_links = tree.xpath('//*[@id="results_list"]/div')
print(archive_links)
print('')
for lnk in archive_links:
    # A child div is not guaranteed to contain the heading link; print an empty
    # line for it instead of failing with IndexError on [0].
    names = lnk.xpath('div[2]/h1/a/text()')
    print(names[0] if names else '')
    print(lnk.text_content())
    print('*'*25)
In [5]:
# Collect the href of each listing's heading link into `links`.
links = []
for lnk in archive_links:
    # Evaluate the XPath once per element and reuse the result.
    hrefs = lnk.xpath('div/h1/a/@href')
    if not hrefs:
        # No heading link in this element; nothing to collect.
        continue
    print(hrefs[0])
    links.append(hrefs[0])
    print('*'*25)
In [6]:
# Scratch check of an XPath, run on whatever element the preceding loop left in `lnk`.
# NOTE(review): a path starting with '//' is evaluated from the document root, so
# this likely returns the first matching href on the whole page rather than one
# relative to `lnk` - confirm that is intended.
lnk.xpath('//*/div/h1/a/@href')[0]
Out[6]:
In [7]:
# Display the hrefs collected from the listing page.
links
Out[7]:
In [8]:
# Site root that is prefixed to the hrefs stored in `links` to form a full URL.
url_base = 'http://www.bringfido.com'
# Reuse the existing Render instance to load the page for the first collected link.
r.update_url(url_base+links[0])
result = r.frame.toHtml()
# The QString must be converted to a plain Python string before lxml can parse it.
formatted_result = str(result.toAscii())
# Note: this rebinds `tree` to the parsed detail page.
tree = html.fromstring(formatted_result)
In [16]:
# Hotel-level details taken from the page currently parsed into `tree`.
hotel_description = tree.xpath('//*[@class="body"]/text()')
# Text nodes of the element(s) with class "address"; the first three are used.
details = tree.xpath('//*[@class="address"]/text()')
address = details[0]
csczip = details[1]  # presumably the "city, state zip" line - confirm
phone = details[2]

# One element per review container on the page.
reviews = tree.xpath('//*[@class="review_container"]')
texts = []
titles = []
authors = []
ratings = []
print(reviews)
print('')
for rev in reviews:
    titles.append(rev.xpath('div/div[1]/text()')[0])
    authors.append(rev.xpath('div/div[2]/text()')[0])
    texts.append(rev.xpath('div/div[3]/text()')[0])
    # The rating is the first character of the rating image's file name.
    # Compute it once and reuse it for both the list and the printout.
    rating = rev.xpath('div[2]/img/@src')[0].split('/')[-1][0:1]
    ratings.append(rating)
    print(rating)
In [17]:
# Display the collected review titles.
titles
Out[17]:
In [18]:
# Display the collected review authors.
authors
Out[18]:
In [19]:
# Display the collected review texts.
texts
Out[19]:
In [64]:
# Display the collected review ratings (one-character strings).
ratings
Out[64]:
In [ ]: