In [52]:
import pandas as pd
import numpy as np
from splinter.browser import Browser
import re
import time
import connect_aws_db as cadb
In [163]:
city = 'new haven'
state = 'ct'
category = 'hotels'
In [164]:
br = Browser()
In [165]:
url = "http://www.tripadvisor.com"
In [167]:
url = "http://www.tripadvisor.com/Hotels-g33851-New_Haven_Connecticut-Hotels.html"
In [168]:
br.visit(url)
In [170]:
loclist = br.find_by_xpath('//*[contains(@id, "BREADCRUMBS")]')
loclist
Out[170]:
In [176]:
locstring = loclist.text.split(u'\u203a')
locstring
Out[176]:
In [185]:
city = locstring[2].lower()
city
Out[185]:
In [184]:
state = re.findall(r'\w+ \(([A-Z][A-Z])\)', locstring[1])[0].lower()
state
Out[184]:
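The two cells above can be folded into a small helper so any city page can be parsed the same way. A minimal sketch, assuming the breadcrumb always reads country › state (XX) › city (the function name is mine):
import re

def parse_breadcrumbs(breadcrumb_text):
    """Split a TripAdvisor breadcrumb string into (city, state).

    Expects something like 'United States › Connecticut (CT) › New Haven › ...',
    with the pieces separated by u'\u203a'. Returns (None, None) when the
    pattern is not found.
    """
    parts = breadcrumb_text.split(u'\u203a')
    if len(parts) < 3:
        return None, None
    city = parts[2].strip().lower()
    state = re.findall(r'\w+ \(([A-Z][A-Z])\)', parts[1])
    return city, (state[0].lower() if state else None)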
In [ ]:
In [156]:
search_box = br.find_by_xpath('//*[contains(@id, "searchbox")]')
search_box.fill(city+', '+state)
In [162]:
np.random.uniform(0, 1)
Out[162]:
In [135]:
search_field = br.find_by_xpath('//*[contains(@id, "GEO_SCOPED_SEARCH_INPUT")]')
search_field
Out[135]:
In [138]:
search_field.fill(city+', '+state+'\r')
time.sleep(1)
search_field.fill('\n')
In [139]:
#br.find_by_xpath('//*[contains(@class, "poi_overview_item")]').click()
In [140]:
lookin_field = br.find_by_xpath('//*[contains(@id, "mainSearch")]')
#lookin_field.click()
In [144]:
lookin_field.fill(category+'\r')
time.sleep(1)
lookin_field.fill(category+'\r')
time.sleep(1)
lookin_field.fill('\r')
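Filling a field and sending a carriage return is how the search is actually submitted here; a small helper with a randomized pause keeps that interaction in one place. A sketch (the helper name and the np.random.uniform jitter are my own additions):
import time
import numpy as np

def fill_and_submit(field, value, base_pause=1.0):
    """Type `value` into a splinter element and press Enter to submit.

    `field` is an element returned by br.find_by_xpath(...); the trailing
    carriage return is what triggers the search on the page.
    """
    field.fill(value + '\r')
    # small randomized pause so successive requests are not perfectly regular
    time.sleep(base_pause + np.random.uniform(0, 1))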
In [ ]:
In [11]:
br = Browser()
In [12]:
url = "http://www.tripadvisor.com/Hotels-g31310-Phoenix_Arizona-Hotels.html"
In [13]:
br.visit(url)
In [14]:
date_bar = br.find_by_xpath('//*[contains(@class, "meta_date_wrapper")]')
cin_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_in")]/span')[0]
In [15]:
cin_btn
Out[15]:
In [16]:
cin_btn.click()
In [17]:
rightcal = br.find_by_xpath('//div[contains(@class, "month")]')[1]
rightcal
Out[17]:
In [18]:
fri_btn = rightcal.find_by_xpath('table/tbody/tr[3]/td[6]/div')
fri_btn
Out[18]:
In [19]:
fri_btn.click()
In [20]:
cout_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_out")]/span')[0]
cout_btn
Out[20]:
In [21]:
cout_btn.click()
In [22]:
leftcal = br.find_by_xpath('//div[contains(@class, "month")]')[0]
leftcal
Out[22]:
In [23]:
sat_btn = leftcal.find_by_xpath('table/tbody/tr[3]/td[7]/div')
sat_btn
Out[23]:
In [24]:
sat_btn.click()
In [16]:
time.sleep(5)
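The check-in / check-out clicks above can be wrapped in one helper so the date widget only has to be driven in one place. A sketch of that wrapper, reusing the exact XPaths from the cells above (the row/column indices are tied to TripAdvisor's current calendar layout):
def select_dates(br):
    """Pick a one-night Friday-to-Saturday stay with TripAdvisor's date widget."""
    date_bar = br.find_by_xpath('//*[contains(@class, "meta_date_wrapper")]')
    # open the check-in calendar and click a Friday on the right-hand (next month) calendar
    date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_in")]/span')[0].click()
    rightcal = br.find_by_xpath('//div[contains(@class, "month")]')[1]
    rightcal.find_by_xpath('table/tbody/tr[3]/td[6]/div').click()
    # open the check-out calendar and click the following Saturday
    date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_out")]/span')[0].click()
    leftcal = br.find_by_xpath('//div[contains(@class, "month")]')[0]
    leftcal.find_by_xpath('table/tbody/tr[3]/td[7]/div').click()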
In [27]:
#hotels_div = br.find_by_xpath('//*[@id="ACCOM_OVERVIEW"]/div/div/div/div')
#property_div = br.find_by_xpath('//*[contains(@class, "property_details")]')
In [28]:
#property_div
In [29]:
#len(property_div)
In [30]:
listing_div = br.find_by_xpath('//div[contains(@class, "hotels_lf_condensed")]')
In [31]:
listing_div
Out[31]:
In [32]:
len(listing_div)
Out[32]:
In [33]:
#prop = property_div[0]
In [34]:
listing = listing_div[1]
prop = listing.find_by_xpath('//*[contains(@class, "property_details")]')
In [35]:
re.findall('hotel_(\d+)', listing['id'])
Out[35]:
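Since the listing's id attribute looks like 'hotel_73905', the extraction is worth keeping in a tiny helper (the function name is mine):
import re

def parse_business_id(listing_id):
    """Return the numeric TripAdvisor id from an element id like 'hotel_73905', or None."""
    match = re.search(r'hotel_(\d+)', listing_id)
    return match.group(1) if match else None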
In [36]:
prop.find_by_xpath('div/div[@class="listing_title"]/a').text
Out[36]:
In [37]:
# extract the URL for the hotel thumbnail image
prop.find_by_xpath('div[@class="photo_booking"]/div/div/a/img')['src']
Out[37]:
In [38]:
# get the price
price_text = prop.find_by_xpath('div[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]').text
price_text
Out[38]:
In [39]:
re.findall('(\d+)', price_text)[0]
Out[39]:
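Indexing [0] on the findall result raises an IndexError when a listing shows no price (sold out, "Show prices", and so on), so a guarded version is safer. A sketch, assuming prices are plain dollar figures like '$129' or '$1,299':
import re

def parse_price(price_text):
    """Return the leading integer from a price string like '$129', or None if absent."""
    digits = re.findall(r'(\d+)', price_text.replace(',', ''))
    return int(digits[0]) if digits else None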
In [20]:
br.find_by_xpath('//div[@id="ACCOM_OVERVIEW"]/div[contains(@class, "hotels_lf_condensed")]')
Out[20]:
In [ ]:
In [24]:
hotel_names = []
links = []
img_url = []
hotel_price = []
business_id = []
listing_div = br.find_by_xpath('//div[@id="ACCOM_OVERVIEW"]/div[contains(@class, "hotels_lf_condensed")]')
listing_div = listing_div[:3]
for listing in listing_div:
    biz_id = re.findall(r'hotel_(\d+)', listing['id'])
    print('business_id: {}'.format(biz_id))
    business_id.append(biz_id)
    prop = listing.find_by_xpath('div/div/div/div[contains(@class, "property_details")]')
    title = prop.find_by_xpath('div/div[@class="listing_title"]')
    print(title.text)
    hotel_names.append(title.text)
    print(title.find_by_xpath('a')['href'])
    links.append(title.find_by_xpath('a')['href'])
    hotel_img = prop.find_by_xpath('div[@class="photo_booking"]/div/div/a/img')['src']
    img_url.append(hotel_img)
    print('Hotel img URL: {}'.format(hotel_img))
    price_text = prop.find_by_xpath('div[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]').text
    price = re.findall(r'(\d+)', price_text)[0]
    hotel_price.append(price)
    print('Price: ${}'.format(price))
    print('*'*50)
In [155]:
links
Out[155]:
In [156]:
len(links)
Out[156]:
In [164]:
nxt_btn = br.find_by_xpath('//div[contains(@class, "deckTools")]/div[contains(@class, "unified")]/a[contains(@class, "next")]')
In [165]:
nxt_btn.click()
In [167]:
len(nxt_btn)
Out[167]:
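The next-button check and click can be folded into one call that also reports whether there was another page; a sketch using the same pager XPath as above:
def click_next_page(br):
    """Click the results pager's 'next' link if present; return True if clicked."""
    nxt_btn = br.find_by_xpath('//div[contains(@class, "deckTools")]'
                               '/div[contains(@class, "unified")]'
                               '/a[contains(@class, "next")]')
    if len(nxt_btn) > 0:
        nxt_btn.click()
        return True
    return False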
In [41]:
br.is_element_present_by_xpath('//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]', wait_time=1)
Out[41]:
In [46]:
# this only needs to be done at the very beginning
br = Browser()
In [47]:
# number of pages of hotel results to scrape
max_pages = 7
url = "http://www.tripadvisor.com/Hotels-g31310-Phoenix_Arizona-Hotels.html"
#####################################################
## do not edit below this line
#####################################################
# more_pages keeps track of whether there are more
# pages of hotel results to scrape for the given city
more_pages = True
# scraping will start on page 1 of the hotel results
page = 1
# open the URL in a browser object:
br.visit(url)
# find the div to enter the date range. This is needed to get pricing info:
date_bar = br.find_by_xpath('//*[contains(@class, "meta_date_wrapper")]')
# find the check in calendar span:
cin_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_in")]/span')[0]
# now click the check_in span to activate it
cin_btn.click()
# select the right calendar div (next month)
rightcal = br.find_by_xpath('//div[contains(@class, "month")]')[1]
# now select the third Friday of next month as the check in date
fri_btn = rightcal.find_by_xpath('table/tbody/tr[3]/td[6]/div')
# and click it
fri_btn.click()
# now choose the next day (saturday) as the check out date
cout_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field check_out")]/span')[0]
cout_btn.click()
leftcal = br.find_by_xpath('//div[contains(@class, "month")]')[0]
sat_btn = leftcal.find_by_xpath('table/tbody/tr[3]/td[7]/div')
sat_btn.click()
print('Dates selected.')
# wait a few seconds for TripAdvisor to retrieve prices
time.sleep(5)
# create a pandas dataframe that will be used for writing
# the results to the DB:
columns = ['hotel_id',
'hotel_url',
'hotel_img_url',
'hotel_name',
'hotel_address',
'hotel_city',
'hotel_state',
'hotel_rating',
'hotel_latitude',
'hotel_longitude',
'hotel_price',
'business_id',
'review_count',
'dog_review_count',
]
bigdf = pd.DataFrame(columns=columns)
# create some lists to fill w. the results from each page
hotel_names = []
links = []
img_url = []
hotel_price = []
business_id = []
print('starting scraper loop.')
while more_pages and page <= max_pages:
    print('*'*75)
    print('Now scraping page {} of {} of the hotel results'.format(page, max_pages))
    print('*'*75)
    # get all the hotel listing divs
    time.sleep(5)
    listing_div = br.find_by_xpath('//*[contains(@class, "hotels_lf_condensed")]')
    xsts1 = br.is_element_present_by_xpath('//*[contains(@class, "photo_booking")]', wait_time=1)
    xsts2 = br.is_element_present_by_xpath('//*[contains(@class, "property_details")]', wait_time=1)
    xsts3 = br.is_element_present_by_xpath('//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]', wait_time=1)
    # keep polling until the listings and the divs needed for parsing have loaded
    while len(listing_div) < 3 or not xsts1 or not xsts2 or not xsts3:
        print('now waiting for the needed divs to load')
        time.sleep(5)
        listing_div = br.find_by_xpath('//*[contains(@class, "hotels_lf_condensed")]')
        xsts1 = br.is_element_present_by_xpath('//*[contains(@class, "photo_booking")]', wait_time=1)
        xsts2 = br.is_element_present_by_xpath('//*[contains(@class, "property_details")]', wait_time=1)
        xsts3 = br.is_element_present_by_xpath('//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]', wait_time=1)
        print('# of listings: {}'.format(len(listing_div)))
        print('photo_booking exists: {}'.format(xsts1))
        print('property_details exists: {}'.format(xsts2))
        print('prw_rup exists: {}'.format(xsts3))
    print('Number of hotel listings on this page: {}'.format(len(listing_div)))
    df = pd.DataFrame(columns=columns)
    for listing in listing_div:
        try:
            biz_id = re.findall(r'hotel_(\d+)', listing['id'])
            if len(biz_id) > 0:
                biz_id = biz_id[0]
            else:
                biz_id = None
            prop = listing.find_by_xpath('div/div/div/div[contains(@class, "property_details")]')
            title = prop.find_by_xpath('div/div[@class="listing_title"]')
            hotel_link = title.find_by_xpath('a')['href']
            hotel_img = prop.find_by_xpath('div[@class="photo_booking"]/div/div/a/img')['src']
            price_text = prop.find_by_xpath('div[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]').text
            price = re.findall(r'(\d+)', price_text)[0]
            print('business_id: {}'.format(biz_id))
            print(title.text)
            print(hotel_link)
            print('Hotel img URL: {}'.format(hotel_img))
            print('Price: ${}'.format(price))
            business_id.append(biz_id)
            hotel_names.append(title.text)
            links.append(hotel_link)
            img_url.append(hotel_img)
            hotel_price.append(price)
        except Exception:
            print('!'*80)
            print('ONE OF THE NEEDED DIVS DOES NOT EXIST!')
            print('!'*80)
        print('*'*50)
    if len(hotel_names) > 0:
        df['hotel_name'] = hotel_names
        df['hotel_price'] = hotel_price
        df['hotel_img_url'] = img_url
        df['hotel_url'] = links
        df['business_id'] = business_id
        bigdf = bigdf.append(df)
    # update the page number
    page += 1
    # if more pages are desired, look for a "next" button
    if page <= max_pages:
        nxt_btn = br.find_by_xpath('//div[contains(@class, "deckTools")]/div[contains(@class, "unified")]/a[contains(@class, "next")]')
        # if there is a next button, click it,
        # else exit the while loop
        if len(nxt_btn) > 0:
            nxt_btn.click()
        else:
            more_pages = False
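Because the per-page lists keep growing across iterations, the same hotel can end up in bigdf more than once; dropping duplicates on business_id before writing is a cheap guard (drop_duplicates is standard pandas):
bigdf = bigdf.drop_duplicates(subset='business_id').reset_index(drop=True)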
In [48]:
len(bigdf)
Out[48]:
In [49]:
len(hotel_names)
Out[49]:
In [50]:
bigdf.head(5)
Out[50]:
In [53]:
engine = cadb.connect_aws_db(write_unicode=True)
bigdf.to_sql('ta_hotels', engine, if_exists='append', index=False)
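A quick sanity check after the write is to read the row count back through the same engine (pd.read_sql is standard pandas):
check = pd.read_sql('SELECT COUNT(*) AS n FROM ta_hotels', engine)
print('rows in ta_hotels: {}'.format(check['n'][0]))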
In [ ]:
In [46]:
print(links[0])
In [47]:
br.visit(links[0])
In [186]:
hotel_url = "http://www.tripadvisor.com/Hotel_Review-g31310-d73905-Reviews-Wyndham_Garden_Phoenix_Midtown-Phoenix_Arizona.html"
In [188]:
br = Browser()
In [189]:
br.visit(hotel_url)
In [190]:
full_reviews = br.find_by_xpath('//div[contains(@class, "reviewSelector")]')
In [191]:
full_reviews
Out[191]:
In [192]:
fullrev = full_reviews[0]
In [193]:
fullrev.find_by_xpath('div/div[contains(@class, "col1of2")]/div[contains(@class, "member_info")]').text
Out[193]:
In [194]:
member_info = fullrev.find_by_xpath('div/div[contains(@class, "col1of2")]/div[contains(@class, "member_info")]')
In [200]:
member_str = member_info.find_by_xpath('div[contains(@class, "memberOverlayLink")]')['id']
In [201]:
member_id = re.findall('UID_(.*)-', member_str)
member_id
Out[201]:
In [39]:
member_info.find_by_xpath('div/div[contains(@class, "username mo")]').text
Out[39]:
In [16]:
review = fullrev.find_by_xpath('div/div[@class="col2of2"]/div[@class="innerBubble"]')
In [17]:
review.find_by_xpath('div/div[contains(@class, "quote")]').text.strip()[1:-1]
Out[17]:
In [18]:
review.find_by_xpath('div/div[contains(@class, "rating")]/span/img')['alt'].split(' ')[0]
Out[18]:
In [19]:
review.find_by_xpath('div/div[contains(@class, "rating")]/span[contains(@class, "ratingDate")]')['title']
Out[19]:
In [20]:
review.find_by_xpath('div/div[contains(@class, "entry")]').text.strip().replace("\n", "")
Out[20]:
In [21]:
fullrev['id']
Out[21]:
In [22]:
br.find_by_xpath('//a[contains(@class, "next")]')
Out[22]:
In [23]:
len(br.find_by_xpath('//a[contains(@class, "next")]')) > 0
Out[23]:
In [24]:
br.find_by_xpath('//a[contains(@class, "next")]')['href']
Out[24]:
In [62]:
hotel_address = br.find_by_xpath('//span[contains(@class, "street-address")]').text
hotel_address
Out[62]:
In [63]:
#city
hotel_city = br.find_by_xpath('//span[contains(@property, "addressLocality")]').text
hotel_city
Out[63]:
In [64]:
#state
hotel_state = br.find_by_xpath('//span[contains(@property, "addressRegion")]').text
hotel_state
Out[64]:
In [65]:
#zip code
hotel_zip = br.find_by_xpath('//span[contains(@property, "postalCode")]').text
hotel_zip
Out[65]:
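The four address lookups above can be bundled into one helper that returns a dict; a sketch reusing the same spans (the function name and dict keys are mine):
def get_hotel_address(br):
    """Collect the address microdata shown on a hotel's review page."""
    return {
        'hotel_address': br.find_by_xpath('//span[contains(@class, "street-address")]').text,
        'hotel_city': br.find_by_xpath('//span[contains(@property, "addressLocality")]').text,
        'hotel_state': br.find_by_xpath('//span[contains(@property, "addressRegion")]').text,
        'hotel_zip': br.find_by_xpath('//span[contains(@property, "postalCode")]').text,
    }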
In [204]:
for fullrev in full_reviews:
    # user name:
    member_info = fullrev.find_by_xpath('div/div[contains(@class, "col1of2")]/div[contains(@class, "member_info")]')
    member_str = member_info.find_by_xpath('div[contains(@class, "memberOverlayLink")]')['id']
    member_id = re.findall(r'UID_(.*)-', member_str)[0]
    usrnm = member_info.find_by_xpath('div/div[contains(@class, "username mo")]')
    review = fullrev.find_by_xpath('div/div[@class="col2of2"]/div[@class="innerBubble"]')[0]
    title = review.find_by_xpath('div/div[contains(@class, "quote")]').text.strip()[1:-1]
    rating = review.find_by_xpath('div/div[contains(@class, "rating")]/span/img')['alt'].split(' ')[0]
    date = review.find_by_xpath('div/div[contains(@class, "rating")]/span[contains(@class, "ratingDate")]')['title']
    rev = review.find_by_xpath('div/div[contains(@class, "entry")]').text.strip().replace("\n", "")
    if len(usrnm) > 0:
        #location = member_info.xpath('div[1]')[0].text_content()
        print('Username: {}'.format(str(usrnm[0].text).strip()))
    else:
        print('Username: A Trip Advisor Member')
    print('Member id: {}'.format(member_id))
    location = member_info.find_by_xpath('div[contains(@class, "location")]')
    if len(location) > 0:
        print('Location: {}'.format(str(location[0].text).strip()))
    else:
        print('Location: ')
    print('full review_id: {}'.format(fullrev['id']))
    try:
        rev_id = re.search(r'review_(\d+)$', fullrev['id']).group(1)
    except AttributeError:
        rev_id = ''
    print('review_id: {}'.format(rev_id))
    print('Title: {}'.format(title))
    print('Rating: {}'.format(rating))
    print('Date: {}'.format(date))
    print('Review:')
    print(rev)
    print('*'*50)
In [205]:
def return_results(url, page):
    br.visit(url)
    full_reviews = br.find_by_xpath('//div[contains(@class, "reviewSelector")]')
    page_usernames = []
    page_memberids = []
    page_locations = []
    page_titles = []
    page_ratings = []
    page_dates = []
    page_reviews = []
    page_review_ids = []
    for fullrev in full_reviews:
        # user name:
        member_info = fullrev.find_by_xpath('div/div[contains(@class, "col1of2")]/div[contains(@class, "member_info")]')
        member_str = member_info.find_by_xpath('div[contains(@class, "memberOverlayLink")]')['id']
        member_id = re.findall(r'UID_(.*)-', member_str)[0]
        usrnm = member_info.find_by_xpath('div/div[contains(@class, "username mo")]')
        review = fullrev.find_by_xpath('div/div[@class="col2of2"]/div[@class="innerBubble"]')[0]
        title = review.find_by_xpath('div/div[contains(@class, "quote")]').text.strip()[1:-1]
        rating = review.find_by_xpath('div/div[contains(@class, "rating")]/span/img')['alt'].split(' ')[0]
        date = review.find_by_xpath('div/div[contains(@class, "rating")]/span[contains(@class, "ratingDate")]')['title']
        rev = review.find_by_xpath('div/div[contains(@class, "entry")]').text.strip().replace("\n", "")
        if len(usrnm) > 0:
            username = str(usrnm[0].text).strip()
            print('Username: {}'.format(username))
        else:
            username = 'A Trip Advisor Member'
            print('Username: A Trip Advisor Member')
        locationel = member_info.find_by_xpath('div[contains(@class, "location")]')
        if len(locationel) > 0:
            location = str(locationel[0].text).strip()
            print('Location: {}'.format(location))
        else:
            location = ''
            print('Location: ')
        print('full review_id: {}'.format(fullrev['id']))
        try:
            rev_id = re.search(r'review_(\d+)$', fullrev['id']).group(1)
        except AttributeError:
            rev_id = ''
        # print('review_id: {}'.format(rev_id))
        # print('Title: {}'.format(title))
        # print('Rating: {}'.format(rating))
        # print('Date: {}'.format(date))
        # print('Review:')
        # print(rev)
        # print('*'*50)
        page_usernames.append(username)
        page_memberids.append(member_id)
        page_locations.append(location)
        page_titles.append(title)
        page_ratings.append(rating)
        page_dates.append(date)
        page_reviews.append(rev)
        page_review_ids.append(rev_id)
    if len(br.find_by_xpath('//a[contains(@class, "next")]')) > 0:
        url = br.find_by_xpath('//a[contains(@class, "next")]')['href']
        more_reviews = True
        page += 1
        # print('url and page updated.')
    else:
        more_reviews = False
    ret_dict = {'usrnms': page_usernames,
                'mmbrids': page_memberids,
                'locs': page_locations,
                'ttls': page_titles,
                'rtngs': page_ratings,
                'dts': page_dates,
                'rvws': page_reviews,
                'revids': page_review_ids,
                'url': url,
                'more_reviews': more_reviews,
                'page': page}
    return ret_dict
In [30]:
idx = 0
usernames = []
memberids = []
locations = []
titles = []
ratings = []
dates = []
reviews = []
review_ids = []
columns = ['review_id',
'hotel_id',
'hotel_name',
'business_id',
'biz_review_id',
'biz_member_id',
'username',
'review_title',
'review_rating',
'review_text',
'review_date']
bigdf = pd.DataFrame(columns=columns)
url = links[idx]
hotel_name = hotel_names[idx]
more_reviews = True
page = 1
while more_reviews:
    #print('*'*50)
    print('*'*50)
    print('Now on page {}'.format(page))
    #print('*'*50)
    df = pd.DataFrame(columns=columns)
    ret_dict = return_results(url, page)
    print(ret_dict['locs'])
    print(ret_dict['ttls'])
    df['biz_review_id'] = ret_dict['revids']
    df['biz_member_id'] = ret_dict['mmbrids']
    df['username'] = ret_dict['usrnms']
    df['review_title'] = ret_dict['ttls']
    df['review_rating'] = ret_dict['rtngs']
    df['review_date'] = ret_dict['dts']
    df['review_text'] = ret_dict['rvws']
    df['hotel_name'] = hotel_name
    url = ret_dict['url']
    more_reviews = ret_dict['more_reviews']
    page = ret_dict['page']
    print('successfully completed page {}'.format(page))
    bigdf = bigdf.append(df)
    # stop after the first page of reviews for now
    more_reviews = False
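Once the review frame is built it can be written out the same way as the hotels table; a sketch, with the table name 'ta_reviews' assumed here since the actual reviews table is not shown in this notebook:
engine = cadb.connect_aws_db(write_unicode=True)
# 'ta_reviews' is an assumed table name -- substitute the real reviews table
bigdf.to_sql('ta_reviews', engine, if_exists='append', index=False)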
In [29]:
hotel_names
Out[29]:
In [31]:
df
Out[31]:
In [225]:
engine = cadb.connect_aws_db(write_unicode=True)
In [226]:
conn = engine.connect()
In [237]:
city = 'new haven'
state = 'ct'
In [238]:
cmd = "SELECT hotel_url, hotel_name FROM ta_hotels WHERE "
cmd += "hotel_city='"+city.lower()+"' AND "
cmd += "hotel_state='"+state.lower()+"';"
In [239]:
print(cmd)
In [240]:
conn.execute(cmd)
Out[240]:
In [241]:
result = engine.execute(cmd)
In [242]:
result
Out[242]:
In [243]:
for row in result:
    print(row['hotel_name'], row['hotel_url'])
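Building the query by string concatenation works, but binding the city and state as parameters avoids quoting problems; a sketch using sqlalchemy.text, which should return the same rows as the hand-built string above:
from sqlalchemy import text

query = text("SELECT hotel_url, hotel_name FROM ta_hotels "
             "WHERE hotel_city = :city AND hotel_state = :state")
result = conn.execute(query, {'city': city.lower(), 'state': state.lower()})
for row in result:
    print(row['hotel_name'], row['hotel_url'])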
In [ ]: