In [1]:
import pandas as pd
import numpy as np
In [2]:
from splinter.browser import Browser
In [3]:
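# launch a splinter-driven browser (Browser() defaults to the Firefox WebDriver)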
br = Browser()
In [4]:
br.visit('http://www.bringfido.com/lodging/city/san_jose_ca_us/')
In [5]:
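# each lodging result is a direct child <div> of the #results_list container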
archive_links = br.find_by_xpath('//*[@id="results_list"]/div')
In [6]:
one_link = archive_links[1]
In [7]:
one_link.text
Out[7]:
In [8]:
one_link.find_by_xpath('div/h1/a').value
Out[8]:
In [9]:
one_link.text
Out[9]:
In [10]:
one_link.find_by_xpath('div/h1/a')['href']
Out[10]:
In [21]:
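# the listing's DOM id ends with the site's business id (e.g. an id of the form 'xxx_12345' yields '12345'; the exact prefix is assumed)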
one_link['id'].split('_')[-1]
Out[21]:
In [30]:
one_link.find_by_xpath('div/div[@class="photo_inner"]/a/img')['src']
Out[30]:
In [14]:
hotel_names = []
text_summaries = []
links = []
biz_ids = []
for lnk in archive_links:
    hotel_names.append(lnk.find_by_xpath('div[2]/h1/a').value)
    biz_ids.append(lnk['id'].split('_')[-1])
    text_summaries.append(lnk.text)
    this_link = lnk.find_by_xpath('div/h1/a')['href']
    links.append(this_link)
In [15]:
links
Out[15]:
In [16]:
biz_ids
Out[16]:
In [19]:
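# count the pagination buttons to see how many pages of results there are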
len(br.find_by_xpath('//*[@id="results_paging_controls_bottom"]/span'))
Out[19]:
In [21]:
button = br.find_by_id('page_2')[0]
In [22]:
button
Out[22]:
In [18]:
button.click()
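Clicking page_2 above only advances one page by hand. A loop over every results page might look like the following sketch (assuming the later buttons keep the page_N id convention seen above):

# hypothetical walk across all result pages (ids page_2, page_3, ... assumed)
all_listings = list(br.find_by_xpath('//*[@id="results_list"]/div'))
page_num = 2
while len(br.find_by_id('page_{}'.format(page_num))) > 0:
    br.find_by_id('page_{}'.format(page_num))[0].click()
    # a real run may need a brief wait here for the results to refresh
    all_listings.extend(br.find_by_xpath('//*[@id="results_list"]/div'))
    page_num += 1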
In [13]:
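# prototype the per-hotel scrape on one detail page before looping over all of them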
url_base = 'http://www.bringfido.com'
hotel_id = 0
link = links[1]
br.visit(link)
In [14]:
hotel_description = br.find_by_xpath('//*[@class="body"]').text
In [15]:
hotel_description
Out[15]:
In [16]:
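# the address block is a single text blob: street on line 1, city/state/country/zip on line 2, phone on line 3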
details = br.find_by_xpath('//*[@class="address"]').text.split('\n')
In [17]:
details
Out[17]:
In [18]:
reviews = br.find_by_xpath('//*[@class="review_container"]')
In [19]:
reviews
Out[19]:
In [20]:
rev = reviews[0]
In [21]:
# title:
rev.find_by_xpath('div/div[1]').text
Out[21]:
In [22]:
# author
rev.find_by_xpath('div/div[2]').text
Out[22]:
In [23]:
rev.find_by_xpath('div/div[3]').text
Out[23]:
In [24]:
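# the star rating is only exposed via the rating image's filename (e.g. a src ending in '4.gif' for four stars; the naming is assumed from the observed URLs), so the first character of the filename gives the 1-5 rating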
rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1]
Out[24]:
In [25]:
columns = ['hotel_id',
           'hotel_url',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'business_id',
           'review_id',
           'user_id',
           'username',
           'review_title',
           'review_text',
           'review_rating',
           'review_date']
bigdf = pd.DataFrame(columns=columns)
for hotel_id, link in enumerate(links):
    print('*'*50)
    print('Now on {}'.format(link))
    print('*'*50)
    br.visit(link)
    df = pd.DataFrame(columns=columns)
    hotel_description = br.find_by_xpath('//*[@class="body"]').text
    # scrape the address details section of the page
    details = br.find_by_xpath('//*[@class="address"]').text.split('\n')
    # now get just the street address:
    address = details[0]
    # and just the city, state, country, and zip code:
    csczip = details[1]
    # and just the phone number:
    phone = details[2]
    # now separate the city, state, and zip:
    city, state, zipcode = csczip.strip().split(',')
    zipcode = zipcode[3:]
    # grab every review container on the page
    reviews = br.find_by_xpath('//*[@class="review_container"]')
    texts = []
    titles = []
    authors = []
    ratings = []
    print(reviews)
    print('')
    for rev in reviews:
        titles.append(rev.find_by_xpath('div/div[1]').text)
        authors.append(rev.find_by_xpath('div/div[2]').text)
        texts.append(rev.find_by_xpath('div/div[3]').text)
        ratings.append(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])
        print(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])
    df['review_title'] = titles
    df['username'] = authors
    df['review_text'] = texts
    df['review_rating'] = ratings
    df['hotel_id'] = hotel_id
    df['hotel_name'] = hotel_names[hotel_id]
    df['hotel_url'] = link
    df['hotel_address'] = address
    df['hotel_city'] = city
    df['hotel_state'] = state
    df['hotel_rating'] = np.mean([int(rat) for rat in ratings])
    df['hotel_latitude'] = ''
    df['hotel_longitude'] = ''
    df['review_count'] = len(texts)
    df['review_id'] = 0
    df['user_id'] = 0
    bigdf = bigdf.append(df)
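The per-review extraction in this loop is repeated verbatim for Palo Alto below; a small helper (hypothetical name parse_reviews, reusing the XPaths above) would capture it once:

def parse_reviews(review_elems):
    # pull title, author, body text, and star rating out of each review container
    rows = []
    for rev in review_elems:
        rows.append({
            'review_title': rev.find_by_xpath('div/div[1]').text,
            'username': rev.find_by_xpath('div/div[2]').text,
            'review_text': rev.find_by_xpath('div/div[3]').text,
            'review_rating': rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1],
        })
    return pd.DataFrame(rows)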
In [26]:
bigdf
Out[26]:
In [27]:
len(bigdf)
Out[27]:
In [31]:
import sqlalchemy
import connect_aws_db as cadb
In [41]:
engine = cadb.connect_aws_db(write_unicode=True)
In [42]:
conn = engine.connect()
In [43]:
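# drop any previous copy of the table first (DROP TABLE IF EXISTS would avoid an error when it does not exist)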
cmd = "DROP TABLE bf_reviews"
result = conn.execute(cmd)
In [44]:
cmd = """
CREATE TABLE bf_reviews
(
review_id MEDIUMINT AUTO_INCREMENT,
hotel_id VARCHAR(256),
hotel_url VARCHAR(512),
hotel_img_url VARCHAR(512),
business_id VARCHAR(256),
user_id MEDIUMINT,
username VARCHAR(128),
review_title VARCHAR(256),
review_rating INT,
review_text VARCHAR(5000),
PRIMARY KEY (review_id)
)
"""
In [45]:
result = conn.execute(cmd)
In [34]:
bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                       'username', 'review_title', 'review_text', 'review_rating']].copy()
In [35]:
bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)
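Note that every review_id in the frame is 0; under MySQL's default SQL mode, inserting 0 into an AUTO_INCREMENT primary key makes the server generate the next id rather than storing the zero, so the appended rows still get distinct keys.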
In [46]:
cmd = "DROP TABLE bf_hotels"
result = conn.execute(cmd)
In [47]:
cmd = """
CREATE TABLE bf_hotels
(
hotel_id MEDIUMINT AUTO_INCREMENT,
hotel_url VARCHAR(512),
hotel_img_url VARCHAR(512),
hotel_name VARCHAR(512),
hotel_address VARCHAR(1024),
hotel_city VARCHAR(512),
hotel_state VARCHAR(32),
hotel_rating INT(11),
hotel_latitude FLOAT,
hotel_longitude FLOAT,
business_id VARCHAR(256),
review_count MEDIUMINT,
PRIMARY KEY (hotel_id)
)
"""
In [48]:
result = conn.execute(cmd)
In [39]:
#page_1_url = "http://www.bringfido.com/lodging/city/phoenix_az_us/?arrival=None&adults=1&fee_under_25=False&rate_low=0&multiple_allowed=False&departure=None&dogs=0&rate_high=1000&rating_high=5&rooms=1&rating_low=0&large_allowed=False&nofee=False&deposit_only=False&type=&children=0&location=Phoenix%2C+AZ%2C+US"
# palo alto
br.visit('http://www.bringfido.com/lodging/city/palo_alto_ca_us')
In [40]:
archive_links = br.find_by_xpath('//*[@id="results_list"]/div')
hotel_names = []
text_summaries = []
links = []
for lnk in archive_links:
    hotel_names.append(lnk.find_by_xpath('div[2]/h1/a').value)
    text_summaries.append(lnk.text)
    this_link = lnk.find_by_xpath('div/h1/a')['href']
    links.append(this_link)
links
Out[40]:
In [41]:
columns = ['hotel_id',
           'hotel_url',
           'hotel_name',
           'hotel_address',
           'hotel_city',
           'hotel_state',
           'hotel_rating',
           'hotel_latitude',
           'hotel_longitude',
           'review_count',
           'business_id',
           'review_id',
           'user_id',
           'username',
           'review_title',
           'review_text',
           'review_rating',
           'review_date']
bigdf = pd.DataFrame(columns=columns)
for hotel_id, link in enumerate(links):
    print('*'*50)
    print('Now on {}'.format(link))
    print('*'*50)
    br.visit(link)
    df = pd.DataFrame(columns=columns)
    hotel_description = br.find_by_xpath('//*[@class="body"]').text
    # scrape the address details section of the page
    details = br.find_by_xpath('//*[@class="address"]').text.split('\n')
    # now get just the street address:
    address = details[0]
    # and just the city, state, country, and zip code:
    csczip = details[1]
    # and just the phone number:
    phone = details[2]
    # now separate the city, state, and zip:
    city, state, zipcode = csczip.strip().split(',')
    zipcode = zipcode[3:]
    # grab every review container on the page
    reviews = br.find_by_xpath('//*[@class="review_container"]')
    texts = []
    titles = []
    authors = []
    ratings = []
    print(reviews)
    print('')
    for rev in reviews:
        titles.append(rev.find_by_xpath('div/div[1]').text)
        authors.append(rev.find_by_xpath('div/div[2]').text)
        texts.append(rev.find_by_xpath('div/div[3]').text)
        ratings.append(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])
        print(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])
    df['review_title'] = titles
    df['username'] = authors
    df['review_text'] = texts
    df['review_rating'] = ratings
    df['hotel_id'] = hotel_id
    df['hotel_name'] = hotel_names[hotel_id]
    df['hotel_url'] = link
    df['hotel_address'] = address
    df['hotel_city'] = city
    df['hotel_state'] = state
    df['hotel_rating'] = np.mean([int(rat) for rat in ratings])
    df['hotel_latitude'] = ''
    df['hotel_longitude'] = ''
    df['review_count'] = len(texts)
    df['review_id'] = 0
    df['user_id'] = 0
    bigdf = bigdf.append(df)
In [42]:
bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                       'username', 'review_title', 'review_text', 'review_rating']].copy()
bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)
In [ ]: