In [96]:
from splinter.browser import Browser
import connect_aws_db as cadb
from splinter_scrape_ta_reviews import get_hotel_urls
from splinter_scrape_ta_reviews import scrape_hotel
from splinter_scrape_ta_reviews import get_done_business_ids
from splinter_scrape_ta_reviews import remove_duplicates
from splinter_scrape_ta_reviews import get_biz_review_ids
import pandas as pd
import numpy as np
In [97]:
%load_ext autoreload
In [98]:
%autoreload 2
In [4]:
#import sys
#reload(sys)
#sys.setdefaultencoding('utf8')
In [5]:
# City and state handed to the lookup helpers in the cells below.
city, state = 'new_haven', 'ct'
In [99]:
# Database handle passed to every lookup helper and to the to_sql write in this notebook.
# NOTE(review): the effect of write_unicode=True is defined inside connect_aws_db,
# which is not visible here — confirm before changing it.
engine = cadb.connect_aws_db(write_unicode=True)
In [7]:
# Hotel records for the chosen city/state. Later cells unpack each entry as
# (hotel_id, biz_id, link).
blinks = get_hotel_urls(city, state, engine)
In [8]:
# splinter Browser instance; it is passed to scrape_hotel further down.
br = Browser()
In [8]:
# List every hotel record so the ids and links can be checked by eye.
# The loop body must be indented; at column 0 the cell is an IndentationError.
for hotel_id, biz_id, link in blinks:
    print(hotel_id, biz_id, link)
In [10]:
blinks[0][2]
Out[10]:
In [12]:
# Ids returned by get_done_business_ids for this city. A later cell checks whether
# int(blinks[0][1]) is absent from this collection.
donebids = get_done_business_ids(city, engine)
In [13]:
donebids
Out[13]:
In [16]:
blinks[0][1]
Out[16]:
In [17]:
int(blinks[0][1]) not in donebids
Out[17]:
In [50]:
dt = [(None, 13923, u'81644', u'315150463', u'BDB26AE47382E5E2393FAE326E676068', u'Paul R', u'Poor Service Plus Billing Issues', u'1', u"I'm a an Elite member (80+ nights/year). I will no longer stay at this hotel, nor will I have any of my organization stay there. The General Manager is Marco Quniteros and the Guest Care Manager is Luis Sandoval. Poor Service: I traveled there on business and arrived arrived late around 8pm. There were 2 check in desks, the one...More", u'September 30, 2015'), (None, 13923, u'81644', u'312887596', u'19826BDC0584E92ADB07FD0B5DBC60AF', u'JJJ4762', u'Wonderful service, beautiful clean room, close to airport', u'5', u'Booked one night at this Courtyard as my flight was scheduled for the following morning. Comfortable recently renovated lobby, terrific room and a good breakfast but the highlight of my stay was the warm personal touches from the staff. In particular I want to mention Ana, at the front desk. After a very long couple of days of travel, the...More', u'September 22, 2015'), (None, 13923, u'81644', u'307031384', u'655AFE999E10FFB21C23BFB823B1B0D8', u'Maureen F', u'Close to airport.', u'5', u'Usual Courtyard hotel. Staff was friendly and helpful. Location is central to airport but a distance from downtown shopping and eating areas. There was a great chicken place across the street. If you are looking for a location that is away from the hustle bustle of San Jose, this Courtyard Marriott is the place for you. My stay was a...More', u'September 4, 2015'), (None, 13923, u'81644', u'303857639', u'CAB27AA0C676D42947E8AEA5D326BF1D', u'MSir', u'good option for layover', u'4', u"clean, quiet comfortable rooms. Close to the airport, free shuttle. Surprisingly good food for breakfast. We would definitely stay here again. Can't speak to the value as we used American miles. The staff was friendly. 
Although supposedly you can drink the water, if you opt to buy it, hit the Walmart that is literally right next door.", u''), (None, 13923, u'81644', u'300634339', u'3762AD12C96D5DA585F6E6F015305672', u'jooliecoolie', u'Fantastic Sleep Quality', u'5', u"My husband and I stayed here one night for a wedding. I wish we could have stayed for another night. The pool and patio area were beautiful. The room was spacious and clean and the bed was heavenly. We slept so great with the blackout curtains that we didn't wake up until cleaning services knocked on our door! We were...More", u''), (None, 13923, u'81644', u'298361218', u'C50E11B994B8FBE8B0C821717A5F9BC4', u'Phil d', u'Great Stay', u'5', u'I stayed at this hotel with a colleague for a weekend conference. We found the hotel very clean and comfortable! Everything was as expected from a Marriott. Although we did not have breakfast included in our room rate we mentioned on checkout that we would liked to have had breakfast included. The manager on duty promptly gave us a complimentary...More', u''), (None, 13923, u'81644', u'297368772', u'1B8283DF302E9D87CF054E56F3B4DB2D', u'LeeMTucson_Arizona', u'A very good alternative', u'4', u"Close to the airport. Close to the expressway exit. Still plenty quiet. Nice rooms. Nice pool. Casual dining downstairs and a good alternative restaurant within walking distance. I couldn't ask more from a close to the airport hotel. Can you want to the airport. I'd say it was a little too far. And if you are going to drop off...More", u''), (None, 13923, u'81644', u'296585304', u'CBD5A992B6BE37F7A24F37C771309082', u'Mal07', u'Good location', u'4', u"Stayed here 5 nights for a business trip, a good clean hotel with larger than average room sizes. Close to San Jose airport and the 101 for getting around the Valley. 
Only downside is there is no restaurant (But you can get hot food at the bar / coffee shop, but not a decent meal) but this isn't a major...More", u''), (None, 13923, u'81644', u'168426305', u'A8C4C9701BC5573606B61E2B7F41FB3C', u'Alex-Lava...', u"\xc0 2 minutes de l'a\xe9roport", u'4', u'Hotel tr\xe8s bien comme la plupart des Courtyard. Id\xe9al pour une nuit avant ou apr\xe8s un vol. Shuttle prends 2...more', u''), (None, 13923, u'81644', u'167089751', u'B7A15AB21AAB5264F398381408221ECE', u'Maricarme...', u'Excelente punto central en nuestro caso.', u'5', u'Estuvimos cuatro noches, nos qued\xf3 muy bien ubicado ya que estuvimos yendo a San Francisco y a Monterey California. No...more', u'')]
In [52]:
len(dt)
Out[52]:
In [53]:
dtdf = pd.DataFrame(dt)
In [64]:
# Name the ten positional fields of each review tuple loaded into dtdf.
dtdf.columns = [
    # identifiers
    'review_id', 'hotel_id', 'business_id', 'biz_review_id', 'biz_member_id',
    # reviewer and review content
    'username', 'review_title', 'review_rating', 'review_text', 'review_date',
]
In [65]:
dtdf.head(3)
Out[65]:
In [90]:
xstng_revs = get_biz_review_ids(city, engine)
In [91]:
len(xstng_revs)
Out[91]:
In [95]:
# Find where review id 312887596 (the id in the second tuple of `dt`) occurs among
# the existing review ids.
# NOTE(review): assumes xstng_revs is a NumPy array so `==` is element-wise; with a
# plain list the comparison is a single False — confirm the helper's return type.
np.where(xstng_revs == 312887596)
Out[95]:
In [70]:
dtdf.columns
Out[70]:
In [77]:
# The ids in `dt` are unicode strings; cast the column to 64-bit integers
# (presumably so it can be matched against integer review ids — confirm).
# Series.astype is the documented conversion path and keeps the row index,
# unlike calling the np.int64 scalar constructor on the raw value array.
dtdf['biz_review_id'] = dtdf['biz_review_id'].astype(np.int64)
In [81]:
dtdf['biz_review_id']
Out[81]:
In [83]:
# Find review id 312887596 among the existing review ids.
# Written without the Python-2-only `L` long suffix: it is a SyntaxError on
# Python 3 and unnecessary for a value this small on Python 2.
np.where(xstng_revs == 312887596)
Out[83]:
In [86]:
# Pass the sample reviews through remove_duplicates; the cells that follow compare
# len(dtdf) with len(dtdf2).
# NOTE(review): the city is hard-coded as 'san_jose' while `city` is set to
# 'new_haven' and get_biz_review_ids was called with `city` — confirm which is intended.
dtdf2 = remove_duplicates(dtdf, 'san_jose', engine)
In [87]:
len(dtdf)
Out[87]:
In [88]:
len(dtdf2)
Out[88]:
In [ ]:
In [28]:
susrnm = ' C太郎'
In [29]:
type(susrnm)
Out[29]:
In [32]:
susrnm
Out[32]:
In [34]:
# Python 2 idiom: decode the byte string as UTF-8 ('ignore' drops undecodable
# bytes instead of raising), then trim surrounding whitespace.
susrnm.decode('utf-8', 'ignore').strip()
Out[34]:
In [36]:
print(susrnm.decode('utf-8', 'ignore').strip())
In [27]:
# NOTE(review): `usrnm` is not assigned in any cell visible here, so this raises
# NameError on a fresh kernel — TODO restore its definition or remove the cell.
# On Python 2, str() of a unicode value containing non-ASCII characters raises
# UnicodeEncodeError, which may be what this cell was probing.
str(usrnm[0]).strip()
Out[27]:
In [24]:
# Scrape the first hotel record: element 2 of the entry (unpacked as `link` in the
# listing loop) plus the browser and database handles are handed to scrape_hotel.
# The result is used as a DataFrame below (len, head, drop, to_sql).
bigdf = scrape_hotel(blinks[0][2], br, engine)
In [25]:
len(bigdf)
Out[25]:
In [31]:
# Tag every scraped row with the hotel's business id. Entries of `blinks` unpack
# as (hotel_id, biz_id, link) and the done-ids check uses blinks[0][1], so the
# business id is element 1; element 0 is the hotel id.
bigdf['business_id'] = blinks[0][1]
In [32]:
# Drop the hotel_name column (this column does not exist in future versions).
# Guarded so the cell is safe to re-run and also works when the column is
# already absent; rebinding instead of inplace=True keeps the step idempotent.
if 'hotel_name' in bigdf.columns:
    bigdf = bigdf.drop('hotel_name', axis=1)
In [34]:
# Append the scraped reviews to the `ta_reviews` table; index=False keeps the
# DataFrame index out of the table.
# NOTE: if_exists='append' adds rows on every run, so re-running this cell inserts
# the same reviews again unless the table itself enforces uniqueness.
bigdf.to_sql('ta_reviews', engine, if_exists='append', index=False)
In [30]:
bigdf.head(5)
Out[30]:
In [100]:
dt = [(None, 13059, u'2192761', u'315810623', u'7D2C28361F65B09D6E36A78F046DE51E', u'Influencer', u'Exceptional Business and Convention Hotel', u'5', u"I cannot say enough about how much I've enjoyed staying at the Omni Dallas for my business trips to Dallas. The interior design, cleanliness, service, food, amenities--all of these features, and much more, are delivered in grand style and with no headaches for the weary business traveler. Although there are many great hotels in Dallas, the Omni is at the...More", u'October 3, 2015'), (None, 13059, u'2192761', u'315687003', u'01F5D2EB9E9446BCAFB454915CCC59CB', u'nofatboys', u'Not enough staff disorganized', u'2', u"Waited about 20 minutes in line to valet my car. No one available to help with bags. Waited another 10 minutes in to check in. Came to my room and it still had someone's room service just outside my door. Just disappointing all the way around", u'October 2, 2015'), (None, 13059, u'2192761', u'315739158', u'E0198AEC18F96F6EB0C7FB469290C147', u'Tina G', u'first stay in Dallas', u'5', u'From the front desk to the Spice restaurant to the bell hop everyone was extremely nice and welcoming we drove 10 hours and arrived at about noon the hotel was able to check us in early and placed us in a room that was high and quiet just what we wanted!', u'October 2, 2015'), (None, 13059, u'2192761', u'315739154', u'DAAA1F239CD5D5D50806D5FF6AA17FB1', u'danielled643', u'Fabulous', u'5', u'From the moment we pulled up to the moment we left, this hotel property was fabulous. They did everything right!! The staff was friendly and helpful. The rooms were perfect Everything was perfect. Thanks OMNI', u'October 2, 2015'), (None, 13059, u'2192761', u'315462377', u'78536447E73E770AC6F0F6CEB8C1FFE9', u'jhaensel', u'Perfect Hotel!!!', u'5', u'The Omni Dallas is very close to the Convention Center and to Downtown Dallas. Some very good restaurants and friendly staff. 
The rooms are nice, very clean and the view on the city is just outstanding. I would absolutely recommend this hotel for your stay in Dallas.', u'October 1, 2015'), (None, 13059, u'2192761', u'315514245', u'A99AF65C69AC37D84B45195FA212D9EF', u'rhtwo', u'Could Have Been Perfect, But', u'4', u'I made my reservation on a third party website. When I called the Omni Dallas to ask if my room was a nonsmoking one as requested I was told that because I had used a "discount" site I would need to contact the site to confirm any reservation details. The hotel confirmation I received indicated that all details were to...More', u'October 1, 2015'), (None, 13059, u'2192761', u'315514212', u'064FF74DFBF361A4463973ACF34F45F7', u'Julie T', u'Great for Business and Pleasure', u'5', u"Always stay here when I am in Dallas for business. Have also stayed for pleasure. The rooms are nice and well appointed including robes! You can't beat the awesome pool with a view of the city after a long day of work! If you like sports, you can't beat the sports bar with a ton of TVs!", u'October 1, 2015'), (None, 13059, u'2192761', u'315170219', u'75BFE1BE738E4A6760038A05F60306F5', u'Charles M', u'Beautiful hotel, top notch!', u'5', u'The city view room was amazing. The pool was perfect and great Margaritas from the pool bar. All the restaurants were very good. The staff was very friendly and helpful. 
The close proximity to the sites we were there to see was very helpful.', u'September 30, 2015'), (None, 13059, u'2192761', u'127734410', u'E50EC57CEE86E2773C70951D064BBFB7', u'tomypeck', u'Situation ideal y espectaculo de luces.', u'5', u'Con motivo de la celebracion de la Media Maraton de Dallas el pasado 25 de Marzo, nos alojamos mi mujer...more', u''), (None, 13059, u'2192761', u'127290453', u'F4415CE5A0677DC55E7E6910FADAD6A4', u'btebte', u'tres bon hotel que ce soit pour affaire ou pour le tourisme', u'4', u'tres bien plac\xe9 a deux pas du centre de dallas belles prestations dans cet hotel moderne et tout neuf des...more', u'')]
In [101]:
dtdf = pd.DataFrame(dt)
In [102]:
# Name the ten positional fields of each review tuple loaded into dtdf.
dtdf.columns = [
    # identifiers
    'review_id', 'hotel_id', 'business_id', 'biz_review_id', 'biz_member_id',
    # reviewer and review content
    'username', 'review_title', 'review_rating', 'review_text', 'review_date',
]
In [105]:
dtdf['biz_review_id'].values
Out[105]:
In [106]:
# The ids in `dt` are unicode strings; cast the column to 64-bit integers.
# Series.astype is the documented conversion path and keeps the row index,
# unlike calling the np.int64 scalar constructor on the raw value array.
dtdf['biz_review_id'] = dtdf['biz_review_id'].astype(np.int64)
In [107]:
dtdf['biz_review_id'].values
Out[107]:
In [ ]: