classify reviews

This notebook describes the binary classification of Yelp hotel reviews on whether or not they are dog related.


In [1]:
import numpy as np
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

import pandas as pd
import connect_aws_db as cadb

In [2]:
%matplotlib inline

Connect to DB


In [2]:
engine = cadb.connect_aws_db(write_unicode=True)

Restore BF Reviews


In [3]:
cmd = "SELECT review_id, review_rating, review_text FROM bf_reviews"

In [4]:
bfdf = pd.read_sql_query(cmd, engine)

In [5]:
print(len(bfdf))
bfdf.head(5)


3039
Out[5]:
review_id review_rating review_text
0 1 2 When you first go to your room you notice the ...
1 2 2 We were going to the Pre-Westminster event tha...
2 3 2 While the room was not the fanciest that one c...
3 4 1 I stayed at the Hotel Penn in Manhattan becaus...
4 5 5 We loved this hotel! The are very friendly and...

In [7]:
len(bfdf[bfdf['review_text'].str.len() > 500])


Out[7]:
622

In [9]:
num_cities = 'all'
if num_cities is 'all':
    print('hello')


hello

Restore Yelp Reviews


In [7]:
cmd = "SELECT * FROM yelp_reviews"

In [8]:
yelpdf = pd.read_sql_query(cmd, engine)

In [9]:
print(len(yelpdf))
yelpdf.head(5)


6263
Out[9]:
rev_id business_id yelp_review_date yelp_review_id review_rating review_text user_id review_category
0 1 bWWrrsPWuoHuAGUCgH8Tyg 2007-10-28 TMiAAS5RB-P5EwXs7B9aBw 4 I stayed at the hotel for several months durin... ZqoMKTrJOtyYZOcvP8rTUA general
1 2 bWWrrsPWuoHuAGUCgH8Tyg 2008-01-05 F78V6FFuUr9pawcok4YRyg 5 I have been coming to Pittsburgh for quite a w... uiMTavYKLw9hxskV4xB2tg general
2 3 bWWrrsPWuoHuAGUCgH8Tyg 2008-03-08 g6vMKao00XSECOd6JtKPEA 4 This is a very good hotel and with a corporate... TckShYnQa0eD3WQxdoE3GQ general
3 4 bWWrrsPWuoHuAGUCgH8Tyg 2008-03-29 PNS9z4aFDbfhpIsHCfNvNg 4 i got upgraded to a junior suite, and then aga... MquuHY8ar78FUjkbcmVyPw general
4 5 bWWrrsPWuoHuAGUCgH8Tyg 2008-07-07 gyYXmBwP3jSinLuXtuGRuQ 5 This is a fantastic hotel. I went to a conven... pzhiDEp8EFltFmdldP9Oow general

In [10]:
yelp_review_data = yelpdf['review_text'].values

In [11]:
train_data = np.hstack((bfdf['review_text'].values[:1500],
                        yelpdf['review_text'].values[:1500]))

In [12]:
len(train_data)


Out[12]:
3000

In [13]:
labels = ['dog'] * 1500
labels.extend(['general'] * 1500)
y_train = labels

In [14]:
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
print(X_train.shape)


vectorized in 0.40 seconds.
(3000, 12877)

In [15]:
feature_names = np.asarray(vectorizer.get_feature_names())

In [16]:
len(feature_names)


Out[16]:
12877

In [17]:
penalty = 'l2'
clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)

In [18]:
print(clf)


LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)

In [19]:
clf.fit(X_train, y_train)


/Applications/anaconda/lib/python2.7/site-packages/sklearn/svm/classes.py:192: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)
Out[19]:
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)

In [20]:
#yelp_review_data[:10]

In [21]:
X_yrevs = vectorizer.transform(yelp_review_data)

In [22]:
pred = clf.predict(X_yrevs)

In [23]:
pred.shape


Out[23]:
(6263,)

In [24]:
# print the number of yelp hotel reviews that are identified as dog reviews:
len(np.where(pred == 'dog')[0])


Out[24]:
483

In [25]:
ydogrevs = np.where(pred == 'dog')[0]

In [26]:
yelp_review_data[ydogrevs[4]]


Out[26]:
u"When I first contacted the Windmill Inn, the receptionist was friendly and helpful at getting me a great room at a reasonable rate. As someone who always travels with my pets, I LOVE when hotels are pet friendly and offer complimentary pet accommodations. At the Windmill in, they also have a designated pet friendly section of the hotel, so all of your hotel neighbors are pet people too! No anxiety about your dog barking or bothering other pet-free guests, and quick access to the outdoors as the pet rooms are on the first floor. They even gave us a cute bag of dog treats along with our chocolate-chip cookies at check-in. :)\nThere is also a beautiful, grassy courtyard, with a pond that is full of koi, turtles, and cute ducks! The pool is in the center of this courtyard and is warm, well maintained, and handicap accessible. \nI thoroughly enjoyed my stay, and will definitely return the next time I'm in the area."

In [31]:
yelp_review_data[ydogrevs[5]]


Out[31]:
u"I stayed here specifically because I found out they were a dog friendly hotel on bringfido.com... Me and my doggie stayed there two nights (7/2, 7/3) , the girl at the desk when we checked in was so nice and accommodating and gave my doggie a treat. They also gave me the closest room to a door on the lowest floor just so taking him out would be easier. And there is plenty of outside room and grass to walk dogs around on. There were other people with dogs around the hotel too so I didn't feel weird. The room was very nice and the bed was huge, had a big nice bathroom too. Overall it made my trip with my dog as easy and worry-free as possible. Would def stay there again. :)"

In [32]:
ygenrevs = np.where(pred == "general")[0]
ygenrevs


Out[32]:
array([   0,    1,    2, ..., 6260, 6261, 6262])

In [37]:
yelp_review_data[ygenrevs[4]]


Out[37]:
u"This is a fantastic hotel.  I went to a convention here at the end of June and loved the setting.  The convention center is well setup and the link between the hotel and the convention center works really well, especially if it's raining.\n\nMy only complaint is that the elevators are slow during busy times."

In [27]:
print(len(pred))
print(len(yelpdf))


6263
6263

In [28]:
pred[:10]


Out[28]:
array(['general', 'general', 'general', 'general', 'general', 'general',
       'general', 'general', 'general', 'general'], 
      dtype='|S7')

Add a New Column Stating the Review Type


In [42]:
yelpdf['review_category'] = pred

Update the yelp_reviews SQL Table with the Dog Friendly Data


In [66]:
# conn = engine.connect()

In [67]:
# cmd = "ALTER TABLE yelp_reviews "
# cmd += "ADD review_category VARCHAR(56)"

In [68]:
# print(cmd)
# result = conn.execute(cmd)

In [69]:
# cmd = "UPDATE TABLE yelp_reviews "
# cmd += "SET review_category = ('"
# cmd += "','".join(pred)+"') "
# cmd += "WHERE yelp_review_id = ('"
# cmd += "','".join(yelpdf['yelp_review_id'].values)+"')"
# print(cmd[:500])
# print(cmd[-50:])

In [70]:
#result = conn.execute(cmd)

In [71]:
cmd = "DROP TABLE yelp_reviews"

In [72]:
result = conn.execute(cmd)

In [73]:
cmd = """
        CREATE TABLE yelp_reviews
        (
        rev_id MEDIUMINT AUTO_INCREMENT,
        business_id VARCHAR(256),
        yelp_review_date DATE,
        yelp_review_id VARCHAR(256),
        review_rating INT,
        review_text VARCHAR(5000),
        user_id VARCHAR(256),
        review_category VARCHAR(56),
        PRIMARY KEY (rev_id)
        )
        """

In [74]:
result = conn.execute(cmd)

In [75]:
yelpdf.to_sql('yelp_reviews', engine, if_exists='append', index=False)

test updating the ta review category

This section tests updating the review_category column without deleting the entire table.


In [4]:
conn = engine.connect()

In [5]:
cmd = "SELECT biz_review_id, review_text FROM ta_reviews limit 3"

In [21]:
res = conn.execute(cmd)

In [22]:
dat = res.fetchall()

In [23]:
dat


Out[23]:
[(310458194, u'Right down the street from the Yale campus and in near proximity to several good restaurants (including the one in the hotel), t'),
 (309709618, u"We've stayed at a few places at New Haven, and this is by far the best place in terms of service, atmosphere, and location. It i"),
 (308615033, u'I was visiting New Haven for work and had the good fortune to end up at The Study at Yale - The room was spacious with a long de')]

In [15]:
for row in result:
    print(row)


(310458194, u'Right down the street from the Yale campus and in near proximity to several good restaurants (including the one in the hotel), t')
(309709618, u"We've stayed at a few places at New Haven, and this is by far the best place in terms of service, atmosphere, and location. It i")
(308615033, u'I was visiting New Haven for work and had the good fortune to end up at The Study at Yale - The room was spacious with a long de')

In [43]:
bizids = [str(el[0]) for el in dat]

In [44]:
len(bizids)


Out[44]:
3

In [24]:
cats = ['doggies', 'giraffes', 'random']

In [73]:
cmd = 'UPDATE ta_reviews SET review_category = NULL '
cmd += 'WHERE biz_review_id in ('+(',').join(bizids)+')'
cmd


Out[73]:
'UPDATE ta_reviews SET review_category = NULL WHERE biz_review_id in (310458194,309709618,308615033)'

In [74]:
res = conn.execute(cmd)

In [60]:
len(bfdf)


Out[60]:
3039

In [66]:
dids = bfdf[bfdf['review_rating'] == 3]['review_id'].values

In [68]:
dids[:5]


Out[68]:
array([14, 27, 37, 40, 42])

In [ ]: