classify reviews

This notebook describes the binary classification of Yelp hotel reviews on whether or not they are dog related.



In [1]:

    
import numpy as np
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

import pandas as pd
import connect_aws_db as cadb



In [2]:

    
%matplotlib inline

Connect to DB



In [2]:

    
engine = cadb.connect_aws_db(write_unicode=True)

Restore BF Reviews



In [3]:

    
cmd = "SELECT review_id, review_rating, review_text FROM bf_reviews"



In [4]:

    
bfdf = pd.read_sql_query(cmd, engine)



In [5]:

    
print(len(bfdf))
bfdf.head(5)









    



3039






    Out[5]:






  
    
      
      review_id
      review_rating
      review_text
    
  
  
    
      0
      1
      2
      When you first go to your room you notice the ...
    
    
      1
      2
      2
      We were going to the Pre-Westminster event tha...
    
    
      2
      3
      2
      While the room was not the fanciest that one c...
    
    
      3
      4
      1
      I stayed at the Hotel Penn in Manhattan becaus...
    
    
      4
      5
      5
      We loved this hotel! The are very friendly and...



In [7]:

    
len(bfdf[bfdf['review_text'].str.len() > 500])









    Out[7]:





622



In [9]:

    
num_cities = 'all'
if num_cities is 'all':
    print('hello')









    



hello

Restore Yelp Reviews



In [7]:

    
cmd = "SELECT * FROM yelp_reviews"



In [8]:

    
yelpdf = pd.read_sql_query(cmd, engine)



In [9]:

    
print(len(yelpdf))
yelpdf.head(5)









    



6263






    Out[9]:






  
    
      
      rev_id
      business_id
      yelp_review_date
      yelp_review_id
      review_rating
      review_text
      user_id
      review_category
    
  
  
    
      0
      1
      bWWrrsPWuoHuAGUCgH8Tyg
      2007-10-28
      TMiAAS5RB-P5EwXs7B9aBw
      4
      I stayed at the hotel for several months durin...
      ZqoMKTrJOtyYZOcvP8rTUA
      general
    
    
      1
      2
      bWWrrsPWuoHuAGUCgH8Tyg
      2008-01-05
      F78V6FFuUr9pawcok4YRyg
      5
      I have been coming to Pittsburgh for quite a w...
      uiMTavYKLw9hxskV4xB2tg
      general
    
    
      2
      3
      bWWrrsPWuoHuAGUCgH8Tyg
      2008-03-08
      g6vMKao00XSECOd6JtKPEA
      4
      This is a very good hotel and with a corporate...
      TckShYnQa0eD3WQxdoE3GQ
      general
    
    
      3
      4
      bWWrrsPWuoHuAGUCgH8Tyg
      2008-03-29
      PNS9z4aFDbfhpIsHCfNvNg
      4
      i got upgraded to a junior suite, and then aga...
      MquuHY8ar78FUjkbcmVyPw
      general
    
    
      4
      5
      bWWrrsPWuoHuAGUCgH8Tyg
      2008-07-07
      gyYXmBwP3jSinLuXtuGRuQ
      5
      This is a fantastic hotel.  I went to a conven...
      pzhiDEp8EFltFmdldP9Oow
      general



In [10]:

    
yelp_review_data = yelpdf['review_text'].values



In [11]:

    
train_data = np.hstack((bfdf['review_text'].values[:1500],
                        yelpdf['review_text'].values[:1500]))



In [12]:

    
len(train_data)









    Out[12]:





3000



In [13]:

    
labels = ['dog'] * 1500
labels.extend(['general'] * 1500)
y_train = labels



In [14]:

    
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
print(X_train.shape)









    



vectorized in 0.40 seconds.
(3000, 12877)



In [15]:

    
feature_names = np.asarray(vectorizer.get_feature_names())



In [16]:

    
len(feature_names)









    Out[16]:





12877



In [17]:

    
penalty = 'l2'
clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)



In [18]:

    
print(clf)









    



LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)



In [19]:

    
clf.fit(X_train, y_train)









    



/Applications/anaconda/lib/python2.7/site-packages/sklearn/svm/classes.py:192: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)






    Out[19]:





LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)



In [20]:

    
#yelp_review_data[:10]



In [21]:

    
X_yrevs = vectorizer.transform(yelp_review_data)



In [22]:

    
pred = clf.predict(X_yrevs)



In [23]:

    
pred.shape









    Out[23]:





(6263,)



In [24]:

    
# print the number of yelp hotel reviews that are identified as dog reviews:
len(np.where(pred == 'dog')[0])









    Out[24]:





483



In [25]:

    
ydogrevs = np.where(pred == 'dog')[0]



In [26]:

    
yelp_review_data[ydogrevs[4]]









    Out[26]:





u"When I first contacted the Windmill Inn, the receptionist was friendly and helpful at getting me a great room at a reasonable rate. As someone who always travels with my pets, I LOVE when hotels are pet friendly and offer complimentary pet accommodations. At the Windmill in, they also have a designated pet friendly section of the hotel, so all of your hotel neighbors are pet people too! No anxiety about your dog barking or bothering other pet-free guests, and quick access to the outdoors as the pet rooms are on the first floor. They even gave us a cute bag of dog treats along with our chocolate-chip cookies at check-in. :)\nThere is also a beautiful, grassy courtyard, with a pond that is full of koi, turtles, and cute ducks! The pool is in the center of this courtyard and is warm, well maintained, and handicap accessible. \nI thoroughly enjoyed my stay, and will definitely return the next time I'm in the area."



In [31]:

    
yelp_review_data[ydogrevs[5]]









    Out[31]:





u"I stayed here specifically because I found out they were a dog friendly hotel on bringfido.com... Me and my doggie stayed there two nights (7/2, 7/3) , the girl at the desk when we checked in was so nice and accommodating and gave my doggie a treat. They also gave me the closest room to a door on the lowest floor just so taking him out would be easier. And there is plenty of outside room and grass to walk dogs around on. There were other people with dogs around the hotel too so I didn't feel weird. The room was very nice and the bed was huge, had a big nice bathroom too. Overall it made my trip with my dog as easy and worry-free as possible. Would def stay there again. :)"



In [32]:

    
ygenrevs = np.where(pred == "general")[0]
ygenrevs









    Out[32]:





array([   0,    1,    2, ..., 6260, 6261, 6262])



In [37]:

    
yelp_review_data[ygenrevs[4]]









    Out[37]:





u"This is a fantastic hotel.  I went to a convention here at the end of June and loved the setting.  The convention center is well setup and the link between the hotel and the convention center works really well, especially if it's raining.\n\nMy only complaint is that the elevators are slow during busy times."



In [27]:

    
print(len(pred))
print(len(yelpdf))



In [28]:

    
pred[:10]









    Out[28]:





array(['general', 'general', 'general', 'general', 'general', 'general',
       'general', 'general', 'general', 'general'], 
      dtype='|S7')

Add a New Column Stating the Review Type



In [42]:

    
yelpdf['review_category'] = pred

Update the yelp_reviews SQL Table with the Dog Friendly Data



In [66]:

    
# conn = engine.connect()



In [67]:

    
# cmd = "ALTER TABLE yelp_reviews "
# cmd += "ADD review_category VARCHAR(56)"



In [68]:

    
# print(cmd)
# result = conn.execute(cmd)



In [69]:

    
# cmd = "UPDATE TABLE yelp_reviews "
# cmd += "SET review_category = ('"
# cmd += "','".join(pred)+"') "
# cmd += "WHERE yelp_review_id = ('"
# cmd += "','".join(yelpdf['yelp_review_id'].values)+"')"
# print(cmd[:500])
# print(cmd[-50:])



In [70]:

    
#result = conn.execute(cmd)



In [71]:

    
cmd = "DROP TABLE yelp_reviews"



In [72]:

    
result = conn.execute(cmd)



In [73]:

    
cmd = """
        CREATE TABLE yelp_reviews
        (
        rev_id MEDIUMINT AUTO_INCREMENT,
        business_id VARCHAR(256),
        yelp_review_date DATE,
        yelp_review_id VARCHAR(256),
        review_rating INT,
        review_text VARCHAR(5000),
        user_id VARCHAR(256),
        review_category VARCHAR(56),
        PRIMARY KEY (rev_id)
        )
        """



In [74]:

    
result = conn.execute(cmd)



In [75]:

    
yelpdf.to_sql('yelp_reviews', engine, if_exists='append', index=False)

test updating the ta review category

This section tests updating the review_category column without deleting the entire table.



In [4]:

    
conn = engine.connect()



In [5]:

    
cmd = "SELECT biz_review_id, review_text FROM ta_reviews limit 3"



In [21]:

    
res = conn.execute(cmd)



In [22]:

    
dat = res.fetchall()



In [23]:

    
dat









    Out[23]:





[(310458194, u'Right down the street from the Yale campus and in near proximity to several good restaurants (including the one in the hotel), t'),
 (309709618, u"We've stayed at a few places at New Haven, and this is by far the best place in terms of service, atmosphere, and location. It i"),
 (308615033, u'I was visiting New Haven for work and had the good fortune to end up at The Study at Yale - The room was spacious with a long de')]



In [15]:

    
for row in result:
    print(row)









    



(310458194, u'Right down the street from the Yale campus and in near proximity to several good restaurants (including the one in the hotel), t')
(309709618, u"We've stayed at a few places at New Haven, and this is by far the best place in terms of service, atmosphere, and location. It i")
(308615033, u'I was visiting New Haven for work and had the good fortune to end up at The Study at Yale - The room was spacious with a long de')



In [43]:

    
bizids = [str(el[0]) for el in dat]



In [44]:

    
len(bizids)









    Out[44]:





3



In [24]:

    
cats = ['doggies', 'giraffes', 'random']



In [73]:

    
cmd = 'UPDATE ta_reviews SET review_category = NULL '
cmd += 'WHERE biz_review_id in ('+(',').join(bizids)+')'
cmd









    Out[73]:





'UPDATE ta_reviews SET review_category = NULL WHERE biz_review_id in (310458194,309709618,308615033)'



In [74]:

    
res = conn.execute(cmd)



In [60]:

    
len(bfdf)









    Out[60]:





3039



In [66]:

    
dids = bfdf[bfdf['review_rating'] == 3]['review_id'].values



In [68]:

    
dids[:5]









    Out[68]:





array([14, 27, 37, 40, 42])



In [ ]:

	review_id	review_rating	review_text
0	1	2	When you first go to your room you notice the ...
1	2	2	We were going to the Pre-Westminster event tha...
2	3	2	While the room was not the fanciest that one c...
3	4	1	I stayed at the Hotel Penn in Manhattan becaus...
4	5	5	We loved this hotel! The are very friendly and...

	rev_id	business_id	yelp_review_date	yelp_review_id	review_rating	review_text	user_id	review_category
0	1	bWWrrsPWuoHuAGUCgH8Tyg	2007-10-28	TMiAAS5RB-P5EwXs7B9aBw	4	I stayed at the hotel for several months durin...	ZqoMKTrJOtyYZOcvP8rTUA	general
1	2	bWWrrsPWuoHuAGUCgH8Tyg	2008-01-05	F78V6FFuUr9pawcok4YRyg	5	I have been coming to Pittsburgh for quite a w...	uiMTavYKLw9hxskV4xB2tg	general
2	3	bWWrrsPWuoHuAGUCgH8Tyg	2008-03-08	g6vMKao00XSECOd6JtKPEA	4	This is a very good hotel and with a corporate...	TckShYnQa0eD3WQxdoE3GQ	general
3	4	bWWrrsPWuoHuAGUCgH8Tyg	2008-03-29	PNS9z4aFDbfhpIsHCfNvNg	4	i got upgraded to a junior suite, and then aga...	MquuHY8ar78FUjkbcmVyPw	general
4	5	bWWrrsPWuoHuAGUCgH8Tyg	2008-07-07	gyYXmBwP3jSinLuXtuGRuQ	5	This is a fantastic hotel. I went to a conven...	pzhiDEp8EFltFmdldP9Oow	general