In [1]:
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
import pandas as pd
import connect_aws_db as cadb
In [2]:
%matplotlib inline
In [2]:
engine = cadb.connect_aws_db(write_unicode=True)
In [3]:
cmd = "SELECT review_id, review_rating, review_text FROM bf_reviews"
In [4]:
bfdf = pd.read_sql_query(cmd, engine)
In [5]:
print(len(bfdf))
bfdf.head(5)
Out[5]:
In [7]:
len(bfdf[bfdf['review_text'].str.len() > 500])
Out[7]:
In [9]:
num_cities = 'all'
if num_cities is 'all':
print('hello')
In [7]:
cmd = "SELECT * FROM yelp_reviews"
In [8]:
yelpdf = pd.read_sql_query(cmd, engine)
In [9]:
print(len(yelpdf))
yelpdf.head(5)
Out[9]:
In [10]:
yelp_review_data = yelpdf['review_text'].values
In [11]:
train_data = np.hstack((bfdf['review_text'].values[:1500],
yelpdf['review_text'].values[:1500]))
In [12]:
len(train_data)
Out[12]:
In [13]:
labels = ['dog'] * 1500
labels.extend(['general'] * 1500)
y_train = labels
In [14]:
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
print(X_train.shape)
In [15]:
feature_names = np.asarray(vectorizer.get_feature_names())
In [16]:
len(feature_names)
Out[16]:
In [17]:
penalty = 'l2'
clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
In [18]:
print(clf)
In [19]:
clf.fit(X_train, y_train)
Out[19]:
In [20]:
#yelp_review_data[:10]
In [21]:
X_yrevs = vectorizer.transform(yelp_review_data)
In [22]:
pred = clf.predict(X_yrevs)
In [23]:
pred.shape
Out[23]:
In [24]:
# print the number of yelp hotel reviews that are identified as dog reviews:
len(np.where(pred == 'dog')[0])
Out[24]:
In [25]:
ydogrevs = np.where(pred == 'dog')[0]
In [26]:
yelp_review_data[ydogrevs[4]]
Out[26]:
In [31]:
yelp_review_data[ydogrevs[5]]
Out[31]:
In [32]:
ygenrevs = np.where(pred == "general")[0]
ygenrevs
Out[32]:
In [37]:
yelp_review_data[ygenrevs[4]]
Out[37]:
In [27]:
print(len(pred))
print(len(yelpdf))
In [28]:
pred[:10]
Out[28]:
In [42]:
yelpdf['review_category'] = pred
In [66]:
# conn = engine.connect()
In [67]:
# cmd = "ALTER TABLE yelp_reviews "
# cmd += "ADD review_category VARCHAR(56)"
In [68]:
# print(cmd)
# result = conn.execute(cmd)
In [69]:
# cmd = "UPDATE TABLE yelp_reviews "
# cmd += "SET review_category = ('"
# cmd += "','".join(pred)+"') "
# cmd += "WHERE yelp_review_id = ('"
# cmd += "','".join(yelpdf['yelp_review_id'].values)+"')"
# print(cmd[:500])
# print(cmd[-50:])
In [70]:
#result = conn.execute(cmd)
In [71]:
cmd = "DROP TABLE yelp_reviews"
In [72]:
result = conn.execute(cmd)
In [73]:
cmd = """
CREATE TABLE yelp_reviews
(
rev_id MEDIUMINT AUTO_INCREMENT,
business_id VARCHAR(256),
yelp_review_date DATE,
yelp_review_id VARCHAR(256),
review_rating INT,
review_text VARCHAR(5000),
user_id VARCHAR(256),
review_category VARCHAR(56),
PRIMARY KEY (rev_id)
)
"""
In [74]:
result = conn.execute(cmd)
In [75]:
yelpdf.to_sql('yelp_reviews', engine, if_exists='append', index=False)
In [4]:
conn = engine.connect()
In [5]:
cmd = "SELECT biz_review_id, review_text FROM ta_reviews limit 3"
In [21]:
res = conn.execute(cmd)
In [22]:
dat = res.fetchall()
In [23]:
dat
Out[23]:
In [15]:
for row in result:
print(row)
In [43]:
bizids = [str(el[0]) for el in dat]
In [44]:
len(bizids)
Out[44]:
In [24]:
cats = ['doggies', 'giraffes', 'random']
In [73]:
cmd = 'UPDATE ta_reviews SET review_category = NULL '
cmd += 'WHERE biz_review_id in ('+(',').join(bizids)+')'
cmd
Out[73]:
In [74]:
res = conn.execute(cmd)
In [60]:
len(bfdf)
Out[60]:
In [66]:
dids = bfdf[bfdf['review_rating'] == 3]['review_id'].values
In [68]:
dids[:5]
Out[68]:
In [ ]: