In [1]:
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
from scipy.stats import pearsonr
import pandas as pd
import connect_aws_db as cadb
In [2]:
%matplotlib inline
In [53]:
# Database handle from the project-local connect_aws_db helper. It is handed to
# pd.read_sql_query for every query below and to update_table_rev_cat for the write-back.
engine = cadb.connect_aws_db(write_unicode=True)
In [4]:
# Training query: only the rating and the text column of every row in bf_reviews.
cmd = "SELECT review_rating, review_text FROM bf_reviews"
In [5]:
# Run the bf_reviews query through the engine and load the result as a DataFrame.
bfdf = pd.read_sql_query(cmd, engine)
In [6]:
# Report how many bf_reviews rows were loaded, then preview the first five.
n_bf_rows = len(bfdf)
print(n_bf_rows)
bfdf.head()
Out[6]:
In [7]:
# Keep only the reviews whose text is longer than 300 characters. The .copy()
# makes bfdfl an independent frame rather than a view onto bfdf.
review_lengths = bfdf['review_text'].str.len()
bfdfl = bfdf[review_lengths > 300].copy()
In [8]:
# Number of reviews that survived the >300-character length filter.
len(bfdfl)
Out[8]:
In [9]:
# Review texts of the filtered frame: the documents the vectorizer is fitted on.
train_data = bfdfl['review_text'].values
In [10]:
# Ratings from the same filtered frame, row-aligned with train_data; used as the
# class labels when the classifier is fitted.
y_train = bfdfl['review_rating'].values
In [11]:
# Build the TF-IDF vectorizer, fit it on the training texts and produce the
# training feature matrix in one pass. The whole step is timed and the
# resulting matrix shape is printed.
fit_started = time()
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(train_data)
fit_elapsed = time() - fit_started
print('vectorized in {:.2f} seconds.'.format(fit_elapsed))
print(X_train.shape)
In [17]:
# Prediction query 1: every column of the yelp_reviews rows whose
# review_category is 'dog'. Reuses (overwrites) the cmd variable.
cmd = "SELECT * FROM yelp_reviews WHERE review_category = 'dog'"
In [18]:
# Load the selected yelp_reviews rows into a DataFrame.
yelpdf = pd.read_sql_query(cmd, engine)
In [19]:
# Report how many yelp_reviews rows were loaded, then preview the first five.
n_yelp_rows = len(yelpdf)
print(n_yelp_rows)
yelpdf.head()
Out[19]:
In [21]:
# Review texts to be transformed with the already-fitted vectorizer.
yelp_review_text = yelpdf['review_text'].values
In [29]:
# Total number of selected yelp_reviews rows.
len(yelpdf)
Out[29]:
In [31]:
# Number of distinct business_id values among the selected yelp_reviews rows.
np.unique(yelpdf['business_id']).size
Out[31]:
In [22]:
# Map the yelp review texts onto the features learned from the training data
# (transform only; the vectorizer is not re-fitted) and time the step.
transform_started = time()
X_pred = vectorizer.transform(yelp_review_text)
transform_elapsed = time() - transform_started
print('transformed test data in {:.2f} seconds.'.format(transform_elapsed))
In [21]:
# Instantiate a NearestCentroid classifier with its default arguments and print its repr.
clf = NearestCentroid()
print(clf)
In [22]:
# Train on the TF-IDF matrix of the bf_reviews texts, with their ratings as labels.
clf.fit(X_train, y_train)
Out[22]:
In [25]:
# Predict a rating label for every row of X_pred. When the cells are run top to
# bottom, X_pred holds the yelp_reviews features at this point.
y_pred = clf.predict(X_pred)
In [26]:
# Sanity check: shape of the prediction array.
y_pred.shape
Out[26]:
In [27]:
# Peek at the first ten predicted labels.
y_pred[:10]
Out[27]:
In [28]:
# Attach the predicted labels to the yelp frame.
# NOTE(review): this column is called 'dog_review', whereas the ta_reviews frame
# and the UPDATE statement further down use 'dog_rating' — confirm the name is intentional.
yelpdf['dog_review'] = y_pred
In [15]:
# Prediction query 2: every column of the ta_reviews rows whose
# review_category is 'dog'. Reuses (overwrites) the cmd variable.
cmd = "SELECT * FROM ta_reviews WHERE review_category = 'dog'"
In [16]:
# Load the selected ta_reviews rows into a DataFrame.
tadf = pd.read_sql_query(cmd, engine)
In [17]:
# Total number of selected ta_reviews rows.
len(tadf)
Out[17]:
In [18]:
# Review texts to be transformed with the already-fitted vectorizer.
ta_review_text = tadf['review_text'].values
In [19]:
# Map the ta_reviews texts onto the features learned from the training data
# (transform only; the vectorizer is not re-fitted) and time the step.
# X_pred is overwritten, so the yelp feature matrix is no longer available afterwards.
transform_started = time()
X_pred = vectorizer.transform(ta_review_text)
transform_elapsed = time() - transform_started
print('transformed test data in {:.2f} seconds.'.format(transform_elapsed))
In [23]:
# Shape of the ta_reviews feature matrix.
X_pred.shape
Out[23]:
In [24]:
# Predict a rating label for every ta_reviews row. This overwrites the yelp
# predictions previously held in y_pred.
y_pred = clf.predict(X_pred)
In [25]:
# Number of predictions produced.
len(y_pred)
Out[25]:
In [33]:
# Draw five random positions into y_pred. randint produces integers in the
# half-open range [0, len(y_pred)) directly, so no float-to-int truncation is
# needed and the upper bound can never be hit (uniform() may return `high`
# through floating-point rounding, which would be an out-of-range index).
np.random.randint(0, len(y_pred), size=5, dtype=np.int64)
Out[33]:
In [39]:
# Spot-check ten randomly chosen predictions (drawn with replacement).
# randint yields valid integer indices in [0, len(y_pred)) directly; the former
# uniform()-then-truncate approach could, through floating-point rounding,
# produce len(y_pred) and raise an IndexError.
y_pred[np.random.randint(0, len(y_pred), size=10, dtype=np.int64)]
Out[39]:
In [40]:
# Histogram of the predicted ratings. Binding the return value to a name keeps
# it from being echoed as the cell's output.
x = plt.hist(y_pred)
In [41]:
# List the columns available in the ta_reviews frame.
tadf.columns
Out[41]:
In [42]:
# Store the predicted rating next to each ta_reviews row; the write-back loop
# further down groups rows by this column.
tadf['dog_rating'] = y_pred
In [43]:
# Row count, for comparison with the per-rating total printed in the next cell.
print(len(tadf))
In [55]:
# Count how many ta_reviews rows were predicted as each rating from 1 to 5.
# The rat_1 .. rat_5 names are kept so the counts stay individually accessible.
counts_by_rating = [int((tadf['dog_rating'] == star).sum()) for star in range(1, 6)]
rat_1, rat_2, rat_3, rat_4, rat_5 = counts_by_rating

# First the total over the five ratings, then each count on its own line.
print(sum(counts_by_rating))
for n_reviews in counts_by_rating:
    print(n_reviews)
In [ ]:
# business_id of every ta_reviews row whose predicted rating is 1.
tadf.loc[tadf['dog_rating'] == 1, 'business_id']
In [51]:
def update_table_rev_cat(df, engine, rating):
    """Set ``dog_rating`` in ``ta_reviews`` for the 'dog' reviews contained in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``review_category`` and ``biz_review_id``.
        Only rows whose ``review_category`` equals ``'dog'`` are written back.
    engine
        Object exposing ``connect()``; the returned connection must offer
        ``execute(sql_string)`` and ``close()``.
    rating
        Value written to the ``dog_rating`` column; it is inserted into the
        statement via ``str(rating)``.

    Returns
    -------
    None

    Notes
    -----
    The statement is assembled by string concatenation and the ids are inserted
    unquoted, so ``biz_review_id`` values are expected to be numeric. Do not
    feed this function ids or ratings that come from untrusted input.
    NOTE(review): if ``engine`` is a SQLAlchemy 2.x engine, plain SQL strings
    must be wrapped in ``sqlalchemy.text`` and committed explicitly; confirm
    the installed version.
    """
    dog_rows = df[df['review_category'] == 'dog']
    sbrids = [str(brid) for brid in dog_rows['biz_review_id'].values]

    # With no matching rows the statement would end in "in ()", which is a SQL
    # syntax error. There is nothing to update, so skip the round trip.
    if not sbrids:
        return

    cmd = 'UPDATE ta_reviews SET dog_rating = ' + str(rating) + ' '
    cmd += 'WHERE biz_review_id in (' + ','.join(sbrids) + ')'

    conn = engine.connect()
    try:
        conn.execute(cmd)
    finally:
        # Always hand the connection back, even if the statement fails.
        conn.close()
In [54]:
# Write the predictions back to the database: for each rating value 1..5,
# select the ta_reviews rows predicted as that rating and update them together.
for rating in range(1, 6):
    print(rating)  # progress marker
    rows_with_rating = tadf[tadf['dog_rating'] == rating]
    update_table_rev_cat(rows_with_rating, engine, rating)
In [ ]: