In [1]:
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from scipy.stats import pearsonr
import pandas as pd
import connect_aws_db as cadb
In [2]:
%matplotlib inline
In [3]:
# Connection object returned by the project-local connect_aws_db helper;
# it is handed to pd.read_sql_query below.
engine = cadb.connect_aws_db(write_unicode=True)
In [4]:
# Not used by any live code visible in this notebook.
categories = ['dogs', 'general']
In [5]:
# Fetch the rating and the free text of every row in bf_reviews.
cmd = "SELECT review_rating, review_text FROM bf_reviews"
In [6]:
bfdf = pd.read_sql_query(cmd, engine)
In [7]:
# Row count, then a preview of the first five reviews.
print(len(bfdf))
bfdf.head(5)
Out[7]:
Now limit the reviews used in training to only reviews with more than 350 characters.
In [8]:
# Keep only reviews longer than 350 characters; .copy() gives an independent frame.
bfdfl = bfdf[bfdf['review_text'].str.len() > 350].copy()
In [9]:
len(bfdfl)
Out[9]:
In [10]:
# The first 750 long reviews (in query order, no shuffling) are the training texts.
train_data = bfdfl['review_text'].values[:750]
In [11]:
# Ratings aligned with train_data (same slice of the same frame).
y_train = bfdfl['review_rating'].values[:750]
In [12]:
# Fit a TF-IDF vectoriser on the training texts and time it.
# max_df=0.5 and the English stop-word list are passed straight to TfidfVectorizer.
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
print(X_train.shape)
In [13]:
# Everything after the first 750 long reviews is held out for testing.
test_data = bfdfl['review_text'].values[750:]
In [14]:
# Transform only (no re-fit), so the test matrix uses the training vocabulary.
t0 = time()
X_test = vectorizer.transform(test_data)
duration = time() - t0
print('transformed test data in {:.2f} seconds.'.format(duration))
In [15]:
# NOTE(review): newer scikit-learn releases expose get_feature_names_out instead
# of get_feature_names — confirm against the installed version.
feature_names = np.asarray(vectorizer.get_feature_names())
In [16]:
len(feature_names)
Out[16]:
In [17]:
feature_names[:5]
Out[17]:
In [18]:
# Ratings aligned with test_data (same slice of the same frame).
y_test = bfdfl['review_rating'].values[750:]
In [19]:
def benchmark(clf, pos_label=None):
    """Fit ``clf`` on the training split and score it on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and prints timing, F1 score and (for models exposing
    ``coef_``) the coefficient dimensionality and density.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    pos_label : optional
        Forwarded to ``metrics.f1_score``. When left as ``None`` the score
        is requested as a support-weighted average over all rating classes.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time, pred)`` where
        ``clf_descr`` is the text of ``str(clf)`` before the first ``(``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if pos_label is None:
        # The targets are multi-class ratings, so the averaging mode is stated
        # explicitly instead of relying on a version-dependent default.
        score = metrics.f1_score(y_test, pred, pos_label=None,
                                 average='weighted')
    else:
        # An explicit pos_label keeps the original call unchanged.
        score = metrics.f1_score(y_test, pred, pos_label=pos_label)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print()

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time, pred
In [20]:
# Benchmark the first batch of classifiers, collecting one result tuple each.
results = []
first_batch = (
    (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
    (Perceptron(n_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(n_estimators=20), 'RandomForest'),
)
for clf, name in first_batch:
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))
In [21]:
# For each penalty, benchmark a liblinear SVC and then an SGD classifier.
for penalty in ("l2", "l1"):
    print('=' * 80)
    print("%s penalty" % penalty.upper())

    # Liblinear model
    liblinear_model = LinearSVC(loss='l2', penalty=penalty,
                                dual=False, tol=1e-3)
    results.append(benchmark(liblinear_model))

    # SGD model
    sgd_model = SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)
    results.append(benchmark(sgd_model))
In [22]:
# Each section below prints a banner, then appends one benchmark() result
# tuple per classifier to the shared `results` list.
# SGD classifier with the elastic-net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
penalty="elasticnet")))
# NearestCentroid with its default constructor arguments
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
# Multinomial and Bernoulli naive Bayes, both with alpha=.01
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
In [23]:
class L1LinearSVC(LinearSVC):
    """LinearSVC that first reduces the features with an L1-penalised LinearSVC.

    NOTE(review): calling ``fit_transform``/``transform`` on a ``LinearSVC``
    depends on the installed scikit-learn still offering estimators as
    feature selectors — confirm against the pinned version.
    NOTE(review): this class is defined identically in several cells of
    this notebook.
    """

    def fit(self, X, y):
        """Fit the L1 selector on ``X``/``y``, then fit this SVC on its output."""
        # A smaller C means stronger regularisation and therefore a sparser
        # selector; the default C is used here.
        selector = LinearSVC(penalty="l1", dual=False, tol=1e-3)
        self.transformer_ = selector
        X_selected = selector.fit_transform(X, y)
        return LinearSVC.fit(self, X_selected, y)

    def predict(self, X):
        """Apply the fitted selector to ``X`` before predicting."""
        X_selected = self.transformer_.transform(X)
        return LinearSVC.predict(self, X_selected)


print('=' * 80)
print("LinearSVC with L1-based feature selection")
results.append(benchmark(L1LinearSVC()))
In [24]:
# One integer position per benchmarked classifier (used as plot coordinates).
indices = np.arange(len(results))
# Turn the list of 5-tuples returned by benchmark() into five parallel lists.
# NOTE: `results` is rebound here, so re-running this cell without re-running
# the benchmark cells transposes the already transposed data.
results = [[x[i] for x in results] for i in range(5)]
In [25]:
# Global text styling for the figures that follow.
# 'sans-serif' is a valid generic family; the previous value 'normal' is not a
# font family, so Matplotlib had to fall back to its default font.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 16}
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = 12.94, 8

clf_names, score, training_time, test_time, pred = results
# Scale both timings by their maximum so they share the score's 0-1 axis.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.title("Score")
# Three thin horizontal bars per classifier, offset vertically by 0.3.
plt.barh(indices, score, .2, label="score", color='#982023')
plt.barh(indices + .3, training_time, .2, label="training time", color='#46959E')
plt.barh(indices + .6, test_time, .2, label="test time", color='#C7B077')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25, top=.95, bottom=.05)
plt.ylim(0, 14)
print(indices)
# Classifier names are drawn as text just left of the bars.
for i, c in zip(indices, clf_names):
    plt.text(-0.025, i, c, horizontalalignment='right')
In [26]:
# Shorter display labels for the plots. clf_names is the first list inside
# `results`, so these assignments also change results[0] in place.
# The hard-coded positions follow the order in which the benchmarks were
# appended and require at least 14 entries.
clf_names[0] = 'Ridge'
clf_names[2] = 'PassAggress'
clf_names[3] = 'KNN'
clf_names[4] = 'RandomForest'
clf_names[5] = 'LinearSVC L2'
clf_names[6] = 'SGDC SVM L2'
clf_names[7] = 'LinearSVC L1'
clf_names[8] = 'SGDC L1'
clf_names[9] = 'SGDC ElNet'
clf_names[13] = 'LinearSVC L1FS'
In [27]:
# Line plot of the F1 score and the raw train/test times per classifier.
fig, ax = plt.subplots(1, 1)
clf_names, score, training_time, test_time, pred = results
ax.plot(indices, score, '-o', label="score", color='#982023')
ax.plot(indices, training_time, '-o', label="training time (s)", color='#46959E')
ax.plot(indices, test_time, '-o', label="test time (s)", color='#C7B077')
labels = clf_names
# One tick per classifier, labelled with its name.
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
# The rotation is given as a number; a numeric string such as '70' is not a
# valid rotation value for Matplotlib text.
ax.set_xticklabels(clf_names, rotation=70, horizontalalignment='right')
ax.set_xlim([-1, 14])
# The y-range is fixed to 0-1, so any timing above one second is clipped.
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
plt.savefig('ratingClassifierScores.png', dpi=144)
In [28]:
# One "<classifier>: <score>" line per benchmark, score to three decimals.
for position in range(min(len(clf_names), len(score))):
    name, scr = clf_names[position], score[position]
    print('{}: {:.3f}'.format(name, scr))
In [29]:
# Scatter of given vs. predicted rating for the first benchmarked classifier
# (pred[0]). Each coordinate gets uniform jitter in [-0.05, 0.05) so that
# identical rating pairs do not fall on one point. The jitter is unseeded, so
# the picture differs slightly from run to run.
fig, ax = plt.subplots(1, 1)
ax.plot(y_test + 0.1*np.random.random(len(y_test)) - 0.05, pred[0] + 0.1*np.random.random(len(y_test)) - 0.05, '.')
ax.set_xlim([0, 6])
ax.set_ylim([0, 6])
ax.set_xlabel('Given Rating')
ax.set_ylabel('Predicted Rating')
Out[29]:
In [30]:
# 5x5 table of counts: ms[row, col] is the number of test reviews whose given
# rating is col+1 and whose pred[0] prediction is row+1.
ms = np.zeros((5, 5))
for row, col in np.ndindex(5, 5):
    matches = (y_test == col + 1) & (pred[0] == row + 1)
    ms[row, col] = np.count_nonzero(matches)
ms
Out[30]:
In [31]:
# Log-compress the counts (the +1 keeps empty cells at zero) and scale by 5;
# the result is used as marker sizes in the bubble plot.
logms = 5*np.log(ms+1)
logms
Out[31]:
In [32]:
# Bubble chart: one marker per (given, predicted) rating pair, with its
# marker size taken from the matching logms cell.
fig, ax = plt.subplots(1, 1)
for row, col in np.ndindex(5, 5):
    bubble_size = logms[row, col]
    ax.plot(col + 1, row + 1, 'o', ms=bubble_size, color='#83A7C8', alpha=0.5)
ax.set_xlim([0, 6])
ax.set_ylim([0, 6])
ax.set_xlabel('Given Rating')
ax.set_ylabel('Predicted Rating')
#plt.savefig('Predicted_Vs_Given_Bubbles.png', dpi=144)
Out[32]:
In [33]:
# Pearson correlation between the given ratings and each classifier's
# predictions, printed next to the classifier's position.
for idx in range(len(pred)):
    prediction = pred[idx]
    print(idx, pearsonr(y_test, prediction))
In [34]:
# Overlaid histograms of given ratings and of the predictions stored at
# position 10 of `pred` (NearestCentroid in the benchmark order used above).
# bins=range(1, 7) with align='left' centres one bar on each integer 1..5.
fig, ax = plt.subplots(1, 1)
ax.hist(y_test, bins=range(1, 7), align='left', color='#83A7C8', alpha=0.25, label='Given')
ax.hist(pred[10], bins=range(1, 7), align='left', color='#BA4C37', alpha=0.25, label='Predicted')
#ax.set_xlim([0, 6])
ax.xaxis.set_ticks([1, 2, 3, 4, 5])
ax.set_xlabel('Rating')
ax.set_ylabel('Number of Reviews')
ax.legend(loc='best')
#plt.savefig('PredictedGivenDist.png', dpi=144)
Out[34]:
In [35]:
from sklearn import metrics
In [36]:
def plot_confusion_matrix(y_pred, y, normalize=False, cmap=plt.cm.binary):
    """Draw a confusion matrix as an image on the current pyplot figure.

    The matrix is ``metrics.confusion_matrix(y, y_pred)`` flipped vertically.
    With ``normalize=True`` every row is divided by its own sum. Tick labels
    are hard-coded for five classes labelled 1 to 5.

    NOTE(review): the axis labels call x the true rating and y the predicted
    rating, while the matrix rows come from ``y`` — check the argument order
    used at each call site.
    """
    counts = np.flipud(metrics.confusion_matrix(y, y_pred))
    if normalize:
        row_totals = counts.sum(axis=1, keepdims=True)
        counts = counts.astype(float) / row_totals

    plt.imshow(counts, cmap=cmap, interpolation='nearest')
    plt.colorbar()

    tick_positions = np.arange(5)
    rating_labels = np.arange(1, 6)
    plt.xticks(tick_positions, rating_labels)
    # Rows were flipped, so the y labels run from 5 down to 1.
    plt.yticks(tick_positions, rating_labels[::-1])
    plt.xlabel('bringfido.com rating (true rating)')
    plt.ylabel('predicted rating')
# Accuracy and confusion matrix for the predictions stored at pred[10].
# print is called as a function, like everywhere else in this notebook; the
# message is formatted into one string so the output text is unchanged.
print("classification accuracy: {}".format(metrics.accuracy_score(y_test, pred[10])))
# NOTE(review): the arguments are passed as (y_test, pred[10]) although the
# signature is (y_pred, y) — confirm the intended orientation.
plot_confusion_matrix(y_test, pred[10], normalize=True, cmap=plt.cm.Blues)
#plt.savefig('rating_confusion_matrix.png', dpi=144)
In [39]:
# Fit a stand-alone NearestCentroid so that its centroids can be inspected.
# This rebinds the notebook-level name `clf`.
clf = NearestCentroid()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
In [49]:
# Centroid array of the fitted model; its shape is shown a few cells below.
cens = clf.centroids_
In [42]:
clf.get_params()
Out[42]:
In [73]:
# Vocabulary terms of the current vectorizer, used to name centroid columns.
words = vectorizer.get_feature_names()
len(words)
Out[73]:
In [50]:
cens.shape
Out[50]:
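Which features/words carry the highest centroid weight for a single rating class? (The cell below reads centroid row 4; if the classes are the ratings 1–5 in sorted order, that row corresponds to rating 5, not rating 1.)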
In [146]:
wgtarr = cens[4,:]
In [153]:
ratwords = np.argsort(wgtarr).tolist()[::-1]
In [155]:
for i in range(20):
print(wgtarr[ratwords[i]], words[ratwords[i]], ratwords[i])
In [159]:
cens[:, 1148]
Out[159]:
In [ ]:
In [105]:
cen_tot = np.sum(cens, axis=0)
In [106]:
cen_tot.shape
Out[106]:
In [139]:
wgtarr = cens[4,:]/cen_tot
In [140]:
words[np.argsort(wgtarr)[0]]
Out[140]:
In [144]:
ratwords = np.argsort(wgtarr).tolist()[::-1]
In [145]:
for i in range(20):
print(wgtarr[ratwords[i]], words[ratwords[i]])
In [209]:
# Re-fit the vectoriser with 1- to 3-word n-grams (ngram_range=(1, 3)).
# This rebinds `vectorizer` and `X_train`, which benchmark() reads.
# NOTE(review): the recorded execution counts (209 here, 191 for the transform
# cell below) are out of order — re-run top to bottom to be sure X_test was
# produced by this fitted vectoriser.
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english', ngram_range=(1, 3))
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
print(X_train.shape)
In [191]:
# Transform only (no re-fit) so the test matrix uses the training vocabulary.
t0 = time()
X_test = vectorizer.transform(test_data)
duration = time() - t0
print('transformed test data in {:.2f} seconds.'.format(duration))
In [192]:
# Restart the result list and benchmark the first batch of classifiers on the
# current X_train / X_test.
results = []
first_batch = (
    (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
    (Perceptron(n_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(n_estimators=20), 'RandomForest'),
)
for clf, name in first_batch:
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))
In [193]:
# For each penalty, benchmark a liblinear SVC and then an SGD classifier.
for penalty in ("l2", "l1"):
    print('=' * 80)
    print("%s penalty" % penalty.upper())

    # Liblinear model
    liblinear_model = LinearSVC(loss='l2', penalty=penalty,
                                dual=False, tol=1e-3)
    results.append(benchmark(liblinear_model))

    # SGD model
    sgd_model = SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)
    results.append(benchmark(sgd_model))
In [194]:
# Each section below prints a banner, then appends one benchmark() result
# tuple per classifier to the shared `results` list.
# SGD classifier with the elastic-net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
penalty="elasticnet")))
# NearestCentroid with its default constructor arguments
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
# Multinomial and Bernoulli naive Bayes, both with alpha=.01
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
In [195]:
class L1LinearSVC(LinearSVC):
    """LinearSVC that first reduces the features with an L1-penalised LinearSVC.

    NOTE(review): calling ``fit_transform``/``transform`` on a ``LinearSVC``
    depends on the installed scikit-learn still offering estimators as
    feature selectors — confirm against the pinned version.
    NOTE(review): this class is defined identically in several cells of
    this notebook.
    """

    def fit(self, X, y):
        """Fit the L1 selector on ``X``/``y``, then fit this SVC on its output."""
        # A smaller C means stronger regularisation and therefore a sparser
        # selector; the default C is used here.
        selector = LinearSVC(penalty="l1", dual=False, tol=1e-3)
        self.transformer_ = selector
        X_selected = selector.fit_transform(X, y)
        return LinearSVC.fit(self, X_selected, y)

    def predict(self, X):
        """Apply the fitted selector to ``X`` before predicting."""
        X_selected = self.transformer_.transform(X)
        return LinearSVC.predict(self, X_selected)


print('=' * 80)
print("LinearSVC with L1-based feature selection")
results.append(benchmark(L1LinearSVC()))
In [196]:
# One integer position per benchmarked classifier (used as plot coordinates).
indices = np.arange(len(results))
# Turn the list of 5-tuples returned by benchmark() into five parallel lists.
# NOTE: `results` is rebound here, so re-running this cell without re-running
# the benchmark cells transposes the already transposed data.
results = [[x[i] for x in results] for i in range(5)]
In [197]:
# Line plot of the F1 score and the raw train/test times per classifier.
fig, ax = plt.subplots(1, 1)
clf_names, score, training_time, test_time, pred = results
ax.plot(indices, score, '-o', label="score", color='#982023')
ax.plot(indices, training_time, '-o', label="training time (s)", color='#46959E')
ax.plot(indices, test_time, '-o', label="test time (s)", color='#C7B077')
labels = clf_names
# One tick per classifier, labelled with its name.
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
# The rotation is given as a number; a numeric string such as '70' is not a
# valid rotation value for Matplotlib text.
ax.set_xticklabels(clf_names, rotation=70, horizontalalignment='right')
ax.set_xlim([-1, 14])
# The y-range is fixed to 0-1, so any timing above one second is clipped.
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
#plt.savefig('ratingClassifierScores.png', dpi=144)
In [198]:
# Re-fit the vectoriser with 1- and 2-word n-grams (ngram_range=(1, 2)).
# This rebinds `vectorizer` and `X_train`, which benchmark() reads.
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english', ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
print(X_train.shape)
In [199]:
# Transform only (no re-fit) so the test matrix uses the training vocabulary.
t0 = time()
X_test = vectorizer.transform(test_data)
duration = time() - t0
print('transformed test data in {:.2f} seconds.'.format(duration))
In [203]:
# Benchmark every classifier in one loop on the current X_train / X_test.
# Penalty names are written in lower case ("l2"/"l1"), matching the earlier
# benchmark cells; upper-case spellings are not accepted by every estimator
# or scikit-learn version.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=20), 'RandomForest'),
        (LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3), "LinearSVC L2"),
        (SGDClassifier(alpha=.0001, n_iter=50, penalty="l2"), "SGDC SVM L2"),
        (LinearSVC(loss='l2', penalty="l1", dual=False, tol=1e-3), "LinearSVC L1"),
        (SGDClassifier(alpha=.0001, n_iter=50, penalty="l1"), "SGDC SVM L1"),
        (SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"), "Elastic Net"),
        (NearestCentroid(), "Nearest Centroid"),
        (MultinomialNB(alpha=.01), "MultinomialNB"),
        (BernoulliNB(alpha=.01), "BernouliNB")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))
In [204]:
class L1LinearSVC(LinearSVC):
    """LinearSVC that first reduces the features with an L1-penalised LinearSVC.

    NOTE(review): calling ``fit_transform``/``transform`` on a ``LinearSVC``
    depends on the installed scikit-learn still offering estimators as
    feature selectors — confirm against the pinned version.
    NOTE(review): this class is defined identically in several cells of
    this notebook.
    """

    def fit(self, X, y):
        """Fit the L1 selector on ``X``/``y``, then fit this SVC on its output."""
        # A smaller C means stronger regularisation and therefore a sparser
        # selector; the default C is used here.
        selector = LinearSVC(penalty="l1", dual=False, tol=1e-3)
        self.transformer_ = selector
        X_selected = selector.fit_transform(X, y)
        return LinearSVC.fit(self, X_selected, y)

    def predict(self, X):
        """Apply the fitted selector to ``X`` before predicting."""
        X_selected = self.transformer_.transform(X)
        return LinearSVC.predict(self, X_selected)


print('=' * 80)
print("LinearSVC with L1-based feature selection")
results.append(benchmark(L1LinearSVC()))
In [205]:
# One integer position per benchmarked classifier (used as plot coordinates).
indices = np.arange(len(results))
# Turn the list of 5-tuples returned by benchmark() into five parallel lists.
# NOTE: `results` is rebound here, so re-running this cell without re-running
# the benchmark cells transposes the already transposed data.
results = [[x[i] for x in results] for i in range(5)]
In [206]:
# Line plot of the F1 score and the raw train/test times per classifier.
fig, ax = plt.subplots(1, 1)
clf_names, score, training_time, test_time, pred = results
ax.plot(indices, score, '-o', label="score", color='#982023')
ax.plot(indices, training_time, '-o', label="training time (s)", color='#46959E')
ax.plot(indices, test_time, '-o', label="test time (s)", color='#C7B077')
labels = clf_names
# One tick per classifier, labelled with its name.
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
# The rotation is given as a number; a numeric string such as '70' is not a
# valid rotation value for Matplotlib text.
ax.set_xticklabels(clf_names, rotation=70, horizontalalignment='right')
ax.set_xlim([-1, 14])
# The y-range is fixed to 0-1, so any timing above one second is clipped.
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
#plt.savefig('ratingClassifierScores.png', dpi=144)
In [207]:
# One "<classifier>: <score>" line per benchmark, score to three decimals.
for position in range(min(len(clf_names), len(score))):
    name, scr = clf_names[position], score[position]
    print('{}: {:.3f}'.format(name, scr))
In [64]:
# Second connection created with the same helper call as at the top of the
# notebook; it rebinds `engine`.
engine = cadb.connect_aws_db(write_unicode=True)
In [66]:
# City key with underscores; they are replaced by spaces when building the query.
city = 'palo_alto'
In [71]:
# Review count per business for hotels in the chosen city.
# The city is spliced into the SQL text by string concatenation.
# NOTE(review): switch to a bound parameter if `city` can ever come from
# outside the notebook.
cmd = 'select h.hotel_id, h.business_id, count(*) as count from '
cmd += 'ta_reviews r inner join ta_hotels h on r.business_id = '
cmd += 'h.business_id where h.hotel_city = "'
cmd += (' ').join(city.split('_'))+'" '
cmd += 'GROUP BY r.business_id'
cmd
Out[71]:
In [72]:
pd.read_sql_query(cmd, engine)
Out[72]:
In [86]:
# Distinct business ids that have at least one review for hotels in `city`.
# The query text is built by the same string concatenation as the count query.
cmd = 'select distinct r.business_id from '
cmd += 'ta_reviews r inner join ta_hotels h on r.business_id = '
cmd += 'h.business_id where h.hotel_city = "'
cmd += (' ').join(city.split('_'))+'" '
cmd
Out[86]:
In [87]:
# Each result row holds a single column; convert it to a plain int.
[int(bid[0]) for bid in pd.read_sql_query(cmd, engine).values]
Out[87]:
In [73]:
bids = [1, 2, 5, 10, 20, 54325]
In [76]:
if 3 not in bids:
print('it is clear!')
else:
print('already exists')
In [89]:
# Spot checks of individual test reviews against the predictions at pred[10].
# NOTE(review): which classifier and feature set pred[10] refers to depends on
# the benchmark run that was executed last — confirm before reading conclusions
# from these cells.
# Test reviews rated 5 but predicted as 1.
np.where((y_test == 5) & (pred[10] == 1))
Out[89]:
In [98]:
len(test_data)
Out[98]:
In [99]:
test_data[47]
Out[99]:
In [100]:
test_data[354]
Out[100]:
In [101]:
# Test reviews rated 1 but predicted as 5.
np.where((y_test == 1) & (pred[10] == 5))
Out[101]:
In [102]:
# Test reviews rated 5 and predicted as 5.
np.where((y_test == 5) & (pred[10] == 5))
Out[102]:
In [104]:
test_data[4]
Out[104]:
In [ ]: