In [42]:
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20. Alias sklearn.model_selection to the old name so the cells below
# (cross_validation.cross_val_score, train_test_split) work on both old and
# new versions without further changes.
try:
    from sklearn import model_selection as cross_validation
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation
    from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import connect_aws_db as cadb
In [2]:
%matplotlib inline
In [3]:
# Open a SQLAlchemy engine to the project's AWS database
# (project-local helper; connection details live in connect_aws_db).
engine = cadb.connect_aws_db(write_unicode=True)
In [4]:
# Class names for the two review sources; only referenced by
# commented-out reporting code further down.
categories = ['dogs', 'general']
In [5]:
# Dog-related ("BringFido") reviews.
cmd = "SELECT review_rating, review_text FROM bf_reviews"
In [6]:
bfdf = pd.read_sql_query(cmd, engine)
In [7]:
print(len(bfdf))
bfdf.head(5)
Out[7]:
In [8]:
# How many reviews are long enough (> 300 chars) to use as training text?
len(bfdf[bfdf['review_text'].str.len() > 300])
Out[8]:
In [9]:
# Keep only the long BringFido reviews; .copy() avoids
# SettingWithCopyWarning on later writes to the slice.
lbfdf = bfdf[bfdf['review_text'].str.len() > 300].copy()
In [10]:
# General (Yelp) reviews.
cmd = "SELECT review_rating, review_text FROM yelp_reviews"
In [11]:
yelpdf = pd.read_sql_query(cmd, engine)
In [12]:
print(len(yelpdf))
yelpdf.head(5)
Out[12]:
In [13]:
len(yelpdf[yelpdf['review_text'].str.len() > 300])
Out[13]:
In [14]:
# Long Yelp reviews, same 300-character cutoff as the BringFido set.
lydf = yelpdf[yelpdf['review_text'].str.len() > 300].copy()
In [48]:
# OLD WAY OF SPLITTING TRAINING AND TEST DATA:
##################################################
# train_data = np.hstack((lbfdf['review_text'].values[:len_revs],
# lydf['review_text'].values[:len_revs]))
# len(train_data)
# labels = ['dog'] * len_revs
# labels.extend(['general'] * len_revs)
# len(labels)
# y_train = labels
# test_data = np.hstack((lbfdf['review_text'].values[1000:1500],
# lydf['review_text'].values[1000:1500]))
# len(test_data)
# labels = ['dog'] * 500
# labels.extend(['general'] * 500)
# y_test = labels
# len(y_test)
In [49]:
# Use every long BringFido review; the Yelp side is trimmed to the same
# count below to keep the two classes balanced.
len_revs = len(lbfdf)
In [50]:
# All texts, BringFido first then Yelp.
# NOTE(review): assumes lydf has at least len_revs rows — confirm.
data_revs = np.hstack((lbfdf['review_text'].values[:len_revs],
lydf['review_text'].values[:len_revs]))
In [51]:
len(data_revs)
Out[51]:
In [52]:
# Labels aligned with data_revs: first len_revs 'dog', then len_revs 'general'.
labels = ['dog'] * len_revs
labels.extend(['general'] * len_revs)
In [53]:
len(labels)
Out[53]:
In [54]:
# Random 67/33 split; fixed random_state makes the split reproducible.
train_data, test_data, y_train, y_test = train_test_split(
data_revs, labels, test_size=0.33, random_state=18)
In [55]:
# Fit the TF-IDF vocabulary on the training split only; the test split is
# transformed with the same vocabulary in the next cell.
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
In [56]:
t0 = time()
# transform (not fit_transform): reuse the training vocabulary/IDF weights.
X_test = vectorizer.transform(test_data)
duration = time() - t0
print('transformed test data in {:.2f} seconds.'.format(duration))
In [57]:
# Vocabulary terms aligned with the TF-IDF feature columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favor of get_feature_names_out(); support both so the cell survives an
# sklearn upgrade.
try:
    feature_names = np.asarray(vectorizer.get_feature_names_out())
except AttributeError:  # scikit-learn < 1.0
    feature_names = np.asarray(vectorizer.get_feature_names())
In [58]:
# Quick single-classifier sanity check before running the full benchmark loop.
clf = RidgeClassifier(tol=1e-2, solver="lsqr")
In [59]:
print(clf)
In [60]:
clf.fit(X_train, y_train)
Out[60]:
In [61]:
pred = clf.predict(X_test)
In [62]:
print(len(y_test))
print(len(pred))
In [63]:
# pos_label=None with string labels averages the F1 over both classes
# (older-scikit-learn behavior — NOTE(review): newer versions require
# average='weighted' or an explicit pos_label here; verify on upgrade).
score = metrics.f1_score(y_test, pred, labels=None, pos_label=None)
acc = metrics.accuracy_score(y_test, pred, normalize=True)
In [64]:
len(y_test) == len(pred)
Out[64]:
In [65]:
#help(metrics.f1_score)
In [66]:
def benchmark(clf, pos_label=None):
    """Train and evaluate one classifier on the module-level data split.

    Relies on the globals X_train, y_train, X_test, y_test built by the
    vectorization cells above (hidden notebook state — run those first).

    Parameters
    ----------
    clf : scikit-learn estimator
        Anything with fit/predict.
    pos_label : optional
        Passed through to metrics.f1_score; None averages over both
        string classes (older-scikit-learn behavior).

    Returns
    -------
    tuple
        (classifier_name, f1_score, accuracy, train_time, test_time)
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # Fit, timing the training pass.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    # Predict on the held-out split, timing inference.
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred, pos_label=pos_label)
    print("f1-score: %0.3f" % score)
    acc = metrics.accuracy_score(y_test, pred, normalize=True)
    print('accuracy: {:.2f}'.format(acc))

    # Linear models expose coef_; report their feature-space footprint.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
    print()
    print()

    # Estimator class name only, e.g. "RidgeClassifier".
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, acc, train_time, test_time
In [67]:
# Benchmark a spread of linear / neighbor / tree classifiers on the same split.
# NOTE(review): n_iter was renamed max_iter in scikit-learn 0.19 and removed
# in 0.21 — update Perceptron/PassiveAggressive on upgrade.
results = []
for clf, name in (
(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
(Perceptron(n_iter=50), "Perceptron"),
(PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
(KNeighborsClassifier(n_neighbors=10), "kNN"),
(RandomForestClassifier(n_estimators=10), 'RandomForest')):
print('=' * 80)
print(name)
results.append(benchmark(clf))
In [68]:
# Accumulated (name, f1, accuracy, train_time, test_time) tuples so far.
results
Out[68]:
In [69]:
# Compare L2 vs L1 regularization for LinearSVC and SGD on the same split.
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model. loss='l2' is the deprecated alias of
    # 'squared_hinge' (renamed in scikit-learn 0.16, alias later removed);
    # use the canonical name so newer versions don't reject it.
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))
    # Train SGD model with the same penalty.
    # NOTE(review): n_iter became max_iter in scikit-learn 0.19 — update on upgrade.
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))
In [70]:
results
Out[70]:
In [71]:
# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
penalty="elasticnet")))
# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
In [72]:
# Running list of benchmark tuples (one per classifier trained so far).
results
Out[72]:
In [73]:
# LinearSVC variant that first shrinks the feature space with an L1-penalized
# LinearSVC, then fits a standard LinearSVC on the reduced matrix.
# NOTE(review): LinearSVC.transform/fit_transform was deprecated in
# scikit-learn 0.17 and removed in 0.19; modern code should use
# SelectFromModel(LinearSVC(penalty="l1", ...)) instead — confirm before upgrading.
class L1LinearSVC(LinearSVC):
def fit(self, X, y):
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
self.transformer_ = LinearSVC(penalty="l1",
dual=False, tol=1e-3)
X = self.transformer_.fit_transform(X, y)
return LinearSVC.fit(self, X, y)
def predict(self, X):
# Project test data through the same L1 selector before predicting.
X = self.transformer_.transform(X)
return LinearSVC.predict(self, X)
print('=' * 80)
print("LinearSVC with L1-based feature selection")
results.append(benchmark(L1LinearSVC()))
In [74]:
# One x-position per benchmarked classifier, then transpose the list of
# (name, f1, accuracy, train_time, test_time) tuples into five parallel lists.
indices = np.arange(len(results))
results = [list(column) for column in zip(*results)]
In [75]:
clf_names, score, acc, training_time, test_time = results
# Normalize timings to [0, 1] for plotting.
# NOTE(review): this rebinds training_time/test_time to the normalized
# arrays; the cell at In [81] re-unpacks `results` to recover raw values —
# hidden-state hazard if cells are run out of order.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)
In [76]:
clf_names
Out[76]:
In [77]:
# Horizontal bar chart: score vs (normalized) train/test time per classifier.
# 'normal' is a font *style*, not a family; matplotlib cannot resolve a
# family named 'normal' and emits findfont warnings. Use the generic
# 'sans-serif' family instead.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 16}
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = 12.94, 8
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='#982023')
plt.barh(indices + .3, training_time, .2, label="training time", color='#46959E')
plt.barh(indices + .6, test_time, .2, label="test time", color='#C7B077')
plt.yticks(())  # classifier names are drawn as text labels below instead
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)
plt.ylim(0, 17)
print(indices)
# Label each bar group with its classifier name, right-aligned at the axis.
for i, c in zip(indices, clf_names):
    plt.text(-0.025, i, c, horizontalalignment='right')
In [78]:
clf_names
Out[78]:
In [79]:
# Shorten the auto-generated class names for readable axis labels.
# Indices 1 (Perceptron) and 10-12 (NearestCentroid, MultinomialNB,
# BernoulliNB) keep their original names.
clf_names[0] = 'Ridge'
clf_names[2] = 'PassAggress'
clf_names[3] = 'KNN'
clf_names[4] = 'RandomForest'
clf_names[5] = 'LinearSVC L2'
clf_names[6] = 'SGDC SVM L2'
clf_names[7] = 'LinearSVC L1'
clf_names[8] = 'SGDC L1'
clf_names[9] = 'SGDC ElNet'
clf_names[13] = 'LinearSVC L1FS'
In [80]:
# Line plot of f-score / accuracy / normalized timings per classifier.
fig, ax = plt.subplots(1, 1)
# NOTE(review): training_time/test_time were already normalized in In [75],
# so this second division is by 1.0 (a no-op) under in-order execution.
training_timen = np.array(training_time) / np.max(training_time)
test_timen = np.array(test_time) / np.max(test_time)
ax.plot(indices, score, '-o', label="f-score", color='#982023')
ax.plot(indices, acc, '-o', label="accuracy", color='#BA4C37')
ax.plot(indices, training_timen, '-o', label="training time", color='#46959E')
ax.plot(indices, test_timen, '-o', label="test time", color='#C7B077')
#labels = [item.get_text() for item in ax.get_xticklabels()]
# NOTE(review): `labels` here is still the training-label list from In [52],
# so this print reports that stale list's length, not tick labels.
print(len(labels))
print(len(clf_names))
labels = clf_names
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
ax.set_xticklabels(clf_names, rotation='70', horizontalalignment='right')
ax.set_xlim([-1, 14])
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
#plt.savefig('classifierScoresNorm.png', dpi=144)
In [81]:
# Same plot with raw (seconds) timings: re-unpack `results` to undo the
# in-place normalization done in In [75].
fig, ax = plt.subplots(1, 1)
clf_names, score, accs, training_time, test_time = results
ax.plot(indices, score, '-o', label="score", color='#982023')
ax.plot(indices, accs, '-o', label="accuracy", color='#BA4C37')
ax.plot(indices, training_time, '-o', label="training time (s)", color='#46959E')
ax.plot(indices, test_time, '-o', label="test time (s)", color='#C7B077')
#labels = [item.get_text() for item in ax.get_xticklabels()]
print(len(labels))
print(len(clf_names))
labels = clf_names
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
ax.set_xticklabels(clf_names, rotation='70', horizontalalignment='right')
ax.set_xlim([-1, 14])
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
#plt.savefig('classifierScores.png', dpi=144)
In [82]:
# Text summary: name, f1, accuracy for each benchmarked classifier.
for name, scr, acc in zip(clf_names, score, accs):
print('{}: {:.3f}, {:.3f}'.format(name, scr, acc))
In [83]:
#clf = RidgeClassifier(tol=1e-2, solver="lsqr")
# Pin the penalty explicitly: the original used the leftover loop variable
# `penalty`, which happens to be 'l1' after the penalty-comparison loop in
# In [69] — a hidden-state bug under out-of-order execution. 'l1' preserves
# the value that was actually in effect.
clf = LinearSVC(loss='l2', penalty='l1', dual=False, tol=1e-3)
In [84]:
# Number of long BringFido reviews (1508 when this notebook was run) —
# used to size the balanced full dataset below.
len(lbfdf['review_text'].values)
Out[84]:
In [85]:
# make a set containing all the data:
# All BringFido reviews followed by an equal number of Yelp reviews.
# Use len_revs (= len(lbfdf), 1508 at the time this ran) rather than the
# hard-coded magic number so the set stays balanced if the data changes.
fulldat = np.hstack((lbfdf['review_text'].values,
                     lydf['review_text'].values[:len_revs]))
In [86]:
# Labels aligned with fulldat: first half 'dog', second half 'general'.
# Use len_revs (= len(lbfdf), 1508 when this ran) instead of a hard-coded
# count so the labels always match the data built above.
labels = ['dog'] * len_revs
labels.extend(['general'] * len_revs)
fulltarget = labels
In [87]:
# NOTE(review): this cell appears redundant — the cross-validated Pipeline
# below refits TF-IDF inside each fold, and X_train/X_test computed here are
# not used afterwards.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)
In [88]:
# Full pipeline: TF-IDF -> L1 LinearSVC feature selection -> LinearSVC.
# NOTE(review): using LinearSVC as a mid-pipeline transformer relies on its
# transform method, removed in scikit-learn 0.19; modern code needs
# SelectFromModel here — confirm before upgrading.
clf = Pipeline([
('vectfidf', TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
('classification', LinearSVC())])
In [89]:
# 10-fold CV over the full (text, label) set; vectorization happens inside
# each fold, so there is no train/test leakage.
scores = cross_validation.cross_val_score(clf, fulldat, fulltarget, cv=10)
print(scores)
In [90]:
print(np.median(scores))
In [215]:
# Variant without the separate classification step: the L1-penalized
# LinearSVC is itself the final estimator being cross-validated.
clf = Pipeline([
('vectfidf', TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
])
In [216]:
scores = cross_validation.cross_val_score(clf, fulldat, fulltarget, cv=10)
print(scores)
print(np.median(scores))
In [91]:
# Inspect the hyperparameters of the (most recently assigned) pipeline.
pars = clf.get_params()
In [97]:
pars.values()
Out[97]: