In [42]:
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20. Alias sklearn.model_selection to the old name so the cells below
# (cross_validation.cross_val_score, train_test_split) work on both old and
# new versions without further changes.
try:
    from sklearn import model_selection as cross_validation
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation
    from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import connect_aws_db as cadb
In [2]:
%matplotlib inline
In [3]:
# Open a SQLAlchemy engine to the project's AWS database
# (project-local helper; connection details live in connect_aws_db).
engine = cadb.connect_aws_db(write_unicode=True)
In [4]:
# Class names for the two review sources; only referenced by
# commented-out reporting code further down.
categories = ['dogs', 'general']
In [5]:
# Dog-related ("BringFido") reviews.
cmd = "SELECT review_rating, review_text FROM bf_reviews"
In [6]:
bfdf = pd.read_sql_query(cmd, engine)
In [7]:
print(len(bfdf))
bfdf.head(5)
Out[7]:
In [8]:
# How many reviews are long enough (> 300 chars) to use as training text?
len(bfdf[bfdf['review_text'].str.len() > 300])
Out[8]:
In [9]:
# Keep only the long BringFido reviews; .copy() avoids
# SettingWithCopyWarning on later writes to the slice.
lbfdf = bfdf[bfdf['review_text'].str.len() > 300].copy()
In [10]:
# General (Yelp) reviews.
cmd = "SELECT review_rating, review_text FROM yelp_reviews"
In [11]:
yelpdf = pd.read_sql_query(cmd, engine)
In [12]:
print(len(yelpdf))
yelpdf.head(5)
Out[12]:
In [13]:
len(yelpdf[yelpdf['review_text'].str.len() > 300])
Out[13]:
In [14]:
# Long Yelp reviews, same 300-character cutoff as the BringFido set.
lydf = yelpdf[yelpdf['review_text'].str.len() > 300].copy()
In [48]:
# OLD WAY OF SPLITTING TRAINING AND TEST DATA:
##################################################
# train_data = np.hstack((lbfdf['review_text'].values[:len_revs],
# lydf['review_text'].values[:len_revs]))
# len(train_data)
# labels = ['dog'] * len_revs
# labels.extend(['general'] * len_revs)
# len(labels)
# y_train = labels
# test_data = np.hstack((lbfdf['review_text'].values[1000:1500],
# lydf['review_text'].values[1000:1500]))
# len(test_data)
# labels = ['dog'] * 500
# labels.extend(['general'] * 500)
# y_test = labels
# len(y_test)
In [49]:
# Use every long BringFido review; the Yelp side is trimmed to the same
# count below to keep the two classes balanced.
len_revs = len(lbfdf)
In [50]:
# All texts, BringFido first then Yelp.
# NOTE(review): assumes lydf has at least len_revs rows — confirm.
data_revs = np.hstack((lbfdf['review_text'].values[:len_revs],
lydf['review_text'].values[:len_revs]))
In [51]:
len(data_revs)
Out[51]:
In [52]:
# Labels aligned with data_revs: first len_revs 'dog', then len_revs 'general'.
labels = ['dog'] * len_revs
labels.extend(['general'] * len_revs)
In [53]:
len(labels)
Out[53]:
In [54]:
# Random 67/33 split; fixed random_state makes the split reproducible.
train_data, test_data, y_train, y_test = train_test_split(
data_revs, labels, test_size=0.33, random_state=18)
In [55]:
# Fit the TF-IDF vocabulary on the training split only; the test split is
# transformed with the same vocabulary in the next cell.
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))
In [56]:
t0 = time()
# transform (not fit_transform): reuse the training vocabulary/IDF weights.
X_test = vectorizer.transform(test_data)
duration = time() - t0
print('transformed test data in {:.2f} seconds.'.format(duration))
In [57]:
# Vocabulary terms aligned with the TF-IDF feature columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favor of get_feature_names_out(); support both so the cell survives an
# sklearn upgrade.
try:
    feature_names = np.asarray(vectorizer.get_feature_names_out())
except AttributeError:  # scikit-learn < 1.0
    feature_names = np.asarray(vectorizer.get_feature_names())
In [58]:
# Quick single-classifier sanity check before running the full benchmark loop.
clf = RidgeClassifier(tol=1e-2, solver="lsqr")
In [59]:
print(clf)
In [60]:
clf.fit(X_train, y_train)
Out[60]:
In [61]:
pred = clf.predict(X_test)
In [62]:
print(len(y_test))
print(len(pred))
In [63]:
# pos_label=None with string labels averages the F1 over both classes
# (older-scikit-learn behavior — NOTE(review): newer versions require
# average='weighted' or an explicit pos_label here; verify on upgrade).
score = metrics.f1_score(y_test, pred, labels=None, pos_label=None)
acc = metrics.accuracy_score(y_test, pred, normalize=True)
In [64]:
len(y_test) == len(pred)
Out[64]:
In [65]:
#help(metrics.f1_score)
In [66]:
def benchmark(clf, pos_label=None):
    """Train and evaluate one classifier on the module-level data split.

    Relies on the globals X_train, y_train, X_test, y_test built by the
    vectorization cells above (hidden notebook state — run those first).

    Parameters
    ----------
    clf : scikit-learn estimator
        Anything with fit/predict.
    pos_label : optional
        Passed through to metrics.f1_score; None averages over both
        string classes (older-scikit-learn behavior).

    Returns
    -------
    tuple
        (classifier_name, f1_score, accuracy, train_time, test_time)
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # Fit, timing the training pass.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    # Predict on the held-out split, timing inference.
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred, pos_label=pos_label)
    print("f1-score: %0.3f" % score)
    acc = metrics.accuracy_score(y_test, pred, normalize=True)
    print('accuracy: {:.2f}'.format(acc))

    # Linear models expose coef_; report their feature-space footprint.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
    print()
    print()

    # Estimator class name only, e.g. "RidgeClassifier".
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, acc, train_time, test_time
In [67]:
# Benchmark a spread of linear / neighbor / tree classifiers on the same split.
# NOTE(review): n_iter was renamed max_iter in scikit-learn 0.19 and removed
# in 0.21 — update Perceptron/PassiveAggressive on upgrade.
results = []
for clf, name in (
(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
(Perceptron(n_iter=50), "Perceptron"),
(PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
(KNeighborsClassifier(n_neighbors=10), "kNN"),
(RandomForestClassifier(n_estimators=10), 'RandomForest')):
print('=' * 80)
print(name)
results.append(benchmark(clf))
In [68]:
# Accumulated (name, f1, accuracy, train_time, test_time) tuples so far.
results
Out[68]:
In [69]:
# Compare L2 vs L1 regularization for LinearSVC and SGD on the same split.
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model. loss='l2' is the deprecated alias of
    # 'squared_hinge' (renamed in scikit-learn 0.16, alias later removed);
    # use the canonical name so newer versions don't reject it.
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))
    # Train SGD model with the same penalty.
    # NOTE(review): n_iter became max_iter in scikit-learn 0.19 — update on upgrade.
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))
In [70]:
results
Out[70]:
In [71]:
# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
penalty="elasticnet")))
# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
In [72]:
# Running list of benchmark tuples (one per classifier trained so far).
results
Out[72]:
In [73]:
# LinearSVC variant that first shrinks the feature space with an L1-penalized
# LinearSVC, then fits a standard LinearSVC on the reduced matrix.
# NOTE(review): LinearSVC.transform/fit_transform was deprecated in
# scikit-learn 0.17 and removed in 0.19; modern code should use
# SelectFromModel(LinearSVC(penalty="l1", ...)) instead — confirm before upgrading.
class L1LinearSVC(LinearSVC):
def fit(self, X, y):
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
self.transformer_ = LinearSVC(penalty="l1",
dual=False, tol=1e-3)
X = self.transformer_.fit_transform(X, y)
return LinearSVC.fit(self, X, y)
def predict(self, X):
# Project test data through the same L1 selector before predicting.
X = self.transformer_.transform(X)
return LinearSVC.predict(self, X)
print('=' * 80)
print("LinearSVC with L1-based feature selection")
results.append(benchmark(L1LinearSVC()))
In [74]:
# One x-position per benchmarked classifier, then transpose the list of
# (name, f1, accuracy, train_time, test_time) tuples into five parallel lists.
indices = np.arange(len(results))
results = [list(column) for column in zip(*results)]
In [75]:
clf_names, score, acc, training_time, test_time = results
# Normalize timings to [0, 1] for plotting.
# NOTE(review): this rebinds training_time/test_time to the normalized
# arrays; the cell at In [81] re-unpacks `results` to recover raw values —
# hidden-state hazard if cells are run out of order.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)
In [76]:
clf_names
Out[76]:
In [77]:
# Horizontal bar chart: score vs (normalized) train/test time per classifier.
# 'normal' is a font *style*, not a family; matplotlib cannot resolve a
# family named 'normal' and emits findfont warnings. Use the generic
# 'sans-serif' family instead.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 16}
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = 12.94, 8
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='#982023')
plt.barh(indices + .3, training_time, .2, label="training time", color='#46959E')
plt.barh(indices + .6, test_time, .2, label="test time", color='#C7B077')
plt.yticks(())  # classifier names are drawn as text labels below instead
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)
plt.ylim(0, 17)
print(indices)
# Label each bar group with its classifier name, right-aligned at the axis.
for i, c in zip(indices, clf_names):
    plt.text(-0.025, i, c, horizontalalignment='right')
In [78]:
clf_names
Out[78]:
In [79]:
# Shorten the auto-generated class names for readable axis labels.
# Indices 1 (Perceptron) and 10-12 (NearestCentroid, MultinomialNB,
# BernoulliNB) keep their original names.
clf_names[0] = 'Ridge'
clf_names[2] = 'PassAggress'
clf_names[3] = 'KNN'
clf_names[4] = 'RandomForest'
clf_names[5] = 'LinearSVC L2'
clf_names[6] = 'SGDC SVM L2'
clf_names[7] = 'LinearSVC L1'
clf_names[8] = 'SGDC L1'
clf_names[9] = 'SGDC ElNet'
clf_names[13] = 'LinearSVC L1FS'
In [80]:
# Line plot of f-score / accuracy / normalized timings per classifier.
fig, ax = plt.subplots(1, 1)
# NOTE(review): training_time/test_time were already normalized in In [75],
# so this second division is by 1.0 (a no-op) under in-order execution.
training_timen = np.array(training_time) / np.max(training_time)
test_timen = np.array(test_time) / np.max(test_time)
ax.plot(indices, score, '-o', label="f-score", color='#982023')
ax.plot(indices, acc, '-o', label="accuracy", color='#BA4C37')
ax.plot(indices, training_timen, '-o', label="training time", color='#46959E')
ax.plot(indices, test_timen, '-o', label="test time", color='#C7B077')
#labels = [item.get_text() for item in ax.get_xticklabels()]
# NOTE(review): `labels` here is still the training-label list from In [52],
# so this print reports that stale list's length, not tick labels.
print(len(labels))
print(len(clf_names))
labels = clf_names
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
ax.set_xticklabels(clf_names, rotation='70', horizontalalignment='right')
ax.set_xlim([-1, 14])
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
#plt.savefig('classifierScoresNorm.png', dpi=144)
In [81]:
# Same plot with raw (seconds) timings: re-unpack `results` to undo the
# in-place normalization done in In [75].
fig, ax = plt.subplots(1, 1)
clf_names, score, accs, training_time, test_time = results
ax.plot(indices, score, '-o', label="score", color='#982023')
ax.plot(indices, accs, '-o', label="accuracy", color='#BA4C37')
ax.plot(indices, training_time, '-o', label="training time (s)", color='#46959E')
ax.plot(indices, test_time, '-o', label="test time (s)", color='#C7B077')
#labels = [item.get_text() for item in ax.get_xticklabels()]
print(len(labels))
print(len(clf_names))
labels = clf_names
ax.xaxis.set_ticks(np.arange(np.min(indices), np.max(indices)+1, 1))
ax.set_xticklabels(clf_names, rotation='70', horizontalalignment='right')
ax.set_xlim([-1, 14])
ax.set_ylim([0, 1])
ax.legend(loc='best')
plt.subplots_adjust(left=0.05, bottom=0.3, top=.98)
#plt.savefig('classifierScores.png', dpi=144)
In [82]:
# Text summary: name, f1, accuracy for each benchmarked classifier.
for name, scr, acc in zip(clf_names, score, accs):
print('{}: {:.3f}, {:.3f}'.format(name, scr, acc))
In [83]:
#clf = RidgeClassifier(tol=1e-2, solver="lsqr")
# Pin the penalty explicitly: the original used the leftover loop variable
# `penalty`, which happens to be 'l1' after the penalty-comparison loop in
# In [69] — a hidden-state bug under out-of-order execution. 'l1' preserves
# the value that was actually in effect.
clf = LinearSVC(loss='l2', penalty='l1', dual=False, tol=1e-3)
In [84]:
# Number of long BringFido reviews (1508 when this notebook was run) —
# used to size the balanced full dataset below.
len(lbfdf['review_text'].values)
Out[84]:
In [85]:
# make a set containing all the data:
# All BringFido reviews followed by an equal number of Yelp reviews.
# Use len_revs (= len(lbfdf), 1508 at the time this ran) rather than the
# hard-coded magic number so the set stays balanced if the data changes.
fulldat = np.hstack((lbfdf['review_text'].values,
                     lydf['review_text'].values[:len_revs]))
In [86]:
# Labels aligned with fulldat: first half 'dog', second half 'general'.
# Use len_revs (= len(lbfdf), 1508 when this ran) instead of a hard-coded
# count so the labels always match the data built above.
labels = ['dog'] * len_revs
labels.extend(['general'] * len_revs)
fulltarget = labels
In [87]:
# NOTE(review): this cell appears redundant — the cross-validated Pipeline
# below refits TF-IDF inside each fold, and X_train/X_test computed here are
# not used afterwards.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)
In [88]:
# Full pipeline: TF-IDF -> L1 LinearSVC feature selection -> LinearSVC.
# NOTE(review): using LinearSVC as a mid-pipeline transformer relies on its
# transform method, removed in scikit-learn 0.19; modern code needs
# SelectFromModel here — confirm before upgrading.
clf = Pipeline([
('vectfidf', TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
('classification', LinearSVC())])
In [89]:
# 10-fold CV over the full (text, label) set; vectorization happens inside
# each fold, so there is no train/test leakage.
scores = cross_validation.cross_val_score(clf, fulldat, fulltarget, cv=10)
print(scores)
In [90]:
print(np.median(scores))
In [215]:
# Variant without the separate classification step: the L1-penalized
# LinearSVC is itself the final estimator being cross-validated.
clf = Pipeline([
('vectfidf', TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
])
In [216]:
scores = cross_validation.cross_val_score(clf, fulldat, fulltarget, cv=10)
print(scores)
print(np.median(scores))
In [91]:
# Inspect the hyperparameters of the (most recently assigned) pipeline.
pars = clf.get_params()
In [97]:
pars.values()
Out[97]: