In [21]:
# Author: Huade Huo <hh561@georegtown.edu>
# License: MIT

print "Initializing ..."
from time import time
t0 = time()
import pandas as pd
import numpy as np
import nltk

from sklearn import cross_validation as cv
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

# Load data

print "Loading data set... "
sr = pd.io.parsers.read_csv("../3-MergedData/SR_merged.csv")[["speaking","vv_score","Ideology"]]


# Remove NaN
print "Removing NaN... "
sr = sr[pd.notnull(sr['speaking'])]
print "Done in %0.3fs, %i rows available" % ((time() - t0) , len(sr))

# Feature extraction
print "Starting feature extraction... "
t0 = time()
## Bigram
bigrams_vectorizer = CountVectorizer(ngram_range=(2,2),
                                    token_pattern=r'\b\w+\b', min_df=0.005)
## Trigram
trigrams_vectorizer = CountVectorizer(ngram_range=(3,3),
                                      token_pattern=r'\b\w+\b', min_df=0.002)

## Vector
print "Bigrams vectorizer initialized ... "
X_bigrams = bigrams_vectorizer.fit_transform(sr.speaking).toarray()

print "Trigrams vectorizer initialized... "
X_trigrams = trigrams_vectorizer.fit_transform(sr.speaking).toarray()

## Feature names
print "Saving feature names ... "
bigrams_feature_names = bigrams_vectorizer.get_feature_names()
trigrams_feature_names = trigrams_vectorizer.get_feature_names()

print "Done in %0.3fs, %i bigrams and %i trigrams available" % ((time() - t0) , len(bigrams_feature_names),
                                                            len(trigrams_feature_names))

# Feature selection
print "Starting feature selection... "
t0 = time()

## Bigrams
print "Selecting bigrams based on chi2... "
ch2 = SelectKBest(chi2, k=500)
X_bigrams_selected = ch2.fit_transform(X_bigrams, sr.Ideology)
Chi2_Selected_bigrams = [bigrams_feature_names[i] for i in ch2.get_support(indices=True)]

## Trigrams
print "Selecting trigrams based on chi2... "
X_trigrams_selected = ch2.fit_transform(X_trigrams, sr.Ideology)
Chi2_Selected_trigrams = [trigrams_feature_names[i] for i in ch2.get_support(indices=True)]
print "Done in %0.3fs" % (time() - t0)



# Split train test data

X_train_Bigrams, X_test_Bigrams, y_train, y_test = cv.train_test_split(
    X_bigrams_selected, sr.Ideology, test_size=0.4, random_state=0)
X_train_Trigrams, X_test_Trigrams, y_train, y_test = cv.train_test_split(
    X_trigrams_selected, sr.Ideology, test_size=0.4, random_state=0)

print "Preprocessing completed"

def benchmark_bigrams(clf,X_train = X_train_Trigrams, X_test = X_test_Trigrams, feature_names = Chi2_Selected_trigrams):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                            target_names=['R','D']))


    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))


    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

def benchmark_trigrams(clf,X_train = X_train_Bigrams, X_test = X_test_Bigrams, feature_names = Chi2_Selected_bigrams):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                            target_names=['R','D']))


    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))


    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

# Bigrams
print('=' * 80)
print('=' * 80)
print('Bigrams')
print('=' * 80)
print('=' * 80)
results_bigrams = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results_bigrams.append(benchmark_bigrams(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results_bigrams.append(benchmark_bigrams(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results_bigrams.append(benchmark_bigrams(SGDClassifier(alpha=.0001, n_iter=50,penalty=penalty)))


# Train SGD with Elastic Net penalty

print('=' * 80)

print("Elastic-Net penalty")
results_bigrams.append(benchmark_bigrams(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results_bigrams.append(benchmark_bigrams(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results_bigrams.append(benchmark_bigrams(MultinomialNB(alpha=.01)))
results_bigrams.append(benchmark_bigrams(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results_bigrams.append(benchmark_bigrams(Pipeline([
  ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
  ('classification', LinearSVC())
])))

# Trigrams
print('=' * 80)
print('=' * 80)
print('trigrams')
print('=' * 80)
print('=' * 80)
results_trigrams = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results_trigrams.append(benchmark_trigrams(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results_trigrams.append(benchmark_trigrams(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results_trigrams.append(benchmark_trigrams(SGDClassifier(alpha=.0001, n_iter=50,penalty=penalty)))

# Train SGD with Elastic Net penalty

print('=' * 80)

print("Elastic-Net penalty")
results_trigrams.append(benchmark_trigrams(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results_trigrams.append(benchmark_trigrams(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results_trigrams.append(benchmark_trigrams(MultinomialNB(alpha=.01)))
results_trigrams.append(benchmark_trigrams(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results_trigrams.append(benchmark_trigrams(Pipeline([
  ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
  ('classification', LinearSVC())
])))


Initializing ...
Loading data set... 
Removing NaN... 
Done in 0.231s, 25608 rows available
Starting feature extraction... 
Bigrams vectorizer initialized ... 
Trigrams vectorizer initialized... 
Saving feature names ... 
Done in 48.715s, 1208 bigrams and 791 trigrams available
Starting feature selection... 
Selecting bigrams based on chi2... 
Selecting trigrams based on chi2... 
Done in 0.715s
Preprocessing completed
================================================================================
================================================================================
Bigrams
================================================================================
================================================================================
================================================================================
Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)
train time: 0.217s
test time:  0.019s
accuracy:   0.723
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.78      0.53      0.63      4565
          D       0.70      0.88      0.78      5679

avg / total       0.73      0.72      0.71     10244

confusion matrix:
[[2416 2149]
 [ 685 4994]]
================================================================================
Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=False,
      verbose=0, warm_start=False)
train time: 0.694s
test time:  0.019s
accuracy:   0.674
dimensionality: 500
density: 0.870000
classification report:
             precision    recall  f1-score   support

          R       0.59      0.87      0.70      4565
          D       0.83      0.52      0.64      5679

avg / total       0.72      0.67      0.67     10244

confusion matrix:
[[3949  616]
 [2723 2956]]
================================================================================
Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
              n_iter=50, n_jobs=1, random_state=None, shuffle=False,
              verbose=0, warm_start=False)
train time: 1.060s
test time:  0.019s
accuracy:   0.688
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.68      0.56      0.62      4565
          D       0.69      0.79      0.74      5679

avg / total       0.69      0.69      0.68     10244

confusion matrix:
[[2574 1991]
 [1210 4469]]
================================================================================
kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')
train time: 0.747s
test time:  119.162s
accuracy:   0.715
classification report:
             precision    recall  f1-score   support

          R       0.74      0.55      0.63      4565
          D       0.70      0.85      0.77      5679

avg / total       0.72      0.71      0.71     10244

confusion matrix:
[[2502 2063]
 [ 861 4818]]
================================================================================
Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)
train time: 18.256s
test time:  0.691s
accuracy:   0.725
classification report:
             precision    recall  f1-score   support

          R       0.78      0.54      0.64      4565
          D       0.70      0.88      0.78      5679

avg / total       0.74      0.73      0.72     10244

confusion matrix:
[[2451 2114]
 [ 701 4978]]
================================================================================
L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.001, verbose=0)
train time: 0.182s
test time:  0.019s
accuracy:   0.726
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.77      0.54      0.64      4565
          D       0.70      0.87      0.78      5679

avg / total       0.74      0.73      0.72     10244

confusion matrix:
[[2483 2082]
 [ 724 4955]]
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5,
       random_state=None, shuffle=False, verbose=0, warm_start=False)
train time: 0.804s
test time:  0.019s
accuracy:   0.726
dimensionality: 500
density: 0.998000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.52      0.63      4565
          D       0.70      0.89      0.78      5679

avg / total       0.74      0.73      0.71     10244

confusion matrix:
[[2359 2206]
 [ 597 5082]]
================================================================================
L1 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1',
     random_state=None, tol=0.001, verbose=0)
train time: 0.143s
test time:  0.019s
accuracy:   0.727
dimensionality: 500
density: 0.868000
classification report:
             precision    recall  f1-score   support

          R       0.78      0.54      0.64      4565
          D       0.70      0.87      0.78      5679

avg / total       0.74      0.73      0.72     10244

confusion matrix:
[[2486 2079]
 [ 720 4959]]
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='l1', power_t=0.5,
       random_state=None, shuffle=False, verbose=0, warm_start=False)
train time: 2.152s
test time:  0.019s
accuracy:   0.724
dimensionality: 500
density: 0.722000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.51      0.62      4565
          D       0.69      0.89      0.78      5679

avg / total       0.74      0.72      0.71     10244

confusion matrix:
[[2334 2231]
 [ 597 5082]]
================================================================================
Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=False, verbose=0,
       warm_start=False)
train time: 2.555s
test time:  0.019s
accuracy:   0.725
dimensionality: 500
density: 0.890000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.52      0.63      4565
          D       0.70      0.89      0.78      5679

avg / total       0.74      0.73      0.71     10244

confusion matrix:
[[2351 2214]
 [ 598 5081]]
================================================================================
NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid(metric='euclidean', shrink_threshold=None)
train time: 0.035s
test time:  0.034s
accuracy:   0.669
classification report:
             precision    recall  f1-score   support

          R       0.78      0.36      0.49      4565
          D       0.64      0.92      0.75      5679

avg / total       0.70      0.67      0.64     10244

confusion matrix:
[[1652 2913]
 [ 476 5203]]
================================================================================
Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.068s
test time:  0.027s
accuracy:   0.731
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.78      0.55      0.64      4565
          D       0.71      0.88      0.78      5679

avg / total       0.74      0.73      0.72     10244

confusion matrix:
[[2501 2064]
 [ 693 4986]]
________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
train time: 0.116s
test time:  0.064s
accuracy:   0.730
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.78      0.54      0.64      4565
          D       0.71      0.88      0.78      5679

avg / total       0.74      0.73      0.72     10244

confusion matrix:
[[2480 2085]
 [ 686 4993]]
================================================================================
LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(steps=[('feature_selection', LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1',
     random_state=None, tol=0.001, verbose=0)), ('classification', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0))])
train time: 1.307s
test time:  0.042s
accuracy:   0.726
classification report:
             precision    recall  f1-score   support

          R       0.77      0.54      0.64      4565
          D       0.70      0.87      0.78      5679

avg / total       0.74      0.73      0.72     10244

confusion matrix:
[[2484 2081]
 [ 724 4955]]
================================================================================
================================================================================
trigrams
================================================================================
================================================================================
================================================================================
Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)
train time: 0.224s
test time:  0.019s
accuracy:   0.758
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.61      0.69      4565
          D       0.74      0.88      0.80      5679

avg / total       0.77      0.76      0.75     10244

confusion matrix:
[[2772 1793]
 [ 685 4994]]
================================================================================
Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=False,
      verbose=0, warm_start=False)
train time: 0.643s
test time:  0.019s
accuracy:   0.733
dimensionality: 500
density: 0.932000
classification report:
             precision    recall  f1-score   support

          R       0.67      0.79      0.73      4565
          D       0.80      0.68      0.74      5679

avg / total       0.74      0.73      0.73     10244

confusion matrix:
[[3619  946]
 [1789 3890]]
================================================================================
Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
              n_iter=50, n_jobs=1, random_state=None, shuffle=False,
              verbose=0, warm_start=False)
train time: 1.103s
test time:  0.019s
accuracy:   0.634
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.56      0.88      0.68      4565
          D       0.82      0.44      0.57      5679

avg / total       0.70      0.63      0.62     10244

confusion matrix:
[[4017  548]
 [3205 2474]]
================================================================================
kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')
train time: 0.563s
test time:  104.550s
accuracy:   0.728
classification report:
             precision    recall  f1-score   support

          R       0.69      0.70      0.70      4565
          D       0.76      0.75      0.75      5679

avg / total       0.73      0.73      0.73     10244

confusion matrix:
[[3210 1355]
 [1428 4251]]
================================================================================
Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)
train time: 12.631s
test time:  0.445s
accuracy:   0.772
classification report:
             precision    recall  f1-score   support

          R       0.78      0.68      0.73      4565
          D       0.77      0.85      0.81      5679

avg / total       0.77      0.77      0.77     10244

confusion matrix:
[[3091 1474]
 [ 858 4821]]
================================================================================
L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.001, verbose=0)
train time: 0.176s
test time:  0.019s
accuracy:   0.775
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.79      0.67      0.73      4565
          D       0.76      0.86      0.81      5679

avg / total       0.78      0.78      0.77     10244

confusion matrix:
[[3052 1513]
 [ 790 4889]]
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5,
       random_state=None, shuffle=False, verbose=0, warm_start=False)
train time: 0.742s
test time:  0.019s
accuracy:   0.772
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.65      0.72      4565
          D       0.75      0.87      0.81      5679

avg / total       0.78      0.77      0.77     10244

confusion matrix:
[[2959 1606]
 [ 732 4947]]
================================================================================
L1 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1',
     random_state=None, tol=0.001, verbose=0)
train time: 0.268s
test time:  0.019s
accuracy:   0.775
dimensionality: 500
density: 0.980000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.67      0.73      4565
          D       0.76      0.86      0.81      5679

avg / total       0.78      0.78      0.77     10244

confusion matrix:
[[3042 1523]
 [ 777 4902]]
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='l1', power_t=0.5,
       random_state=None, shuffle=False, verbose=0, warm_start=False)
train time: 2.106s
test time:  0.019s
accuracy:   0.769
dimensionality: 500
density: 0.760000
classification report:
             precision    recall  f1-score   support

          R       0.79      0.66      0.72      4565
          D       0.76      0.86      0.80      5679

avg / total       0.77      0.77      0.77     10244

confusion matrix:
[[2991 1574]
 [ 795 4884]]
================================================================================
Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=False, verbose=0,
       warm_start=False)
train time: 2.525s
test time:  0.019s
accuracy:   0.771
dimensionality: 500
density: 0.900000
classification report:
             precision    recall  f1-score   support

          R       0.80      0.64      0.71      4565
          D       0.75      0.87      0.81      5679

avg / total       0.78      0.77      0.77     10244

confusion matrix:
[[2929 1636]
 [ 714 4965]]
================================================================================
NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid(metric='euclidean', shrink_threshold=None)
train time: 0.036s
test time:  0.033s
accuracy:   0.673
classification report:
             precision    recall  f1-score   support

          R       0.60      0.80      0.68      4565
          D       0.78      0.57      0.66      5679

avg / total       0.70      0.67      0.67     10244

confusion matrix:
[[3634  931]
 [2418 3261]]
================================================================================
Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.067s
test time:  0.027s
accuracy:   0.764
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.78      0.65      0.71      4565
          D       0.75      0.85      0.80      5679

avg / total       0.77      0.76      0.76     10244

confusion matrix:
[[2986 1579]
 [ 834 4845]]
________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
train time: 0.120s
test time:  0.067s
accuracy:   0.754
dimensionality: 500
density: 1.000000
classification report:
             precision    recall  f1-score   support

          R       0.78      0.63      0.70      4565
          D       0.74      0.85      0.79      5679

avg / total       0.76      0.75      0.75     10244

confusion matrix:
[[2877 1688]
 [ 833 4846]]
================================================================================
LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(steps=[('feature_selection', LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1',
     random_state=None, tol=0.001, verbose=0)), ('classification', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0))])
train time: 1.442s
test time:  0.043s
accuracy:   0.776
classification report:
             precision    recall  f1-score   support

          R       0.80      0.67      0.73      4565
          D       0.76      0.86      0.81      5679

avg / total       0.78      0.78      0.77     10244

confusion matrix:
[[3055 1510]
 [ 782 4897]]

In [ ]: