In [126]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import itertools
import spacy
import nltk
%matplotlib inline
In [127]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
In [128]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
In [129]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
In [130]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
In [131]:
# dataframe display options
# NOTE: max_colwidth=None means "no truncation". The old -1 sentinel was
# deprecated in pandas 1.0 and raises a ValueError in pandas >= 2.0.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)
In [60]:
# Bag-of-words features feeding a multinomial logistic regression.
lr_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words=s_words)
lr_classifier = LogisticRegression(multi_class='multinomial', solver='newton-cg')
lr_pipe = Pipeline(steps=[('vect', lr_vectorizer), ('lr', lr_classifier)])
In [61]:
# Search space for the CountVectorizer + LogisticRegression pipeline.
param_grid = {
    'vect__binary': [True, False],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__min_df': list(range(1, 5)),
    'lr__C': [0.01, 0.1, 1, 10, 100],
    'lr__class_weight': ['balanced', None],
}
In [62]:
# Exhaustive 5-fold cross-validated search over the logistic-regression pipeline.
grid = GridSearchCV(estimator=lr_pipe,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=5,
                    verbose=True)
grid.fit(X_train.ravel(), y_train)
Out[62]:
In [68]:
# Mean CV accuracy of the best parameter combination.
print(f"Best cross-validation score: {grid.best_score_:.2f}")
In [69]:
# Hyper-parameter combination that produced the best CV score.
best_params = grid.best_params_
print(best_params)
In [70]:
# Configuration of the winning vectorizer.
print(f'Vectorization step:\n{grid.best_estimator_.named_steps["vect"]}')
In [71]:
# Configuration of the winning classifier.
print(f'Logistic regression step:\n{grid.best_estimator_.named_steps["lr"]}')
In [72]:
# Score the tuned logistic-regression model on the held-out test set.
pred = grid.predict(X_test.ravel())
print("Classification report on test set for classifier:")
print(classification_report(y_true=y_test, y_pred=pred))
In [73]:
# Hold-out accuracy (fraction of correctly classified samples).
accuracy_score(y_true=y_test, y_pred=pred)
Out[73]:
The accuracy on the hold-out set is 0.64 (i.e. 64%).
In [74]:
# Raw (unnormalized) confusion matrix on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print("Confusion matrix:")
print(cm)
In [142]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Render a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix, e.g. from sklearn.metrics.confusion_matrix.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, divide each row by its sum so cells show per-class
        proportions instead of raw counts.
    title : str, default 'Confusion matrix'
        Figure title.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap for the heatmap.
    """
    cm = np.asarray(cm)
    if normalize:
        # Bug fix: the normalize flag was previously accepted but ignored.
        # Row-normalize so each true class sums to 1.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    # Annotate every cell; switch text color at half the max for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f') if normalize else cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [76]:
# Compute confusion matrix
# Confusion matrix for the plotting helper; show floats with 2 decimals.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=pred)
np.set_printoptions(precision=2)
In [78]:
# Pull the fitted steps out of the winning logistic-regression pipeline.
best_lr_pipe = grid.best_estimator_
v = best_lr_pipe.named_steps["vect"]
c = best_lr_pipe.named_steps["lr"]
cls = c.classes_
In [79]:
# Visualize raw counts of the logistic-regression confusion matrix.
plt.figure()
plot_confusion_matrix(cm=cnf_matrix, classes=cls,
                      title='Confusion matrix, without normalization')
It is worth mentioning that the definition of the category "private party" overlaps with "company event", which confuses the model and directly affects accuracy.
In [80]:
def print_topk(vectorizer, clf, class_labels, n):
    """Print the n features with the highest coefficient values, per class.

    Parameters
    ----------
    vectorizer : fitted vectorizer (e.g. CountVectorizer)
        Source of the feature-name vocabulary.
    clf : fitted linear classifier
        Must expose a ``coef_`` matrix with one row per class label.
    class_labels : sequence
        Class names aligned with the rows of ``clf.coef_``.
    n : int
        Number of top features to print for each class.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() with a fallback for old versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        # Indices of the n largest coefficients (ascending order).
        top = np.argsort(clf.coef_[i])[-n:]
        print("[{}] - {}".format(class_label,
                                 ", ".join(feature_names[j] for j in top)))
In [81]:
# Five most influential features per class for the logistic regression.
print_topk(vectorizer=v, clf=c, class_labels=cls, n=5)
I used this information to iteratively clean up the input vectors.
In [143]:
# Bag-of-words -> tf-idf -> multinomial Naive Bayes.
mnb_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words=s_words)
mnb_pipe = Pipeline(steps=[('vect', mnb_vectorizer),
                           ('tfidf', TfidfTransformer()),
                           ('mnb', MultinomialNB())])
In [144]:
# Search space for the CountVectorizer + TfidfTransformer + MultinomialNB pipeline.
mnb_param_grid = {
    'vect__binary': [True, False],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__min_df': list(range(1, 5)),
    'tfidf__use_idf': [True, False],
    'mnb__alpha': [0.01, 0.1, 0.5, 0.7, 1],
    'mnb__fit_prior': [True, False],
}
In [145]:
# Exhaustive 5-fold cross-validated search over the Naive Bayes pipeline.
mnb_grid = GridSearchCV(estimator=mnb_pipe,
                        param_grid=mnb_param_grid,
                        scoring='accuracy',
                        cv=5,
                        verbose=True)
mnb_grid.fit(X_train.ravel(), y_train)
Out[145]:
In [147]:
# Mean CV accuracy of the best Naive Bayes configuration.
print(f"Best cross-validation score: {mnb_grid.best_score_:.2f}")
In [148]:
# Hyper-parameter combination that produced the best CV score.
mnb_best_params = mnb_grid.best_params_
print(mnb_best_params)
In [149]:
# Configuration of the winning Naive Bayes estimator.
print(f'Multinomial Naive Bayes step:\n{mnb_grid.best_estimator_.named_steps["mnb"]}')
In [150]:
# Score the tuned Naive Bayes model on the held-out test set.
mnb_pred = mnb_grid.predict(X_test.ravel())
print("Classification report on test set for classifier:")
print(classification_report(y_true=y_test, y_pred=mnb_pred))
In [151]:
# Hold-out accuracy of the Naive Bayes model.
accuracy_score(y_true=y_test, y_pred=mnb_pred)
Out[151]:
In [152]:
# Compute confusion matrix
# Naive Bayes confusion matrix; show floats with 2 decimals.
mnb_cm = confusion_matrix(y_true=y_test, y_pred=mnb_pred)
np.set_printoptions(precision=2)
In [153]:
# Visualize raw counts of the Naive Bayes confusion matrix.
plt.figure()
plot_confusion_matrix(cm=mnb_cm, classes=cls,
                      title='Confusion matrix, without normalization')
In [155]:
# Pull the fitted steps out of the winning Naive Bayes pipeline.
best_mnb_pipe = mnb_grid.best_estimator_
v_mnb = best_mnb_pipe.named_steps["vect"]
c_mnb = best_mnb_pipe.named_steps["mnb"]
cls_mnb = c_mnb.classes_
In [156]:
# Five most influential features per class for Naive Bayes.
print_topk(vectorizer=v_mnb, clf=c_mnb, class_labels=cls_mnb, n=5)
There are common elements across the classes. This is a good illustration of why words such as "photo" or "shot" should be added to the stop-word list.
In [159]:
# Bag-of-words -> tf-idf -> support vector classifier.
svm_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words=s_words)
svm_pipe = Pipeline(steps=[('vect', svm_vectorizer),
                           ('tfidf', TfidfTransformer()),
                           ('svm', SVC(kernel='linear'))])
In [160]:
# Search space for the CountVectorizer + TfidfTransformer + SVC pipeline.
svm_param_grid = {
    'vect__binary': [True, False],
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__min_df': list(range(1, 5)),
    'tfidf__use_idf': [True, False],
    'svm__C': [0.7, 1, 10],
    'svm__kernel': ['rbf', 'linear'],
    'svm__class_weight': ['balanced', None],
}
In [161]:
# Exhaustive 5-fold cross-validated search over the SVM pipeline.
svm_grid = GridSearchCV(estimator=svm_pipe,
                        param_grid=svm_param_grid,
                        scoring='accuracy',
                        cv=5,
                        verbose=True)
svm_grid.fit(X_train.ravel(), y_train)
Out[161]:
In [162]:
# Mean CV accuracy of the best SVM configuration.
print(f"Best cross-validation score: {svm_grid.best_score_:.2f}")
In [163]:
# Hyper-parameter combination that produced the best CV score.
svm_best_params = svm_grid.best_params_
print(svm_best_params)
In [164]:
# Configuration of the winning SVM estimator.
print(f'Support Vector Machine step:\n{svm_grid.best_estimator_.named_steps["svm"]}')
In [166]:
# Score the tuned SVM on the held-out test set.
svm_pred = svm_grid.predict(X_test.ravel())
accuracy_score(y_true=y_test, y_pred=svm_pred)
Out[166]:
In [167]:
# Per-class precision/recall/F1 for the SVM predictions.
print("Classification report on test set for classifier:")
print(classification_report(y_true=y_test, y_pred=svm_pred))
In [175]:
# Compute confusion matrix
svm_cm = confusion_matrix(y_test, svm_pred)
np.set_printoptions(precision=2)
In [176]:
# Plot non-normalized confusion matrix
# Bug fix: this cell previously plotted mnb_cm (the Naive Bayes matrix)
# instead of svm_cm computed in the cell directly above.
plt.figure()
plot_confusion_matrix(svm_cm, classes=cls,
                      title='Confusion matrix, without normalization')
In [201]:
# Pull the fitted steps out of the winning SVM pipeline.
best_svm_pipe = svm_grid.best_estimator_
v_svm = best_svm_pipe.named_steps["vect"]
c_svm = best_svm_pipe.named_steps["svm"]
cls_svm = c_svm.classes_
In [197]:
# Five most influential features per class for the linear SVM.
print_topk(vectorizer=v_svm, clf=c_svm, class_labels=cls_svm, n=5)
In [210]:
# Shape of the SVC coefficient matrix; coef_ is a sparse matrix here
# (hence toarray()), with one row per one-vs-one classifier.
c_svm.coef_.toarray().shape
Out[210]:
For multiclass problems, SVC trains one-vs-one classifiers, so `coef_` is a matrix with one row per pair of classes (n_classes * (n_classes - 1) / 2 rows). The layout of the coefficients in the multiclass case is somewhat non-trivial, as per the scikit-learn documentation.
- `dual_coef_`, by contrast, has shape [n_classes - 1, n_SV]: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html