In [2]:
import re
from time import time
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
#Sklearn Imports
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score, auc
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('stopwords') #download the latest stopwords
Out[2]:
In [3]:
all_newsgroups= fetch_20newsgroups()
In [4]:
pprint(list(all_newsgroups.target_names))
In [5]:
cats = ['sci.med' , 'rec.motorcycles']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats, remove=('headers', 'footers', 'quotes'))
In [6]:
print("Categories to classify\n-----------------------")
print(list(newsgroups_train.target_names))
In [7]:
print("TRAIN DATA\n---------------")
print("Data Type:", type(newsgroups_train))
print("%d documents" % len(newsgroups_train.filenames))
print("%d categories" % len(newsgroups_train.target_names))
print("X shape :", newsgroups_train.filenames.shape)
print("Y shape :",newsgroups_train.target.shape)
print("Y head :", newsgroups_train.target[:10])
In [8]:
print("TEST DATA\n---------------")
print("Data Type:", type(newsgroups_test))
print("%d documents" % len(newsgroups_test.filenames))
print("%d categories" % len(newsgroups_test.target_names))
print("X shape :", newsgroups_test.filenames.shape)
print("Y shape :",newsgroups_test.target.shape)
print("Y head :", newsgroups_test.target[:10])
In [9]:
print(newsgroups_train.data[0])
In [10]:
print(newsgroups_test.data[0])
In [11]:
print(type(newsgroups_test.data))
print(type(newsgroups_test.data[0]))
In [12]:
train_labels = newsgroups_train.target #0, 1 array
#print(train_labels)
test_labels = newsgroups_test.target
#print(test_labels)
RE_PREPROCESS = r'\W+|\d+' # regular expression that matches runs of non-word characters or digits
#train_corpus = np.array( [re.sub(RE_PREPROCESS, ' ', text).lower() for text in df_train.jobDescription.values])
#test_corpus = np.array( [re.sub(RE_PREPROCESS, ' ', text).lower() for text in df_test.jobDescription.values])
labels = np.append(train_labels, test_labels)
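The commented-out lines above refer to a df_train / df_test that are never defined in this notebook. As a minimal sketch, the same regex cleaning could be applied to the newsgroups text itself (the train_corpus / test_corpus names here are illustrative and are not used by the cells that follow):
train_corpus = np.array([re.sub(RE_PREPROCESS, ' ', text).lower() for text in newsgroups_train.data])
test_corpus = np.array([re.sub(RE_PREPROCESS, ' ', text).lower() for text in newsgroups_test.data])
print(train_corpus[0][:200])  # first 200 characters of the cleaned first training document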
In [13]:
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_train.shape
Out[13]:
In [14]:
vectors_train.nnz / float(vectors_train.shape[0])
Out[14]:
In [15]:
vectors_test = vectorizer.transform(newsgroups_test.data)
There are 18,000+ TF-IDF features for each document, and on average only about 87 of them are non-zero, so the document-term matrix is highly sparse.
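The figures in that statement can be recomputed directly from the sparse matrix; a quick sketch (exact counts may vary slightly between scikit-learn versions):
n_docs, n_features = vectors_train.shape
density = vectors_train.nnz / float(n_docs * n_features)
print("%d documents x %d features, density %.4f (about %.0f non-zeros per document)"
      % (n_docs, n_features, density, vectors_train.nnz / float(n_docs)))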
In [16]:
clf = MultinomialNB(alpha=.01)
clf.fit(vectors_train, newsgroups_train.target)
Out[16]:
In [17]:
y_true = newsgroups_test.target
y_pred = clf.predict(vectors_test)
In [18]:
metrics.f1_score(y_true, y_pred, average='macro')
Out[18]:
Interpretation: A macro-averaged F1 score of about 0.94 is high; the classifier separates the two categories well.
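For a per-class breakdown rather than a single macro-averaged number, a classification report can also be printed; a small sketch using the metrics module imported above:
print(metrics.classification_report(y_true, y_pred, target_names=newsgroups_test.target_names))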
In [19]:
cm = confusion_matrix(y_true, y_pred)
In [20]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Pretty print for confusion matrices."""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print(" " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print(" %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()
print_cm(cm, labels=['Motorcycles', 'Medical'])
In [21]:
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
Out[21]:
Interpretation: The cross-tabulation is the same confusion matrix with row and column margins (totals per true and predicted class); with an F1 score around 0.94, most counts sit on the diagonal.
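To read the table as rates rather than raw counts, the confusion matrix can be row-normalized so that the diagonal shows per-class recall; a small sketch using the same category labels as print_cm above:
cm_normalized = cm / cm.sum(axis=1, keepdims=True)  # each row sums to 1
print(pd.DataFrame(cm_normalized, index=['Motorcycles', 'Medical'], columns=['Motorcycles', 'Medical']).round(3))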
In [22]:
def plot_precision_recall(y_true, y_score):
    """
    Plot a precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        ground truth labels
    y_score : array-like
        class-probability output from the model
    """
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score[:, 1])
    plt.plot(recall_curve, precision_curve)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    auc_val = auc(recall_curve, precision_curve)
    print('AUC-PR: {0:.2f}'.format(auc_val))
    plt.show()
    plt.clf()
In [23]:
y_score = clf.predict_proba(vectors_test)
plot_precision_recall(y_true, y_score)
Interpretation: The area under the precision-recall curve is 0.98, just shy of the ideal 1.0. The trained classifier generalizes well to the test set.
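roc_auc_score is imported above but never used; as a complementary check, the ROC AUC can be computed from the same predicted probabilities (a short sketch; the exact value depends on the fitted model):
print("ROC AUC: %.3f" % roc_auc_score(y_true, y_score[:, 1]))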
In [24]:
#Params - NOT tuned
ANALYZER = "word"  # features are built from words rather than character n-grams
STRIP_ACCENTS = 'unicode'
TOKENIZER = None
MAX_DF = 1.0  # exclude words with a document frequency above this threshold (1.0 keeps everything)
STOP_WORDS = (stopwords.words('english'), None)
#Params - TUNED
NGRAM_RANGE = ((1, 1), (1, 2))  # unigrams only vs. unigrams plus bigrams
MIN_DF = (1, 0.01)  # exclude words with a document frequency below this threshold
ALPHA = (0.01, 0.1, 1)
In [25]:
pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
'tfidf__ngram_range':NGRAM_RANGE,
'tfidf__min_df':MIN_DF,
'clf__alpha': ALPHA,
}
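The comment about combinatorial growth can be made concrete: the total number of model fits is the product of the grid sizes times the number of cross-validation folds. A small sketch, assuming the 5-fold default of recent scikit-learn versions (older releases defaulted to 3):
n_candidates = int(np.prod([len(v) for v in parameters.values()]))
n_folds = 5  # assumption: default cv for GridSearchCV in recent scikit-learn
print("%d parameter combinations x %d folds = %d fits" % (n_candidates, n_folds, n_candidates * n_folds))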
In [26]:
def optimize_pipeline(pipeline):
    # multiprocessing requires the fork to happen in a __main__ protected block
    # find the best parameters for both the feature extraction and the classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=True)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(newsgroups_train.data, newsgroups_train.target)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [27]:
optimize_pipeline(pipeline)
Credits: