In [1]:
# Naive Bayes classification testing
%pylab inline
# SciKit Learn's newsgroup dataset
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
print(type(news.data), type(news.target), type(news.target_names))
print(news.target_names)
print(len(news.data))
print(len(news.target))
# Check first dataset entry
print(news.data[0])
print(news.target[0], news.target_names[news.target[0]])
In [2]:
# Hold out the last 25% of the corpus for testing. Plain slicing keeps the
# dataset's original ordering (no shuffling happens here).
SPLIT_PERC = 0.75
split_size = int(len(news.data) * SPLIT_PERC)

X_train, X_test = news.data[:split_size], news.data[split_size:]
y_train, y_test = news.target[:split_size], news.target[split_size:]

print(len(X_train))
print(len(y_train))
In [3]:
# Naive Bayes training
import numpy as np

# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20; the
# same utilities now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem


def evaluate_cross_validation(clf, X, y, K):
    """Run K-fold cross-validation on clf and print per-fold and mean scores.

    clf -- estimator implementing fit/score (e.g. a Pipeline)
    X, y -- the full dataset; KFold performs the train/test partitioning
    K -- number of folds
    """
    # Modern KFold no longer takes the dataset length; it is index-based and
    # cross_val_score pairs it with (X, y) itself. shuffle + fixed
    # random_state keeps folds reproducible across runs.
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print("Mean score {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

# Three identical NB classifiers that differ only in how text is vectorized.
clf_1 = Pipeline([
    ('vect', CountVectorizer()),   # raw term counts
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    # HashingVectorizer's non_negative=True parameter was removed;
    # alternate_sign=False is the modern equivalent (MultinomialNB
    # requires non-negative feature values).
    ('vect', HashingVectorizer(alternate_sign=False)),
    ('clf', MultinomialNB()),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),   # tf-idf weighted counts
    ('clf', MultinomialNB()),
])

clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, 5)
In [4]:
# New classifier with a custom token regex: only keeps tokens of 3+ chars
# that contain a letter, allowing internal dots/dashes/underscores.
# NOTE: the ur"" prefix is Python 2 only (SyntaxError in Python 3); a plain
# raw string carries the exact same pattern.
clf_4 = Pipeline([
    ('vect', TfidfVectorizer(
        token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB()),
])
evaluate_cross_validation(clf_4, news.data, news.target, 5)
In [5]:
# Let's use stop_words
def get_stop_words():
    """Return the set of stop words read from stopwords_en.txt.

    One word per line; blank lines are skipped so the empty string never
    ends up in the stop-word set (the original readlines() version would
    add '' for any blank line in the file).
    """
    # Iterate the file lazily instead of materializing it with readlines().
    with open('stopwords_en.txt', 'r') as in_file:
        return {line.strip() for line in in_file if line.strip()}

print(get_stop_words())
In [6]:
# Try another classifier with get_stop_words now
# NOTE(review): this pattern's final character class drops the '-' that
# clf_4's pattern allowed — presumably intentional, but worth confirming.
clf_5 = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words=get_stop_words(),
        # ur"" is Python 2 only; r"" keeps the identical pattern bytes.
        token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9\_\.]+\b",
    )),
    ('clf', MultinomialNB()),
])
evaluate_cross_validation(clf_5, news.data, news.target, 5)
In [7]:
# Testing with a different alpha parameter for MultinomialNB.
# Lower alpha = less Laplace/Lidstone smoothing (sklearn default is 1.0).
clf_7 = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words=get_stop_words(),
        # ur"" is Python 2 only; r"" keeps the identical pattern bytes.
        token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9\_\.]+\b",
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(clf_7, news.data, news.target, 5)
In [9]:
# Let's evaluate the performance
from sklearn import metrics


def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    """Fit clf on the training split, then report accuracy on both splits
    plus a per-class classification report and confusion matrix on test."""
    clf.fit(X_train, y_train)

    print("Accuracy on training set:")
    print(clf.score(X_train, y_train))
    print("Accuracy on testing set:")
    print(clf.score(X_test, y_test))

    y_pred = clf.predict(X_test)
    print("Classification Report:")
    print(metrics.classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))


train_and_evaluate(clf_7, X_train, X_test, y_train, y_test)
In [10]:
# Look inside the fitted vectorizer to see how many tokens made it into the
# vocabulary. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is the drop-in replacement.
print(len(clf_7.named_steps['vect'].get_feature_names_out()))