In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
%matplotlib inline
We use the SMS Spam Collection Data Set from UCI, which contains 5574 SMS messages; 747 of them are spam and the rest are ham.
In [2]:
# Show SMS bodies in full instead of letting pandas truncate long strings.
pd.set_option("display.max_colwidth", 999)
# Tab-separated file; later cells read its `text` and `category` columns.
# NOTE(review): the default CSV quoting may merge messages that contain
# unbalanced double quotes -- confirm the row count matches the expected 5574.
data = pd.read_csv('./asset/SMSSpamCollection.txt', sep='\t')
data.head()
Out[2]:
We use the count of each token (word) in an SMS as its features.
First, we convert each SMS into features.
In [3]:
def get_freq_of_tokens(sms):
    """Count how often each space-delimited token occurs in ``sms``.

    The text is split on single space characters only, so consecutive
    spaces produce empty-string tokens that are counted like any other.

    Args:
        sms: The message text (a string).

    Returns:
        A dict mapping each token to its number of occurrences.
    """
    counts = {}
    for word in sms.split(' '):
        counts[word] = counts.get(word, 0) + 1
    return counts
# Sanity check: show the token counts computed for the first message.
print(get_freq_of_tokens(data['text'].iloc[0]))
In [4]:
# Pair every message's token-count dict with its label.  Iterating the two
# columns directly avoids building a row Series twice per message, which is
# what the positional ``data.iloc[index]`` lookups did.
features_and_labels = [
    (get_freq_of_tokens(text), category)
    for text, category in zip(data['text'], data['category'])
]
We then transform the features into a sparse matrix.
In [5]:
# Split the (token-count dict, label) pairs into two parallel tuples.
X, y = zip(*features_and_labels)

# Turn the per-message token-count dicts into one sparse float matrix;
# vectorizer.feature_names_ holds the token for each column index.
vectorizer = DictVectorizer(sparse=True, dtype=float)
X = vectorizer.fit_transform(X)

# Encode the category labels as integers.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

X
Out[5]:
Use the first SMS (i.e., X[0]) to show how to get the feature names and values.
In [6]:
# Densify the first row so its columns can be walked by index.
arr = X[0].toarray()
for i, value in enumerate(arr[0]):
    # Only report the tokens that actually occur in this message.
    if value > 0:
        print('{}:{}'.format(vectorizer.feature_names_[i], value))
`alpha` is the additive (Laplace/Lidstone) smoothing parameter. Its default value is 1.0 -- we will see later that this works pretty well for our problem.
In [7]:
# Fit on the full data set and report the score on that same data.
nb = MultinomialNB(alpha=1).fit(X, y)
nb.score(X, y)
Out[7]:
Let's use cross-validation to explore how `alpha` affects the performance of the classifier.
In [8]:
# Ten shuffled folds over the rows of X; the fixed seed keeps the split reproducible.
# NOTE(review): this is the KFold(n, n_folds) signature of sklearn.cross_validation,
# a module that was removed in later scikit-learn releases -- confirm the target version.
n_folds = 10
kf = KFold(n_folds=n_folds, n=X.shape[0], random_state=42, shuffle=True)
In [9]:
def test_Multinomial_NB(train_X, train_y, test_X, test_y, alpha=1, debug = False):
nb = MultinomialNB(alpha=alpha)
nb.fit(train_X, train_y)
train_error = nb.score(train_X, train_y)
test_error = nb.score(test_X, test_y)
if debug:
print('training error:\t{}'.format(train_error))
print('testing error:\t{}'.format(test_error))
return train_error, test_error
In [10]:
def cv_MultinomialNB(alpha=1.0):
    """Cross-validate ``test_Multinomial_NB`` over the folds in ``kf``.

    Reads the module-level ``X``, ``y`` and ``kf``.  The averages are taken
    over the folds actually iterated rather than the separate ``n_folds``
    global, so the result stays correct if ``kf`` is rebuilt with a
    different fold count.

    Args:
        alpha: Smoothing parameter forwarded to ``test_Multinomial_NB``.

    Returns:
        A tuple with the fold-averaged first and second values returned by
        ``test_Multinomial_NB`` (its training and testing results).

    Raises:
        ValueError: If ``kf`` yields no folds.
    """
    train_error_total = 0
    test_error_total = 0
    folds_seen = 0
    for train, test in kf:
        train_error, test_error = test_Multinomial_NB(
            X[train], y[train], X[test], y[test], alpha)
        train_error_total += train_error
        test_error_total += test_error
        folds_seen += 1
    if folds_seen == 0:
        raise ValueError('kf produced no folds to average over')
    return train_error_total / folds_seen, test_error_total / folds_seen
In [11]:
def cv_plot_MultinomialNB():
    """Plot the cross-validated results of ``cv_MultinomialNB`` against alpha.

    Evaluates alpha = 0, 0.5, 1, 2, ..., 9 and draws the two values returned
    by ``cv_MultinomialNB`` as a 'training' and a 'testing' curve on one
    figure, with the y-range padded by 0.01 around the extremes of both.
    """
    rng = sorted([0.5] + list(range(0, 10)))

    # One row per alpha: [alpha, training value, testing value].
    cv_res = []
    for alpha in rng:
        train_error, test_error = cv_MultinomialNB(alpha)
        cv_res.append([alpha, train_error, test_error])
    cv_res_arr = np.array(cv_res)
    alphas, curves = cv_res_arr[:, 0], cv_res_arr[:, 1:]

    plt.figure(figsize=(16, 9))
    plt.title('Error vs. alpha')
    plt.xlabel('alpha')
    plot_train, = plt.plot(alphas, curves[:, 0], label='training')
    plot_test, = plt.plot(alphas, curves[:, 1], label='testing')
    plt.legend(handles=[plot_train, plot_test])
    # Vectorized min/max over both curves instead of nested builtin min()/max().
    plt.ylim((curves.min() - 0.01, curves.max() + 0.01))
    plt.xticks(rng)
In [12]:
# Run the alpha sweep for the multinomial model and draw its plot.
cv_plot_MultinomialNB()
Note that since we have the count matrix for words in the vocabulary, we can binarize it to be used in a Bernoulli NB classifier.
In [13]:
# binarize=0.0 maps every count above zero to 1 before fitting.
# Fit on the full data set and report the score on that same data.
nb = BernoulliNB(binarize=0.0, alpha=1).fit(X, y)
nb.score(X, y)
Out[13]:
In [14]:
def test_Bernoulli_NB(train_X, train_y, test_X, test_y, alpha=1, debug = False):
nb = BernoulliNB(alpha=alpha)
nb.fit(train_X, train_y)
train_error = nb.score(train_X, train_y)
test_error = nb.score(test_X, test_y)
if debug:
print('training error:\t{}'.format(train_error))
print('testing error:\t{}'.format(test_error))
return train_error, test_error
In [15]:
def cv_BernoulliNB(alpha=1.0):
    """Cross-validate ``test_Bernoulli_NB`` over the folds in ``kf``.

    Reads the module-level ``X``, ``y`` and ``kf``.  The averages are taken
    over the folds actually iterated rather than the separate ``n_folds``
    global, so the result stays correct if ``kf`` is rebuilt with a
    different fold count.

    Args:
        alpha: Smoothing parameter forwarded to ``test_Bernoulli_NB``.

    Returns:
        A tuple with the fold-averaged first and second values returned by
        ``test_Bernoulli_NB`` (its training and testing results).

    Raises:
        ValueError: If ``kf`` yields no folds.
    """
    train_error_total = 0
    test_error_total = 0
    folds_seen = 0
    for train, test in kf:
        train_error, test_error = test_Bernoulli_NB(
            X[train], y[train], X[test], y[test], alpha)
        train_error_total += train_error
        test_error_total += test_error
        folds_seen += 1
    if folds_seen == 0:
        raise ValueError('kf produced no folds to average over')
    return train_error_total / folds_seen, test_error_total / folds_seen
In [16]:
def cv_plot_BernoulliNB():
    """Plot the cross-validated results of ``cv_BernoulliNB`` against alpha.

    Evaluates alpha = 0, 0.5, 1, 2, ..., 9 and draws the two values returned
    by ``cv_BernoulliNB`` as a 'training' and a 'testing' curve on one
    figure, with the y-range padded by 0.01 around the extremes of both.
    """
    rng = sorted([0.5] + list(range(0, 10)))

    # One row per alpha: [alpha, training value, testing value].
    cv_res = []
    for alpha in rng:
        train_error, test_error = cv_BernoulliNB(alpha)
        cv_res.append([alpha, train_error, test_error])
    cv_res_arr = np.array(cv_res)
    alphas, curves = cv_res_arr[:, 0], cv_res_arr[:, 1:]

    plt.figure(figsize=(16, 9))
    plt.title('Error vs. alpha')
    plt.xlabel('alpha')
    plot_train, = plt.plot(alphas, curves[:, 0], label='training')
    plot_test, = plt.plot(alphas, curves[:, 1], label='testing')
    plt.legend(handles=[plot_train, plot_test])
    # Vectorized min/max over both curves instead of nested builtin min()/max().
    plt.ylim((curves.min() - 0.01, curves.max() + 0.01))
    plt.xticks(rng)
In [17]:
# Run the alpha sweep for the Bernoulli model and draw its plot.
cv_plot_BernoulliNB()
In [ ]: