This analysis attempts to identify spam messages in a corpus of 5,574 SMS text messages. Each message is labeled as either spam or ham (a legitimate message); 4,827 are ham and 747 are spam. We use scikit-learn's Multinomial Naive Bayes model to classify messages as spam or ham.
We will look at various options for tuning the model to see if we can get to zero false positives, i.e., no legitimate messages labeled as spam. Letting a small percentage of spam messages through the spam filter is considered preferable to excluding legitimate messages.
Sources:
https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
https://radimrehurek.com/data_science_python/
http://adataanalyst.com/scikit-learn/countvectorizer-sklearn-example/
In [479]:
%matplotlib inline
import csv
import json
import os
import pickle
import time
from io import BytesIO
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
sns.set(font_scale=1.5)
In [517]:
# Remote location of the zipped corpus and the local directory that the
# archive members are extracted into.
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
SMS_PATH = os.path.join('datasets', 'sms')

# Download the archive into memory. Despite its name, `file_name` holds the
# HTTP response object. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() turns an HTTP error response into an
# immediate exception instead of an obscure zip-parsing failure below.
file_name = requests.get(URL, timeout=60)
file_name.raise_for_status()

# Open the downloaded bytes as a zip archive and list its member names.
zipfile = ZipFile(BytesIO(file_name.content))
zip_names = zipfile.namelist()
def fetch_data(file='SMSSPamCollection'):
    """Extract the downloaded archive and return the path of one member.

    Every member listed in ``zip_names`` is written into ``SMS_PATH``
    (the directory is created if it does not exist yet).

    Args:
        file: Name of the archive member whose extracted path is wanted.
            The comparison is case-insensitive, so a name that differs from
            the archive member only in letter case still matches.

    Returns:
        The path of the extracted member that matches ``file``.

    Raises:
        FileNotFoundError: If no archive member matches ``file``.
    """
    os.makedirs(SMS_PATH, exist_ok=True)

    wanted = file.lower()
    matched_path = None
    # Use a dedicated loop variable: reusing ``file`` here would overwrite
    # the caller's argument and make the returned path depend on the order
    # of the archive members.
    for member in zip_names:
        outpath = os.path.join(SMS_PATH, member)
        with open(outpath, 'wb') as f:
            f.write(zipfile.read(member))
        if member.lower() == wanted:
            matched_path = outpath

    if matched_path is None:
        raise FileNotFoundError(
            '%r not found in archive; members are: %r' % (file, zip_names))
    return matched_path
# Extract the archive and keep the path returned by fetch_data() for loading.
DATA = fetch_data()
In [518]:
# Load the tab-separated corpus; the file has no header row.
# quoting=csv.QUOTE_NONE makes pandas treat double quotes inside a message as
# ordinary characters. With the default quoting, an unbalanced quote makes the
# parser merge the following lines into a single field, silently dropping rows.
df = pd.read_csv(DATA, sep='\t', header=None, quoting=csv.QUOTE_NONE)
In [519]:
# Name the two columns: the class label and the message text.
df.columns = ['Label', 'Text']
In [520]:
# Allow up to 220 characters per column when displaying frames so that the
# message texts are shown at greater length, then preview the first 20 rows.
pd.options.display.max_colwidth = 220
df.head(20)
Out[520]:
In [521]:
# Summary statistics of the DataFrame columns.
df.describe()
Out[521]:
In [522]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()
In [523]:
# Store the character count of every message in a new 'Length' column.
df['Length'] = df['Text'].map(len)
In [524]:
# Preview the first rows, which now include the 'Length' column.
df.head()
Out[524]:
In [525]:
# Summary statistics computed separately for each label.
df.groupby('Label').describe()
Out[525]:
In [526]:
# Histogram of message lengths over the whole corpus, using 100 bins.
df['Length'].plot.hist(bins=100)
Out[526]:
In [527]:
# One histogram of message length per label, to compare the two classes.
df.hist(column='Length', by='Label', bins=50, figsize=(10,4))
Out[527]:
In [528]:
# The raw message texts are what gets vectorized later; show how many there are.
text_data = df['Text']
text_data.shape
Out[528]:
In [529]:
# Give our target labels numbers: ham -> 0, spam -> 1.
# Any label value missing from the mapping would become NaN.
df['Label_'] = df['Label'].map({'ham': 0, 'spam': 1})
In [530]:
#stop_words = text.ENGLISH_STOP_WORDS
#Adding stop words did not significantly improve the model.
In [531]:
#textWithoutNums = text_data.replace('\d+', 'NUM_', regex=True)
#Removing all of the numbers in the messages and replacing with a text string did not improve the model either.
In [532]:
# Build a word-level bag-of-words vocabulary from the message texts.
# NOTE(review): the vocabulary is learned from every message, including those
# that later end up in the test split -- confirm this is acceptable.
vectorizer = CountVectorizer(analyzer='word') #, stop_words=stop_words)
#vectorizer.fit(textWithoutNums)
vectorizer.fit(text_data)
Out[532]:
In [533]:
# List the vocabulary terms learned by the vectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
vectorizer.get_feature_names()
Out[533]:
In [534]:
# vocabulary_ maps each term to its column index; show the terms with the
# highest indices.
pd.DataFrame.from_dict(vectorizer.vocabulary_, orient='index').sort_values(by=0, ascending=False).head()
Out[534]:
In [535]:
# Turn the messages into a document-term count matrix, then expand it into a
# dense DataFrame with one column per vocabulary term.
dtm = vectorizer.transform(text_data)
features = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names())
features.shape
Out[535]:
In [536]:
# Preview the first rows of the term-count features.
features.head()
Out[536]:
In [537]:
# Target vector (numeric labels) and feature matrix (term counts) for the model.
y = np.array(df['Label_'])
X = features
In [538]:
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module has been removed from scikit-learn, and the
# grid-search cell further down already uses model_selection.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Hold out 20% of the messages for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape)
In [539]:
# Fit a Multinomial Naive Bayes classifier (smoothing alpha=1.0, class priors
# learned from the data) on the training split.
model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model.fit(X_train, y_train)
Out[539]:
In [540]:
# Predict a class label for every message in the test split.
y_pred_class = model.predict(X_test)
In [541]:
# Report per-class precision/recall/F1 and the overall accuracy on the test split.
print(metrics.classification_report(y_test, y_pred_class))
print('Accuracy Score: ', metrics.accuracy_score(y_test, y_pred_class))
In [542]:
from yellowbrick.classifier import ClassificationReport
# Visualize the classification report for a fresh MultinomialNB with alpha=1.0:
# fit on the training split, score on the test split, then draw the figure.
bayes = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
visualizer = ClassificationReport(bayes, classes=['ham', 'spam'])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# NOTE(review): newer yellowbrick releases name this method show() -- confirm
# against the installed version.
g = visualizer.poof()
In [543]:
from sklearn.metrics import confusion_matrix

# Count the test-set predictions for each (actual, predicted) class pair.
cm = confusion_matrix(y_test, y_pred_class)

# Draw the counts as an annotated heatmap with readable class names.
sns.set(font_scale=1.5)
class_names = ['Ham', 'Spam']
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', cbar=False, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
plt.show()
In [544]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Split the dataset in two equal parts: a development half used for the
# cross-validated grid search and an evaluation half held out for scoring.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    X, y, test_size=0.5, random_state=1)

# Set the parameters by cross-validation
tuned_parameters = [{'alpha': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0], 'class_prior':[None], 'fit_prior': [True, False]}]

# Run one grid search per metric to optimize.
scores = ['precision', 'recall']

for score in scores:
    print("### Tuning hyper-parameters for %s ###" % score)
    print()

    # 5-fold cross-validated search over the grid, scored with the
    # macro-averaged version of the current metric.
    clf = GridSearchCV(MultinomialNB(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train_, y_train_)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Evaluate on the held-out half of THIS cell's split (X_test_ / y_test_).
    # Using X_test / y_test from the earlier 80/20 split would score the model
    # on messages that are also part of the development half it was tuned on.
    y_true, y_pred = y_test_, clf.predict(X_test_)
    print(classification_report(y_true, y_pred))
    print()
    print('Accuracy Score: ', metrics.accuracy_score(y_true, y_pred))
    print()
In [545]:
from yellowbrick.classifier import ClassificationReport
# Visualize the classification report again, this time for a MultinomialNB
# with alpha=3.0: fit on the training split, score on the test split, draw.
bayes = MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)
visualizer = ClassificationReport(bayes, classes=['ham', 'spam'])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# NOTE(review): newer yellowbrick releases name this method show() -- confirm
# against the installed version.
g = visualizer.poof()
In [546]:
# Recreate the 80/20 train/test split with a fixed random_state and show the
# training-set shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape)
In [547]:
# Refit the Multinomial Naive Bayes classifier with smoothing alpha=3.0 on the
# training split.
model = MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)
model.fit(X_train, y_train)
Out[547]:
In [548]:
# Predict a class label for every message in the test split with the refit model.
y_pred_class = model.predict(X_test)
In [549]:
from sklearn.metrics import confusion_matrix

# Count the test-set predictions for each (actual, predicted) class pair.
cm = confusion_matrix(y_test, y_pred_class)

# Draw the counts as an annotated heatmap with readable class names.
sns.set(font_scale=1.5)
class_names = ['Ham', 'Spam']
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', cbar=False, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
plt.show()
In [ ]: