In [82]:
from __future__ import division # ensure that all division is float division
from __future__ import print_function # print works as a function when called with parentheses
%matplotlib inline
import matplotlib.pyplot as plt
import os, sys, re, math
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option("display.max_colwidth", 255)
Read in SMS Data.
The SMS Spam Collection v.1 is a public set of labeled SMS messages collected for mobile phone spam research. It contains 5,574 real, non-encoded English messages, each tagged as either legitimate (ham) or spam.
A collection of 425 SMS spam messages was manually extracted from the Grumbletext Web site, a UK forum in which cell phone users make public claims about SMS spam, most without quoting the actual spam message received. Identifying the text of the spam messages within the claims is a hard and time-consuming task, and it involved carefully scanning hundreds of web pages. The Grumbletext Web site is: http://www.grumbletext.co.uk/.
A subset of 3,375 randomly chosen ham messages from the NUS SMS Corpus (NSC), a dataset of about 10,000 legitimate messages collected for research at the Department of Computer Science at the National University of Singapore. The messages largely originate from Singaporeans, mostly students attending the university, who volunteered them knowing that their contributions would be made publicly available. The NUS SMS Corpus is available at: http://www.comp.nus.edu.sg/~rpnlpir/downloads/corpora/smsCorpus/.
In [83]:
df = pd.read_csv("../data/sms.tsv", sep="\t", names=['label', 'message'])
print(df.shape)
df.head()
Out[83]:
Stratified sampling means the proportions of spam/ham in the train/test sets reflect those of the original dataset. You can see the percentages are about the same below.
In [84]:
df.shape
Out[84]:
In [58]:
from sklearn.model_selection import train_test_split # sklearn.cross_validation was removed in scikit-learn 0.20
train, test = train_test_split(df, test_size=0.05, stratify=df.label)
print(train.shape, test.shape)
train.label.value_counts()['ham'] / len(train), test.label.value_counts()['ham'] / len(test)
Out[58]:
Extract two sample messages that we will use to test the functions below.
In [85]:
sample_df = train.sample(2)
sample_row1 = sample_df.iloc[0] # first row of sample_df
sample_row2 = sample_df.iloc[1] # second row of sample_df
sample_message1 = sample_row1.message
sample_message2 = sample_row2.message
print(sample_row1.label, "|", sample_message1)
print(sample_row2.label, "|", sample_message2)
Use http://regex101.com to come up with regular expressions.
In [87]:
def tokenize(msg):
    """
    input: "Change again... It's e one next to escalator..."
    output: ["change", "again", "it's", "one", "next", "to", "escalator"]
    (order may vary, since duplicates are removed with a set)
    """
    msg_lowered = msg.lower()
    # keep tokens that are at least two characters long and do not start with a number
    all_tokens = re.findall(r"\b[a-z][a-z0-9']+\b", msg_lowered)
    return list(set(all_tokens))
tokens1 = tokenize(sample_message1)
tokens2 = tokenize(sample_message2)
print(sample_message1)
print(sample_message2)
print(tokens1)
print(tokens2)
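As a quick sanity check on those rules (the message below is made up for illustration), single-character tokens and tokens starting with a digit are dropped, while digits and apostrophes after the first letter are kept:
In [ ]:
# made-up message, not from the dataset: "i" is dropped (one character),
# "4u" is dropped (starts with a digit), "no1" and "it's" are kept
print(sorted(tokenize("I won a2 prizes & it's 4u at no1 site, txt WIN2 now")))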
Walk through the steps of vectorizing a message outside of a function.
In [88]:
token_dict1 = {} # this is a dictionary that looks like {word1: 1, word2: 1, word3: 1}
for token in tokens1:
    token_dict1[token] = 1
series1 = pd.Series(token_dict1) # convert the dictionary into a series where the row labels are words
# rewrite the same as above using a dict comprehension
series1 = pd.Series({token: 1 for token in tokens1})

token_dict2 = {} # this is a dictionary that looks like {word1: 1, word2: 1, word3: 1}
for token in tokens2:
    token_dict2[token] = 1
series2 = pd.Series(token_dict2) # convert the dictionary into a series where the row labels are words
# rewrite the same as above using a dict comprehension
series2 = pd.Series({token: 1 for token in tokens2})
print("Sample Message 1:", sample_message1)
print("Tokens 1:", tokens1)
print("Series 1:")
print(series1)
print()
print("Sample Message 2:", sample_message2)
print("Tokens 2:", tokens2)
print("Series 2:")
print(series2)
print()
print("Combine Series 1 and Series 2:")
df2 = pd.DataFrame([series1, series2]) # combine the two series; words missing from one message become NaN
df2.fillna(0, inplace=True)
df2
Out[88]:
Repeat the same process of tokenizing and then vectorizing, this time inside a function.
In [89]:
def vectorize_row(row):
    """
    input: a row of a data frame with a ".message" attribute
    output: vectorized row where the row labels are words and every value is 1
    """
    message = row.message
    tokens = tokenize(message)
    vectorized_row = pd.Series({token: 1 for token in tokens})
    return vectorized_row
In [90]:
vectorize_row(sample_row1)
Out[90]:
In [91]:
vectorize_row(sample_row2)
Out[91]:
This feature matrix is the input to our Naive Bayes model.
In [65]:
def get_feature_matrix(df):
    feature_matrix = df.apply(vectorize_row, axis=1) # one row per message, one column per word
    feature_matrix.fillna(0, inplace=True)           # words absent from a message get 0
    return feature_matrix
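As an aside: a dense DataFrame with one column per vocabulary word gets memory-hungry as the corpus grows. For reference only (the rest of this notebook keeps the dense pandas version), scikit-learn can build the equivalent binary matrix in sparse form; the sketch below assumes our tokenize function is a suitable tokenizer.
In [ ]:
# sparse alternative sketch (not used below): binary=True gives the same 0/1
# indicators as get_feature_matrix, but as a scipy sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(tokenizer=tokenize, token_pattern=None, binary=True)
X_sparse = vectorizer.fit_transform(train.message) # shape: (n_messages, n_vocabulary_words)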
In [92]:
get_feature_matrix(sample_df)
Out[92]:
In [93]:
feature_matrix = get_feature_matrix(train)
feature_matrix.shape
Out[93]:
In [94]:
feature_matrix.columns[:50]
Out[94]:
In [95]:
feature_matrix.columns[-50:]
Out[95]:
In [96]:
feature_matrix.head()
Out[96]:
The conditional probability of each word is estimated below, with additive (Laplace) smoothing so that unseen words never get probability zero.
In [99]:
def get_conditional_probability_for_word(col, k=0.5):
    # smoothed estimate: (# messages in the class containing the word + k) / (# messages in the class + 2k)
    return (col.sum() + k) / (len(col) + 2*k)
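To make the smoothing concrete, here is a worked example with made-up counts: a word seen in 40 of 700 spam messages gets (40 + 0.5) / (700 + 1) ≈ 0.0578, and a word seen in 0 of 700 gets 0.5 / 701 ≈ 0.0007 instead of an exact zero, which would otherwise wipe out the whole product in Naive Bayes.
In [ ]:
# made-up counts: 40 of 700 messages in a class contain the word
print(get_conditional_probability_for_word(pd.Series([1]*40 + [0]*660))) # (40 + 0.5) / 701 ≈ 0.0578
print(get_conditional_probability_for_word(pd.Series([0]*700)))          # 0.5 / 701 ≈ 0.0007, not 0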
In [100]:
def get_feature_prob(feature_matrix):
    # the feature matrix was built from `train`, so the masks must come from the
    # training labels (using df.label here would include test rows and misalign)
    spam_boolean_mask = (train.label == "spam")
    ham_boolean_mask = (train.label == "ham")
    # Explanation for "confusing" syntax:
    # http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
    feature_matrix_spam = feature_matrix.loc[spam_boolean_mask, :] # get all rows for spam boolean mask
    feature_matrix_ham = feature_matrix.loc[ham_boolean_mask, :]   # get all rows for ham boolean mask
    # mymatrix.loc[:, col] gets one column
    # mymatrix.loc[row, :] gets one row
    # mymatrix.loc[boolean_mask, :] gets the rows where boolean_mask is True
    feature_prob_spam = feature_matrix_spam.apply(get_conditional_probability_for_word, axis=0)
    feature_prob_ham = feature_matrix_ham.apply(get_conditional_probability_for_word, axis=0)
    feature_prob = pd.concat([feature_prob_spam, feature_prob_ham], axis=1)
    feature_prob.columns = ['spam', 'ham']
    return feature_prob
In [101]:
feature_prob = get_feature_prob(feature_matrix)
feature_prob.shape
Out[101]:
In [102]:
feature_prob.head()
Out[102]:
Words with the largest conditional probability given spam:
P(w_i | y = "spam")
In [75]:
feature_prob.sort_values(by='spam', ascending=False).head(10)
Out[75]:
Words with the smallest conditional probability given ham:
P(w_i | y = "ham")
In [76]:
feature_prob.sort_values(by='ham', ascending=True).head(10)
Out[76]:
Key Takeaway: These models are trained looking at only one class at a time, so the largest conditional probabilities may end up being common stop words. However, this happens in both classes, so the stop words "cancel out": they don't push the prediction one way or the other. Instead, looking at the least probable words of the opposite class (here, the words least likely to appear in "ham") surfaces highly predictive spam words.
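To see the cancellation numerically (with made-up probabilities): a stop word with P(w | spam) ≈ P(w | ham) ≈ 0.3 contributes log(0.3) to both running log-probabilities in the classifier below, shifting the spam/ham log-odds by zero; a word with P(w | spam) = 0.1 and P(w | ham) = 0.001 shifts the log-odds by log(0.1 / 0.001) = log(100) ≈ 4.6 toward spam.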
In [77]:
df[df.message.str.contains("a21", case=False)]
Out[77]:
In [78]:
df[df.message.str.contains("landmark", case=False)]
Out[78]:
In [79]:
df[df.message.str.contains("landlines", case=False)]
Out[79]:
In [104]:
test.iloc[0]
Out[104]:
In [80]:
def get_spam_prob(row):
    new_msg = row.message
    tokens = set(tokenize(new_msg)) # a set makes the membership test below O(1)
    log_prob_if_spam = 0.0
    log_prob_if_not_spam = 0.0
    for word, prob in feature_prob.iterrows():
        prob_if_spam = prob.spam
        prob_if_not_spam = prob.ham
        if word in tokens:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)
    # return pd.Series({
    #     "spam_prob": prob_if_spam,   # / (prob_if_spam + prob_if_not_spam),
    #     "ham_prob": prob_if_not_spam # / (prob_if_spam + prob_if_not_spam)
    # })
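One caveat with the version above: across thousands of vocabulary words the two log-sums are large negative numbers, and math.exp can underflow both to 0.0, turning the final division into 0/0. Here is a sketch of a numerically stable variant, using the algebraic identity p_s / (p_s + p_h) = 1 / (1 + exp(log p_h - log p_s)):
In [ ]:
def get_spam_prob_stable(row):
    """Same computation as get_spam_prob, but never exponentiates a huge negative sum."""
    tokens = set(tokenize(row.message))
    log_prob_if_spam = 0.0
    log_prob_if_not_spam = 0.0
    for word, prob in feature_prob.iterrows():
        if word in tokens:
            log_prob_if_spam += math.log(prob.spam)
            log_prob_if_not_spam += math.log(prob.ham)
        else:
            log_prob_if_spam += math.log(1.0 - prob.spam)
            log_prob_if_not_spam += math.log(1.0 - prob.ham)
    diff = log_prob_if_not_spam - log_prob_if_spam # small even when both sums are very negative
    if diff > 700: # math.exp would overflow; spam probability is effectively 0
        return 0.0
    return 1.0 / (1.0 + math.exp(diff))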
In [81]:
# test_probs = test.apply(get_spam_prob, axis=1)
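The cell above is left commented out, presumably because looping over feature_prob.iterrows() for every test message is slow. If you do run it, a minimal evaluation sketch (thresholding at 0.5, which implicitly assumes equal class priors since get_spam_prob ignores P(spam)) could look like:
In [ ]:
test_probs = test.apply(get_spam_prob, axis=1)
predictions = np.where(test_probs > 0.5, "spam", "ham") # 0.5 threshold assumes equal priors
print("accuracy:", (predictions == test.label).mean())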