In [10]:
from __future__ import division
import re
from collections import defaultdict
import math
from collections import Counter
from ml import *
In [2]:
# Beware underflow (accumulate log-probabilities) and use pseudocounts to smooth rare (word, label) combinations
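A quick toy illustration of both warnings:

In [ ]:
# Multiplying many small probabilities underflows to 0.0 in floating point,
# while summing their logs stays representable:
probs = [1e-5] * 100
product = 1.0
for p in probs:
    product *= p                               # ends up exactly 0.0
log_sum = sum(math.log(p) for p in probs)      # about -1151.3, no underflow
# The pseudocount k in word_probabilities below addresses the second issue:
# it keeps an unseen (word, label) pair from getting probability 0
# (and hence math.log(0), which raises a ValueError).
product, log_sum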
In [71]:
def drop_final_s(word):
    return re.sub("s$", "", word)

def tokenize(message, drop_s=False):
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)
    if drop_s:
        all_words = [drop_final_s(w) for w in all_words]
    return set(all_words)

def count_words(training_set):
    """training_set is a list of (message, is_spam) pairs"""
    counts = defaultdict(lambda: [0, 0])   # word -> [spam_count, non_spam_count]
    for message, is_spam in training_set:             # for each message
        for word in tokenize(message, drop_s=False):  # split into words
            counts[word][0 if is_spam else 1] += 1    # increment the spam or non-spam count
    return counts

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """turn the (word, label) counts into smoothed conditional probabilities
    (w, p(w | spam), p(w | not spam)), using pseudocount k"""
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
            for w, (spam, non_spam) in counts.iteritems()]

def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    # accumulate log-probabilities over the whole vocabulary:
    # words present in the message contribute p, absent words contribute (1 - p)
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    # Bayes' theorem, assuming equal priors p(spam) = p(not spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

class NaiveBayesClassifier:

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        # count the spam and non-spam messages
        num_spams = len([is_spam
                         for message, is_spam in training_set
                         if is_spam])
        num_non_spams = len(training_set) - num_spams

        # run the word-count / probability pipeline
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)

    def classify(self, message):
        return spam_probability(self.word_probs, message)
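As a sanity check, the classifier on a tiny hand-made training set (hypothetical messages, chosen so that "viagra" looks spammy and "meeting" does not):

In [ ]:
# toy, hand-made examples (not from the corpus)
toy_train = [("cheap viagra", True),
             ("viagra deal", True),
             ("meeting schedule", False),
             ("lunch meeting", False)]
toy = NaiveBayesClassifier()
toy.train(toy_train)
toy.classify("viagra")    # well above 0.5 (about 0.96 with k = 0.5)
toy.classify("meeting")   # well below 0.5 (about 0.04)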
In [72]:
import glob
path = r'.\spam\*\*'
data = []
for fn in glob.glob(path):
    is_spam = "ham" not in fn   # files under the ham folders are the non-spam examples
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))
In [73]:
len(data)
Out[73]:
In [74]:
import random
random.seed(0)   # make the train/test split reproducible
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)
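split_data comes from the ml module; a minimal sketch consistent with how it's used here (a probabilistic split, which is why random.seed(0) is set first) would be:

In [ ]:
# assumed implementation -- the actual one is imported from ml
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results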
In [75]:
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]
# count (actual, predicted) pairs, thresholding the spam probability at 0.5
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)
In [76]:
counts
Out[76]:
In [77]:
tn = counts[(False, False)]  # true negatives:  ham classified as ham
tp = counts[(True, True)]    # true positives:  spam classified as spam
fn = counts[(True, False)]   # false negatives: spam classified as ham
fp = counts[(False, True)]   # false positives: ham classified as spam
In [78]:
print accuracy(tp, fp, fn, tn)
print precision(tp, fp, fn, tn)
print recall(tp, fp, fn, tn)
print f1_score(tp, fp, fn, tn)
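accuracy, precision, recall, and f1_score also come from the ml module; minimal versions matching the standard definitions (an assumption about ml's implementations) would be:

In [ ]:
# assumed implementations -- the actual ones are imported from ml
def accuracy(tp, fp, fn, tn):
    return (tp + tn) / (tp + fp + fn + tn)

def precision(tp, fp, fn, tn):
    return tp / (tp + fp)

def recall(tp, fp, fn, tn):
    return tp / (tp + fn)

def f1_score(tp, fp, fn, tn):
    p, r = precision(tp, fp, fn, tn), recall(tp, fp, fn, tn)
    return 2 * p * r / (p + r)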
In [51]:
# sort by ascending predicted spam probability
classified.sort(key=lambda row: row[2])
In [52]:
# the non-spams with the highest predicted spam probability
spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
In [53]:
spammiest_hams
Out[53]:
In [54]:
# the spams with the lowest predicted spam probability
hammiest_spams = filter(lambda row: row[1], classified)[:5]
In [55]:
hammiest_spams
Out[55]:
In [57]:
def p_spam_given_word(word_prob):
    """p(spam | message contains word), via Bayes' theorem with equal priors"""
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

words = sorted(classifier.word_probs, key=p_spam_given_word)
In [58]:
words[-5:]
Out[58]:
In [59]:
words[:5]
Out[59]:
In [ ]: