In [1]:
from collections import Counter, defaultdict
import math, random, re, glob

In [2]:
def get_subject_data(path):
    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn,'r',encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))
                    # store the subject line and this file's spam flag in data

    return data

In [3]:
data = get_subject_data(r"data/spam_mail/*/*")
data[:5]


Out[3]:
[('Re: New Sequences Window', False),
 ('[zzzzteana] RE: Alexander', False),
 ('[zzzzteana] Moscow bomber', False),
 ("[IRR] Klez: The Virus That  Won't Die", False),
 ('Re: Insert signature', False)]

In [4]:
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

In [5]:
train_data, test_data = split_data(data, 0.75)
train_data[:5], test_data[:5]


Out[5]:
([('Re: New Sequences Window', False),
  ('[zzzzteana] RE: Alexander', False),
  ('[zzzzteana] Moscow bomber', False),
  ('Re: [zzzzteana] Nothing like mama used to make', False),
  ('[zzzzteana] Playboy wants to go out with a bang', False)],
 [("[IRR] Klez: The Virus That  Won't Die", False),
  ('Re: Insert signature', False),
  ('Re: [zzzzteana] Nothing like mama used to make', False),
  ('Re: [zzzzteana] Nothing like mama used to make', False),
  ('Re: New Sequences Window', False)])

In [6]:
def tokenize(message):
    message = message.lower()                       # convert to lowercase
    all_words = re.findall("[a-z0-9']+", message)   # extract the words
    return set(all_words)                           # remove duplicates

In [7]:
tokenize("turn the word_counts into a list of triplets")


Out[7]:
{'a', 'counts', 'into', 'list', 'of', 'the', 'triplets', 'turn', 'word'}
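
Note that the character class keeps apostrophes, so contractions survive as single tokens:

tokenize("The Virus That Won't Die")
# {'die', 'that', 'the', 'virus', "won't"}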

In [8]:
def count_words(training_set):
    """training set consists of pairs (message, is_spam)"""
    counts = defaultdict(lambda: [0, 0])   # word -> [spam_count, non_spam_count]
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [9]:
count_words(train_data[:5])


Out[9]:
defaultdict(<function __main__.count_words.<locals>.<lambda>>,
            {'a': [0, 1],
             'alexander': [0, 1],
             'bang': [0, 1],
             'bomber': [0, 1],
             'go': [0, 1],
             'like': [0, 1],
             'make': [0, 1],
             'mama': [0, 1],
             'moscow': [0, 1],
             'new': [0, 1],
             'nothing': [0, 1],
             'out': [0, 1],
             'playboy': [0, 1],
             're': [0, 3],
             'sequences': [0, 1],
             'to': [0, 2],
             'used': [0, 1],
             'wants': [0, 1],
             'window': [0, 1],
             'with': [0, 1],
             'zzzzteana': [0, 4]})

In [10]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """turn the word_counts into a list of triplets
    w, p(w | spam) and p(w | ~spam)"""
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.items()]
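
To see the pseudocount smoothing at work, here is a hand-checkable sketch on made-up counts (toy numbers, not from the corpus): with k = 0.5, a word seen in 1 of 3 spams gets (1 + 0.5) / (3 + 2 * 0.5) = 0.375 instead of a raw 1/3, and a word never seen in spam gets a small positive probability instead of 0.

toy_counts = {"viagra":  [1, 0],   # seen in 1 spam, 0 hams
              "meeting": [0, 2]}   # seen in 0 spams, 2 hams
word_probabilities(toy_counts, total_spams=3, total_non_spams=2)
# [('viagra', 0.375, 0.16666...), ('meeting', 0.125, 0.83333...)]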

In [11]:
num_spams = len([is_spam for message, is_spam in train_data if is_spam])
num_non_spams = len(train_data) - num_spams
word_probabilities(count_words(train_data), num_spams, num_non_spams)[:5]

In [12]:
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)
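
As a quick sanity check with made-up word probabilities (the toy values from above, not trained ones), a message containing only the spammy word should score well above 0.5; words that never appeared in training ("now" here) are simply ignored, since the loop runs over the trained vocabulary:

toy_probs = [("viagra", 0.375, 0.1667),
             ("meeting", 0.125, 0.8333)]
spam_probability(toy_probs, "viagra now")
# spam:     0.375  * (1 - 0.125)  = 0.328125
# not spam: 0.1667 * (1 - 0.8333) ≈ 0.0278
# => 0.328125 / (0.328125 + 0.0278) ≈ 0.92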

In [13]:
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)
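
This is just Bayes's theorem with the simplifying assumption of equal priors, P(spam) = P(not spam), so that P(spam | w) = p(w | spam) / (p(w | spam) + p(w | ~spam)). For the toy values above, p_spam_given_word(("viagra", 0.375, 0.1667)) comes out to about 0.69.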

In [14]:
class NaiveBayesClassifier:
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):

        # count spam and non-spam messages
        num_spams = len([is_spam
                         for message, is_spam in training_set
                         if is_spam])
        num_non_spams = len(training_set) - num_spams

        # run training data through our "pipeline"
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)

    def classify(self, message):
        return spam_probability(self.word_probs, message)
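
Before the full run, a smoke test on a handful of made-up subjects (illustrative only, not part of the corpus):

toy_train = [("cheap viagra", True), ("viagra sale", True),
             ("meeting tomorrow", False), ("re: meeting notes", False)]
clf = NaiveBayesClassifier()
clf.train(toy_train)
clf.classify("viagra")    # high, close to 1
clf.classify("meeting")   # low, close to 0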

In [15]:
def train_and_test_model(path):

    data = get_subject_data(path)
    random.seed(0)      # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    # sort by spam_probability, smallest to largest
    classified.sort(key=lambda row: row[2])

    # the highest predicted spam probabilities among the actual hams
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]

    # the lowest predicted spam probabilities among the actual spams
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)

In [16]:
if __name__ == "__main__":
    #train_and_test_model(r"c:\spam\*\*")
    train_and_test_model(r"data/spam_mail/*/*")


Counter({(False, False): 704, (True, True): 101, (True, False): 38, (False, True): 33})
spammiest_hams [('Attn programmers: support offered [FLOSS-Sarai Initiative]', False, 0.9756129605142085), ('2000+ year old Greek computer reinterpreted', False, 0.983535500810491), ('What to look for in your next smart phone (Tech Update)', False, 0.9898719206903178), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9995349057803374), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9995349057803374)]
hammiest_spams [('Re: girls', True, 0.0009525186158415109), ('Introducing Chase Platinum for Students with a 0% Introductory APR', True, 0.0012566691211085331), ('.Message report from your contact page....//ytu855 rkq', True, 0.0015109358288594077), ('Testing a system, please delete', True, 0.002692053883680512), ('Never pay for the goodz again (8SimUgQ)', True, 0.0059116232219249675)]
spammiest_words [('year', 0.028767123287671233, 0.00022893772893772894), ('rates', 0.031506849315068496, 0.00022893772893772894), ('sale', 0.031506849315068496, 0.00022893772893772894), ('systemworks', 0.036986301369863014, 0.00022893772893772894), ('money', 0.03972602739726028, 0.00022893772893772894)]
hammiest_words [('spambayes', 0.0013698630136986301, 0.04601648351648352), ('users', 0.0013698630136986301, 0.036401098901098904), ('razor', 0.0013698630136986301, 0.030906593406593408), ('zzzzteana', 0.0013698630136986301, 0.029075091575091576), ('sadev', 0.0013698630136986301, 0.026785714285714284)]
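
From the confusion counts above (704 true negatives, 101 true positives, 38 false negatives, 33 false positives), precision and recall follow directly:

precision = 101 / (101 + 33)   # ≈ 0.754: about 75% of messages flagged as spam really are spam
recall    = 101 / (101 + 38)   # ≈ 0.727: about 73% of actual spam gets flagged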