In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from math import log, sqrt
import pandas as pd
import numpy as np
import re
%matplotlib inline

In [2]:
mails = pd.read_csv('spam.csv', encoding = 'latin-1')
mails.head()


Out[2]:
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN

In [3]:
mails.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
mails.head()


Out[3]:
v1 v2
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...

In [4]:
mails.rename(columns = {'v1': 'labels', 'v2': 'message'}, inplace = True)
mails.head()


Out[4]:
labels message
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...

In [5]:
mails['labels'].value_counts()


Out[5]:
ham     4825
spam     747
Name: labels, dtype: int64

In [6]:
mails['label'] = mails['labels'].map({'ham': 0, 'spam': 1})
mails.head()


Out[6]:
labels message label
0 ham Go until jurong point, crazy.. Available only ... 0
1 ham Ok lar... Joking wif u oni... 0
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1
3 ham U dun say so early hor... U c already then say... 0
4 ham Nah I don't think he goes to usf, he lives aro... 0

In [7]:
mails.drop(['labels'], axis = 1, inplace = True)
mails.head()


Out[7]:
message label
0 Go until jurong point, crazy.. Available only ... 0
1 Ok lar... Joking wif u oni... 0
2 Free entry in 2 a wkly comp to win FA Cup fina... 1
3 U dun say so early hor... U c already then say... 0
4 Nah I don't think he goes to usf, he lives aro... 0

In [8]:
totalMails = 4825 + 747
trainIndex, testIndex = list(), list()
for i in range(mails.shape[0]):
    if np.random.uniform(0, 1) < 0.75:
        trainIndex += [i]
    else:
        testIndex += [i]
trainData = mails.loc[trainIndex]
testData = mails.loc[testIndex]

In [9]:
trainData.reset_index(inplace = True)
trainData.drop(['index'], axis = 1, inplace = True)
trainData.head()


Out[9]:
message label
0 Go until jurong point, crazy.. Available only ... 0
1 Free entry in 2 a wkly comp to win FA Cup fina... 1
2 U dun say so early hor... U c already then say... 0
3 Nah I don't think he goes to usf, he lives aro... 0
4 FreeMsg Hey there darling it's been 3 week's n... 1

In [10]:
testData.reset_index(inplace = True)
testData.drop(['index'], axis = 1, inplace = True)
testData.head()


Out[10]:
message label
0 Ok lar... Joking wif u oni... 0
1 I'm gonna be home soon and i don't want to tal... 0
2 XXXMobileMovieClub: To use your credit, click ... 1
3 England v Macedonia - dont miss the goals/team... 1
4 Lol your always so convincing. 0

In [11]:
trainData['label'].value_counts()


Out[11]:
0    3617
1     563
Name: label, dtype: int64

In [12]:
testData['label'].value_counts()


Out[12]:
0    1208
1     184
Name: label, dtype: int64

In [15]:
trainData.head()


Out[15]:
message label
0 Go until jurong point, crazy.. Available only ... 0
1 Free entry in 2 a wkly comp to win FA Cup fina... 1
2 U dun say so early hor... U c already then say... 0
3 FreeMsg Hey there darling it's been 3 week's n... 1
4 As per your request 'Melle Melle (Oru Minnamin... 0

In [16]:
trainData['label'].value_counts()


Out[16]:
0    3628
1     556
Name: label, dtype: int64

In [17]:
testData.head()


Out[17]:
message label
0 Ok lar... Joking wif u oni... 0
1 Nah I don't think he goes to usf, he lives aro... 0
2 Even my brother is not like to speak with me. ... 0
3 SIX chances to win CASH! From 100 to 20,000 po... 1
4 URGENT! You have won a 1 week FREE membership ... 1

In [18]:
testData['label'].value_counts()


Out[18]:
0    1197
1     191
Name: label, dtype: int64

In [14]:
def process_message(message, lower_case = True, stem = True, stop_words = True, gram = 2):
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    if gram > 1:
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]   
    return words

In [15]:
class SpamClassifier(object):
    def __init__(self, trainData, method = 'tf-idf'):
        self.mails, self.labels = trainData['message'], trainData['label']
        self.method = method

    def train(self):
        self.calc_TF_and_IDF()
        if self.method == 'tf-idf':
            self.calc_TF_IDF()
        else:
            self.calc_prob()

    def calc_prob(self):
        self.prob_spam = dict()
        self.prob_ham = dict()
        for word in self.tf_spam:
            self.prob_spam[word] = (self.tf_spam[word] + 1) / (self.spam_words + \
                                                                len(list(self.tf_spam.keys())))
        for word in self.tf_ham:
            self.prob_ham[word] = (self.tf_ham[word] + 1) / (self.ham_words + \
                                                                len(list(self.tf_ham.keys())))
        self.prob_spam_mail, self.prob_ham_mail = self.spam_mails / self.total_mails, self.ham_mails / self.total_mails 


    def calc_TF_and_IDF(self):
        noOfMessages = self.mails.shape[0]
        self.spam_mails, self.ham_mails = self.labels.value_counts()[1], self.labels.value_counts()[0]
        self.total_mails = self.spam_mails + self.ham_mails
        self.spam_words = 0
        self.ham_words = 0
        self.tf_spam = dict()
        self.tf_ham = dict()
        self.idf_spam = dict()
        self.idf_ham = dict()
        for i in range(noOfMessages):
            message_processed = process_message(self.mails[i])
            count = list() #To keep track of whether the word has ocured in the message or not.
                           #For IDF
            for word in message_processed:
                if self.labels[i]:
                    self.tf_spam[word] = self.tf_spam.get(word, 0) + 1
                    self.spam_words += 1
                else:
                    self.tf_ham[word] = self.tf_ham.get(word, 0) + 1
                    self.ham_words += 1
                if word not in count:
                    count += [word]
            for word in count:
                if self.labels[i]:
                    self.idf_spam[word] = self.idf_spam.get(word, 0) + 1
                else:
                    self.idf_ham[word] = self.idf_ham.get(word, 0) + 1

    def calc_TF_IDF(self):
        self.prob_spam = dict()
        self.prob_ham = dict()
        self.sum_tf_idf_spam = 0
        self.sum_tf_idf_ham = 0
        for word in self.tf_spam:
            self.prob_spam[word] = (self.tf_spam[word]) * log((self.spam_mails + self.ham_mails) \
                                                          / (self.idf_spam[word] + self.idf_ham.get(word, 0)))
            self.sum_tf_idf_spam += self.prob_spam[word]
        for word in self.tf_spam:
            self.prob_spam[word] = (self.prob_spam[word] + 1) / (self.sum_tf_idf_spam + len(list(self.prob_spam.keys())))
            
        for word in self.tf_ham:
            self.prob_ham[word] = (self.tf_ham[word]) * log((self.spam_mails + self.ham_mails) \
                                                          / (self.idf_spam.get(word, 0) + self.idf_ham[word]))
            self.sum_tf_idf_ham += self.prob_ham[word]
        for word in self.tf_ham:
            self.prob_ham[word] = (self.prob_ham[word] + 1) / (self.sum_tf_idf_ham + len(list(self.prob_ham.keys())))
            
    
        self.prob_spam_mail, self.prob_ham_mail = self.spam_mails / self.total_mails, self.ham_mails / self.total_mails 
                    
    def classify(self, processed_message):
        pSpam, pHam = 0, 0
        for word in processed_message:                
            if word in self.prob_spam:
                pSpam += log(self.prob_spam[word])
            else:
                if self.method == 'tf-idf':
                    pSpam -= log(self.sum_tf_idf_spam + len(list(self.prob_spam.keys())))
                else:
                    pSpam -= log(self.spam_words + len(list(self.prob_spam.keys())))
            if word in self.prob_ham:
                pHam += log(self.prob_ham[word])
            else:
                if self.method == 'tf-idf':
                    pHam -= log(self.sum_tf_idf_ham + len(list(self.prob_ham.keys()))) 
                else:
                    pHam -= log(self.ham_words + len(list(self.prob_ham.keys())))
            pSpam += log(self.prob_spam_mail)
            pHam += log(self.prob_ham_mail)
        return pSpam >= pHam
    
    def predict(self, testData):
        result = dict()
        for (i, message) in enumerate(testData):
            processed_message = process_message(message)
            result[i] = int(self.classify(processed_message))
        return result

In [16]:
def metrics(labels, predictions):
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    Fscore = 2 * precision * recall / (precision + recall)
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", Fscore)
    print("Accuracy: ", accuracy)

In [17]:
sc_tf_idf = SpamClassifier(trainData, 'tf-idf')
sc_tf_idf.train()
preds_tf_idf = sc_tf_idf.predict(testData['message'])
metrics(testData['label'], preds_tf_idf)


Precision:  0.8970588235294118
Recall:  0.6630434782608695
F-score:  0.7625
Accuracy:  0.9454022988505747

In [19]:
sc_bow = SpamClassifier(trainData, 'bow')
sc_bow.train()
preds_bow = sc_bow.predict(testData['message'])
metrics(testData['label'], preds_bow)


Precision:  0.8898305084745762
Recall:  0.5706521739130435
F-score:  0.695364238410596
Accuracy:  0.9339080459770115

In [22]:
pm = process_message('Hello, Pls send a message when you are available')
sc_tf_idf.classify(pm)


Out[22]:
False

In [25]:
pm = process_message('Congratulations ur awarded $700 in our competition')
sc_tf_idf.classify(pm)


Out[25]:
True

In [ ]: