notebook.community

Edit and run



In [1]:

    
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from math import log, sqrt
import pandas as pd
import numpy as np
import re
%matplotlib inline



In [2]:

    
mails = pd.read_csv('spam.csv', encoding = 'latin-1')
mails.head()









    Out[2]:







  
    
      
      v1
      v2
      Unnamed: 2
      Unnamed: 3
      Unnamed: 4
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
      NaN
      NaN
      NaN
    
    
      1
      ham
      Ok lar... Joking wif u oni...
      NaN
      NaN
      NaN
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
      NaN
      NaN
      NaN
    
    
      3
      ham
      U dun say so early hor... U c already then say...
      NaN
      NaN
      NaN
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...
      NaN
      NaN
      NaN



In [3]:

    
mails.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
mails.head()









    Out[3]:







  
    
      
      v1
      v2
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
    
    
      1
      ham
      Ok lar... Joking wif u oni...
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      ham
      U dun say so early hor... U c already then say...
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...



In [4]:

    
mails.rename(columns = {'v1': 'labels', 'v2': 'message'}, inplace = True)
mails.head()









    Out[4]:







  
    
      
      labels
      message
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
    
    
      1
      ham
      Ok lar... Joking wif u oni...
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      ham
      U dun say so early hor... U c already then say...
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...



In [5]:

    
mails['labels'].value_counts()









    Out[5]:





ham     4825
spam     747
Name: labels, dtype: int64



In [6]:

    
mails['label'] = mails['labels'].map({'ham': 0, 'spam': 1})
mails.head()









    Out[6]:







  
    
      
      labels
      message
      label
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
      0
    
    
      1
      ham
      Ok lar... Joking wif u oni...
      0
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
      1
    
    
      3
      ham
      U dun say so early hor... U c already then say...
      0
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...
      0



In [7]:

    
mails.drop(['labels'], axis = 1, inplace = True)
mails.head()









    Out[7]:







  
    
      
      message
      label
    
  
  
    
      0
      Go until jurong point, crazy.. Available only ...
      0
    
    
      1
      Ok lar... Joking wif u oni...
      0
    
    
      2
      Free entry in 2 a wkly comp to win FA Cup fina...
      1
    
    
      3
      U dun say so early hor... U c already then say...
      0
    
    
      4
      Nah I don't think he goes to usf, he lives aro...
      0



In [8]:

    
totalMails = 4825 + 747
trainIndex, testIndex = list(), list()
for i in range(mails.shape[0]):
    if np.random.uniform(0, 1) < 0.75:
        trainIndex += [i]
    else:
        testIndex += [i]
trainData = mails.loc[trainIndex]
testData = mails.loc[testIndex]



In [9]:

    
trainData.reset_index(inplace = True)
trainData.drop(['index'], axis = 1, inplace = True)
trainData.head()









    Out[9]:







  
    
      
      message
      label
    
  
  
    
      0
      Go until jurong point, crazy.. Available only ...
      0
    
    
      1
      Free entry in 2 a wkly comp to win FA Cup fina...
      1
    
    
      2
      U dun say so early hor... U c already then say...
      0
    
    
      3
      Nah I don't think he goes to usf, he lives aro...
      0
    
    
      4
      FreeMsg Hey there darling it's been 3 week's n...
      1



In [10]:

    
testData.reset_index(inplace = True)
testData.drop(['index'], axis = 1, inplace = True)
testData.head()









    Out[10]:







  
    
      
      message
      label
    
  
  
    
      0
      Ok lar... Joking wif u oni...
      0
    
    
      1
      I'm gonna be home soon and i don't want to tal...
      0
    
    
      2
      XXXMobileMovieClub: To use your credit, click ...
      1
    
    
      3
      England v Macedonia - dont miss the goals/team...
      1
    
    
      4
      Lol your always so convincing.
      0



In [11]:

    
trainData['label'].value_counts()









    Out[11]:





0    3617
1     563
Name: label, dtype: int64



In [12]:

    
testData['label'].value_counts()









    Out[12]:





0    1208
1     184
Name: label, dtype: int64



In [15]:

    
trainData.head()









    Out[15]:






  
    
      
      message
      label
    
  
  
    
      0
      Go until jurong point, crazy.. Available only ...
      0
    
    
      1
      Free entry in 2 a wkly comp to win FA Cup fina...
      1
    
    
      2
      U dun say so early hor... U c already then say...
      0
    
    
      3
      FreeMsg Hey there darling it's been 3 week's n...
      1
    
    
      4
      As per your request 'Melle Melle (Oru Minnamin...
      0



In [16]:

    
trainData['label'].value_counts()









    Out[16]:





0    3628
1     556
Name: label, dtype: int64



In [17]:

    
testData.head()









    Out[17]:






  
    
      
      message
      label
    
  
  
    
      0
      Ok lar... Joking wif u oni...
      0
    
    
      1
      Nah I don't think he goes to usf, he lives aro...
      0
    
    
      2
      Even my brother is not like to speak with me. ...
      0
    
    
      3
      SIX chances to win CASH! From 100 to 20,000 po...
      1
    
    
      4
      URGENT! You have won a 1 week FREE membership ...
      1



In [18]:

    
testData['label'].value_counts()









    Out[18]:





0    1197
1     191
Name: label, dtype: int64



In [14]:

    
def process_message(message, lower_case = True, stem = True, stop_words = True, gram = 2):
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    if gram > 1:
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]   
    return words



In [15]:

    
class SpamClassifier(object):
    def __init__(self, trainData, method = 'tf-idf'):
        self.mails, self.labels = trainData['message'], trainData['label']
        self.method = method

    def train(self):
        self.calc_TF_and_IDF()
        if self.method == 'tf-idf':
            self.calc_TF_IDF()
        else:
            self.calc_prob()

    def calc_prob(self):
        self.prob_spam = dict()
        self.prob_ham = dict()
        for word in self.tf_spam:
            self.prob_spam[word] = (self.tf_spam[word] + 1) / (self.spam_words + \
                                                                len(list(self.tf_spam.keys())))
        for word in self.tf_ham:
            self.prob_ham[word] = (self.tf_ham[word] + 1) / (self.ham_words + \
                                                                len(list(self.tf_ham.keys())))
        self.prob_spam_mail, self.prob_ham_mail = self.spam_mails / self.total_mails, self.ham_mails / self.total_mails 


    def calc_TF_and_IDF(self):
        noOfMessages = self.mails.shape[0]
        self.spam_mails, self.ham_mails = self.labels.value_counts()[1], self.labels.value_counts()[0]
        self.total_mails = self.spam_mails + self.ham_mails
        self.spam_words = 0
        self.ham_words = 0
        self.tf_spam = dict()
        self.tf_ham = dict()
        self.idf_spam = dict()
        self.idf_ham = dict()
        for i in range(noOfMessages):
            message_processed = process_message(self.mails[i])
            count = list() #To keep track of whether the word has ocured in the message or not.
                           #For IDF
            for word in message_processed:
                if self.labels[i]:
                    self.tf_spam[word] = self.tf_spam.get(word, 0) + 1
                    self.spam_words += 1
                else:
                    self.tf_ham[word] = self.tf_ham.get(word, 0) + 1
                    self.ham_words += 1
                if word not in count:
                    count += [word]
            for word in count:
                if self.labels[i]:
                    self.idf_spam[word] = self.idf_spam.get(word, 0) + 1
                else:
                    self.idf_ham[word] = self.idf_ham.get(word, 0) + 1

    def calc_TF_IDF(self):
        self.prob_spam = dict()
        self.prob_ham = dict()
        self.sum_tf_idf_spam = 0
        self.sum_tf_idf_ham = 0
        for word in self.tf_spam:
            self.prob_spam[word] = (self.tf_spam[word]) * log((self.spam_mails + self.ham_mails) \
                                                          / (self.idf_spam[word] + self.idf_ham.get(word, 0)))
            self.sum_tf_idf_spam += self.prob_spam[word]
        for word in self.tf_spam:
            self.prob_spam[word] = (self.prob_spam[word] + 1) / (self.sum_tf_idf_spam + len(list(self.prob_spam.keys())))
            
        for word in self.tf_ham:
            self.prob_ham[word] = (self.tf_ham[word]) * log((self.spam_mails + self.ham_mails) \
                                                          / (self.idf_spam.get(word, 0) + self.idf_ham[word]))
            self.sum_tf_idf_ham += self.prob_ham[word]
        for word in self.tf_ham:
            self.prob_ham[word] = (self.prob_ham[word] + 1) / (self.sum_tf_idf_ham + len(list(self.prob_ham.keys())))
            
    
        self.prob_spam_mail, self.prob_ham_mail = self.spam_mails / self.total_mails, self.ham_mails / self.total_mails 
                    
    def classify(self, processed_message):
        pSpam, pHam = 0, 0
        for word in processed_message:                
            if word in self.prob_spam:
                pSpam += log(self.prob_spam[word])
            else:
                if self.method == 'tf-idf':
                    pSpam -= log(self.sum_tf_idf_spam + len(list(self.prob_spam.keys())))
                else:
                    pSpam -= log(self.spam_words + len(list(self.prob_spam.keys())))
            if word in self.prob_ham:
                pHam += log(self.prob_ham[word])
            else:
                if self.method == 'tf-idf':
                    pHam -= log(self.sum_tf_idf_ham + len(list(self.prob_ham.keys()))) 
                else:
                    pHam -= log(self.ham_words + len(list(self.prob_ham.keys())))
            pSpam += log(self.prob_spam_mail)
            pHam += log(self.prob_ham_mail)
        return pSpam >= pHam
    
    def predict(self, testData):
        result = dict()
        for (i, message) in enumerate(testData):
            processed_message = process_message(message)
            result[i] = int(self.classify(processed_message))
        return result



In [16]:

    
def metrics(labels, predictions):
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    Fscore = 2 * precision * recall / (precision + recall)
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", Fscore)
    print("Accuracy: ", accuracy)



In [17]:

    
sc_tf_idf = SpamClassifier(trainData, 'tf-idf')
sc_tf_idf.train()
preds_tf_idf = sc_tf_idf.predict(testData['message'])
metrics(testData['label'], preds_tf_idf)









    



Precision:  0.8970588235294118
Recall:  0.6630434782608695
F-score:  0.7625
Accuracy:  0.9454022988505747



In [19]:

    
sc_bow = SpamClassifier(trainData, 'bow')
sc_bow.train()
preds_bow = sc_bow.predict(testData['message'])
metrics(testData['label'], preds_bow)









    



Precision:  0.8898305084745762
Recall:  0.5706521739130435
F-score:  0.695364238410596
Accuracy:  0.9339080459770115



In [22]:

    
pm = process_message('Hello, Pls send a message when you are available')
sc_tf_idf.classify(pm)









    Out[22]:





False



In [25]:

    
pm = process_message('Congratulations ur awarded $700 in our competition')
sc_tf_idf.classify(pm)









    Out[25]:





True



In [ ]:

	v1	v2	Unnamed: 2	Unnamed: 3	Unnamed: 4
0	ham	Go until jurong point, crazy.. Available only ...	NaN	NaN	NaN
1	ham	Ok lar... Joking wif u oni...	NaN	NaN	NaN
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...	NaN	NaN	NaN
3	ham	U dun say so early hor... U c already then say...	NaN	NaN	NaN
4	ham	Nah I don't think he goes to usf, he lives aro...	NaN	NaN	NaN

	message	label
0	Ok lar... Joking wif u oni...	0
1	I'm gonna be home soon and i don't want to tal...	0
2	XXXMobileMovieClub: To use your credit, click ...	1
3	England v Macedonia - dont miss the goals/team...	1
4	Lol your always so convincing.	0

	message	label
0	Ok lar... Joking wif u oni...	0
1	Nah I don't think he goes to usf, he lives aro...	0
2	Even my brother is not like to speak with me. ...	0
3	SIX chances to win CASH! From 100 to 20,000 po...	1
4	URGENT! You have won a 1 week FREE membership ...	1