In [1]:
import numpy as np

import csv
import string

from scipy.io import loadmat
from scipy import optimize

import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

from sklearn import svm

%load_ext autoreload
%autoreload 2

In [2]:
spam_test_file_path = '../course_materials/spamTest.mat'
spam_train_file_path = '../course_materials/spamTrain.mat'

eMail_1 = '../course_materials/emailSample1.txt'
eMail_2 = '../course_materials/emailSample2.txt'
spam_1 = '../course_materials/spamSample1.txt'
spam_2 = '../course_materials/spamSample2.txt'

vocabFile = '../course_materials/vocab.txt'

In [3]:
spamTestData = loadmat(spam_test_file_path)
spamTrainData = loadmat(spam_train_file_path)

In [4]:
print (spamTrainData.keys())
print (spamTrainData['X'].shape)
print (spamTrainData['y'].shape)
print (spamTestData.keys())
print (spamTestData['Xtest'].shape)
print (spamTestData['ytest'].shape)


dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])
(4000, 1899)
(4000, 1)
dict_keys(['__header__', '__version__', '__globals__', 'Xtest', 'ytest'])
(1000, 1899)
(1000, 1)

1 Data Extraction and Transformation

1.1 Import dictionary


In [5]:
def importVocab(vocabFile):
    wordList = csv.reader(open(vocabFile), delimiter="\t")
    vocab ={}
    for row in wordList:
        vocab[row[0]] = row[1]
    return vocab

1.2 Map Indeces onto e-Mails


In [6]:
def wordStemming(word):
    '''Remove suffixes -e, -s, -es, -ed, -ing from the words in e-mail'''
    wordSuffixes = ('s', 'es', 'ed', 'ing')
    if len(word)>2:
        for suffix in wordSuffixes:
            if word.endswith(suffix):
                return wordStemming(word[:-len(suffix)])
    return word

def preprocess_eMail(eMail_txt):
    '''preprocess the e-mail, so that it matches the vocabulary'''
    eMail = open(eMail_txt, 'r')
#     converting to lower case and stripping the punctuation
    wordList = [word.strip(string.punctuation).lower() for word in eMail.read().split()]

    for i, word in enumerate(wordList):
#         word stemming
        wordList[i] = wordStemming(word)
#         normalise URLs
        if 'http' in word or 'www' in word:
            wordList[i] = 'httpaddr'
#         normalise eMail addresses
        elif '@' in word:
            wordList[i] = 'emailaddr'
#         normalise numbers
        elif any(char.isdigit() for char in word):
            wordList[i] = 'number'
#         normalise dollar sign
        elif '$' in word:
            wordList[i] = 'dollar'
#     remove empty strings
    wordList = [word for word in wordList if word != '']
    return wordList

def extractFeatures(vocabFile, eMail_txt):
    vocab = importVocab(vocabFile)
    invVocab = {v: int(k) for k, v in vocab.items()}
    wordList = preprocess_eMail(eMail_txt)
    wordIndexList = [invVocab[word] for word in wordList if word in invVocab]
    uniqueIndex = list(set(wordIndexList))
    eMailVector = np.zeros(len(vocab))
    eMailVector[[uniqueIndex]] = 1
    return eMailVector

In [7]:
np.sum(extractFeatures(vocabFile, spam_1))


/Users/nikita/Documents/andrew-ng-2-python/.venv_andrew_ng_2_python/lib/python3.7/site-packages/ipykernel_launcher.py:42: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
Out[7]:
37.0

In [8]:
X_train = spamTrainData['X']
y_train = spamTrainData['y']
X_test = spamTestData['Xtest']
y_test = spamTestData['ytest']

σ = 1
gaussianSVM = svm.SVC(C=1, kernel='rbf', gamma=σ**(-2))
gaussianSVM.fit(X_train, y_train.flatten())

a = gaussianSVM.decision_function(X_train).reshape(y_train.shape)
print(a.shape)


(4000, 1)

In [9]:
b = a>=0

In [10]:
tP = np.sum(y_train*b)
fP = np.sum(y_train-b==-1)
fN = np.sum(y_train-b==1)
accuracy = (y_train.shape[0] - fP - fN)/y_train.shape[0]
precision  = tP/(tP+fP)
recall  = tP/(tP+fN)
print(f"\taccuracy {accuracy:.3f}\n\trecision {precision:.3f}\n\tRecall {recall:.3f}")


	accuracy 1.000
	recision 1.000
	Recall 1.000

In [11]:
c = gaussianSVM.decision_function(X_test).reshape(y_test.shape)

In [12]:
c.shape
d = c >= 0

In [13]:
tP = np.sum(y_test*d)
fP = np.sum(y_test-d==-1)
fN = np.sum(y_test-d==1)
accuracy = (y_test.shape[0] - fP - fN)/y_test.shape[0]
precision  = tP/(tP+fP)
recall  = tP/(tP+fN)
print(f"\taccuracy {accuracy:.3f}\n\trecision {precision:.3f}\n\tRecall {recall:.3f}")


	accuracy 0.798
	recision 1.000
	Recall 0.344

In [ ]: