In [1]:
import numpy as np
import csv
import string
from scipy.io import loadmat
from scipy import optimize
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
from sklearn import svm
%load_ext autoreload
%autoreload 2
In [2]:
# Folder holding every course-supplied file, relative to this notebook.
# Named once so that relocating the data is a one-line change.
COURSE_MATERIALS_DIR = '../course_materials'

# Test / training sets stored as .mat files.
spam_test_file_path = f'{COURSE_MATERIALS_DIR}/spamTest.mat'
spam_train_file_path = f'{COURSE_MATERIALS_DIR}/spamTrain.mat'

# Sample messages as plain text.
eMail_1 = f'{COURSE_MATERIALS_DIR}/emailSample1.txt'
eMail_2 = f'{COURSE_MATERIALS_DIR}/emailSample2.txt'
spam_1 = f'{COURSE_MATERIALS_DIR}/spamSample1.txt'
spam_2 = f'{COURSE_MATERIALS_DIR}/spamSample2.txt'

# Vocabulary list (read as tab-separated rows by importVocab).
vocabFile = f'{COURSE_MATERIALS_DIR}/vocab.txt'
In [3]:
# Load both .mat files into dictionaries: the test set first, then the training set.
spamTestData, spamTrainData = map(loadmat, (spam_test_file_path, spam_train_file_path))
In [4]:
# For each loaded dictionary, list its keys and then the shape of the two
# arrays used later on (training set first, then test set).
for matData, arrayNames in ((spamTrainData, ('X', 'y')), (spamTestData, ('Xtest', 'ytest'))):
    print (matData.keys())
    for arrayName in arrayNames:
        print (matData[arrayName].shape)
In [5]:
def importVocab(vocabFile):
    '''Read a tab-separated vocabulary file into a dictionary.

    For every non-empty row, the first column becomes the key and the second
    column the value; both are kept as the strings read from the file.

    Parameters
    ----------
    vocabFile : path of the tab-separated text file.

    Returns
    -------
    dict mapping first-column strings to second-column strings.

    Raises
    ------
    IndexError if a non-empty row has fewer than two columns.
    '''
    # The context manager closes the handle once the rows are consumed;
    # newline='' is the setting the csv module asks for on text files.
    with open(vocabFile, newline='') as vocabHandle:
        wordList = csv.reader(vocabHandle, delimiter="\t")
        # Blank lines come back from csv.reader as empty rows; skip them.
        return {row[0]: row[1] for row in wordList if row}
In [6]:
def wordStemming(word):
    '''Crude stemmer: repeatedly chop a trailing 's', 'es', 'ed' or 'ing'.

    The suffixes are tried in that order and the first one that matches is
    removed; the check is then repeated on the shortened word. Stripping stops
    once the word has two characters or fewer, or when no suffix matches.

    Because 's' is tested before 'es', a word ending in 'es' only ever loses
    its final 's'. A trailing 'e' is never removed.
    '''
    wordSuffixes = ('s', 'es', 'ed', 'ing')
    stem = word
    while len(stem) > 2:
        # First suffix (in tuple order) that the current stem ends with, if any.
        hit = next((ending for ending in wordSuffixes if stem.endswith(ending)), None)
        if hit is None:
            break
        stem = stem[:-len(hit)]
    return stem
def preprocess_eMail(eMail_txt):
    '''Preprocess the e-mail, so that it matches the vocabulary.

    The file is split on whitespace; every token is lower-cased and has its
    leading and trailing punctuation removed. Each token is then replaced by:

    * 'httpaddr'  if it contains 'http' or 'www',
    * 'emailaddr' if it contains '@',
    * 'number'    if it contains any digit,
    * 'dollar'    if it contains '$',
    * otherwise its stem as returned by wordStemming.

    The first matching rule wins and the checks look at the unstemmed token.
    Tokens that end up empty are dropped.

    Parameters
    ----------
    eMail_txt : path of the text file holding the e-mail.

    Returns
    -------
    list of str, the normalised tokens in their original order.
    '''
    # Read the whole message inside a context manager so the handle is
    # closed even if reading fails.
    with open(eMail_txt, 'r') as eMail:
        rawText = eMail.read()
    # converting to lower case and stripping the punctuation
    wordList = [word.strip(string.punctuation).lower() for word in rawText.split()]
    for i, word in enumerate(wordList):
        # word stemming (may be overridden by one of the normalisations below)
        wordList[i] = wordStemming(word)
        # normalise URLs
        if 'http' in word or 'www' in word:
            wordList[i] = 'httpaddr'
        # normalise eMail addresses
        elif '@' in word:
            wordList[i] = 'emailaddr'
        # normalise numbers
        elif any(char.isdigit() for char in word):
            wordList[i] = 'number'
        # normalise dollar sign; '$' is part of string.punctuation, so only a
        # '$' inside a token survives the stripping above and reaches this test
        elif '$' in word:
            wordList[i] = 'dollar'
    # remove empty strings
    wordList = [word for word in wordList if word != '']
    return wordList
def extractFeatures(vocabFile, eMail_txt):
    '''Encode an e-mail as a binary bag-of-words vector over the vocabulary.

    Parameters
    ----------
    vocabFile : path of the vocabulary file (first column: integer id,
        second column: word), read with importVocab.
    eMail_txt : path of the e-mail text file, tokenised with preprocess_eMail.

    Returns
    -------
    numpy.ndarray of length len(vocab); an entry is 1.0 when the matching
    vocabulary word occurs at least once in the e-mail and 0.0 otherwise.
    '''
    vocab = importVocab(vocabFile)
    invVocab = {v: int(k) for k, v in vocab.items()}
    # Ids are shifted by the smallest id so they always land on positions
    # 0 .. len(vocab)-1. A 0-based vocabulary is unaffected; with a 1-based one
    # the highest id would otherwise index past the end of the vector.
    # NOTE(review): assumes contiguous ids and that the columns of the training
    # matrix follow the same order — confirm against vocab.txt.
    firstIndex = min(invVocab.values(), default=0)
    wordList = preprocess_eMail(eMail_txt)
    # A set collapses repeated words; each position only needs setting once.
    uniqueIndex = sorted({invVocab[word] - firstIndex for word in wordList if word in invVocab})
    eMailVector = np.zeros(len(vocab))
    eMailVector[uniqueIndex] = 1
    return eMailVector
In [7]:
# Number of entries set to 1 in the feature vector of the first spam sample.
extractFeatures(vocabFile, spam_1).sum()
Out[7]:
In [8]:
# Pull the feature matrices and label columns out of the loaded dictionaries.
X_train, y_train = spamTrainData['X'], spamTrainData['y']
X_test, y_test = spamTestData['Xtest'], spamTestData['ytest']

# RBF-kernel SVM whose gamma is derived from the bandwidth σ as σ^-2.
# NOTE(review): for a Gaussian kernel written exp(-||u-v||^2 / (2σ^2)) the
# matching scikit-learn gamma would be 1/(2σ^2) — confirm σ^-2 is intended.
σ = 1
gaussianSVM = svm.SVC(kernel='rbf', C=1, gamma=σ**(-2))
# fit() wants a 1-D label vector, hence the flattening of the label column.
gaussianSVM.fit(X_train, y_train.ravel())

# Decision-function values for the training set, reshaped to match y_train.
a = np.reshape(gaussianSVM.decision_function(X_train), y_train.shape)
print(a.shape)
In [9]:
# Boolean training predictions: True where the decision value is non-negative.
b = np.greater_equal(a, 0)
In [10]:
# Confusion-matrix counts on the training set, built from boolean masks.
# Comparing `y_train - b` with -1 breaks if the labels are stored as an
# unsigned integer dtype (0 - 1 wraps around instead of giving -1), which
# would silently report zero false positives.
# TODO confirm the dtype loadmat returned for 'y'.
tP = np.sum((y_train == 1) & b)
fP = np.sum((y_train == 0) & b)
fN = np.sum((y_train == 1) & ~b)
# Everything that is neither a false positive nor a false negative is correct.
accuracy = (y_train.shape[0] - fP - fN)/y_train.shape[0]
precision = tP/(tP+fP)
recall = tP/(tP+fN)
print(f"\taccuracy {accuracy:.3f}\n\tPrecision {precision:.3f}\n\tRecall {recall:.3f}")
In [11]:
# Decision-function values for the test set, reshaped to match y_test.
c = np.reshape(gaussianSVM.decision_function(X_test), y_test.shape)
In [12]:
# Show the shape explicitly: a bare `c.shape` that is not the last line of the
# cell is evaluated and discarded. This mirrors the print used for the
# training-set decision values.
print(c.shape)
# Boolean test predictions: True where the decision value is non-negative.
d = c >= 0
In [13]:
# Confusion-matrix counts on the test set, built from boolean masks.
# Comparing `y_test - d` with -1 breaks if the labels are stored as an
# unsigned integer dtype (0 - 1 wraps around instead of giving -1), which
# would silently report zero false positives.
# TODO confirm the dtype loadmat returned for 'ytest'.
tP = np.sum((y_test == 1) & d)
fP = np.sum((y_test == 0) & d)
fN = np.sum((y_test == 1) & ~d)
# Everything that is neither a false positive nor a false negative is correct.
accuracy = (y_test.shape[0] - fP - fN)/y_test.shape[0]
precision = tP/(tP+fP)
recall = tP/(tP+fN)
print(f"\taccuracy {accuracy:.3f}\n\tPrecision {precision:.3f}\n\tRecall {recall:.3f}")
In [ ]: