In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
In [2]:
data = pd.read_csv("./spambase/spambase.data",header=None)
In [3]:
names = ["word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d",
         "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet",
         "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will",
         "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free",
         "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
         "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money",
         "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650",
         "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857",
         "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology",
         "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct",
         "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project",
         "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference",
         "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!",
         "char_freq_$", "char_freq_#", "capital_run_length_average",
         "capital_run_length_longest", "capital_run_length_total", "SPAM"]
In [4]:
data.columns = names
In [5]:
data
Out[5]:
We will first split the data into training and testing sets with an 80/20 split
In [6]:
# 80% train / 20% test
spamTrain, spamTesting = train_test_split(
    data, test_size=0.2, random_state=5)
In [7]:
len(spamTesting)
Out[7]:
In [8]:
len(spamTrain)
Out[8]:
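Since `train_test_split` is not stratifying by the label here, it is worth a quick check that both splits carry a similar spam ratio. A minimal sanity check of our own (not part of the original run), using the `spamTrain` and `spamTesting` variables above:
In [ ]:
# Share of spam (1) vs. non-spam (0) in each split; the two proportions
# should be close to each other and to the full dataset.
print(spamTrain["SPAM"].value_counts(normalize=True))
print(spamTesting["SPAM"].value_counts(normalize=True))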
Random Forests
In [9]:
RndForClf = RandomForestClassifier(n_jobs=2, n_estimators=100, max_features="sqrt", random_state=620)
In [10]:
RndForPred = RndForClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [11]:
accuracy_score(spamTesting[names[-1]], RndForPred)
Out[11]:
In [12]:
print(classification_report(spamTesting[names[-1]], RndForPred,target_names=['Not Spam','Spam']))
A 94% accuracy with roughly 95% precision, recall, and F1-score is quite high, but how does it compare to other methods?
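Each of the following sections repeats the same fit/predict/report pattern. As an aside, that boilerplate could be factored into a helper; this is a sketch of our own (the `evaluate`, `features`, and `label` names are ours, not part of the original notebook):
In [ ]:
# Hypothetical helper: fit a classifier on the training split, predict on
# the test split, and print the same metrics used throughout.
features, label = names[0:57], names[-1]

def evaluate(clf):
    pred = clf.fit(spamTrain[features], spamTrain[label]).predict(spamTesting[features])
    print(accuracy_score(spamTesting[label], pred))
    print(classification_report(spamTesting[label], pred, target_names=['Not Spam', 'Spam']))
    return pred
With this, something like `evaluate(AdaBoostClassifier())` would reproduce the fit, accuracy, and report cells of the next section in one call.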
AdaBoost
In [13]:
AdaBooClf = AdaBoostClassifier()
In [14]:
AdaBooPred = AdaBooClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [17]:
accuracy_score(spamTesting[names[-1]], AdaBooPred)
Out[17]:
In [18]:
print(classification_report(spamTesting[names[-1]], AdaBooPred,target_names=['Not Spam','Spam']))
The scores are just slightly worse than the Random Forest's.
K-nearest neighbors
In [19]:
KNNClf = KNeighborsClassifier(n_neighbors=7)
In [20]:
KNNPred = KNNClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [21]:
accuracy_score(spamTesting[names[-1]], KNNPred)
Out[21]:
In [22]:
print(classification_report(spamTesting[names[-1]], KNNPred,target_names=['Not Spam','Spam']))
Very poor performance compared to the previous two. So far the methods also identify non-spam better than spam, as shown by the lower precision, recall, and F1 scores for the spam class.
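One likely culprit for the weak KNN result is feature scale: KNN is distance-based, and the `capital_run_length_*` columns are orders of magnitude larger than the word frequencies, so they dominate the distance metric. A hedged sketch of standardizing first (the pipeline and the `ScaledKNN*` names are ours, not part of the original analysis); the same idea often helps the RBF SVM tried next:
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean / unit variance before KNN.
ScaledKNNClf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=7))
ScaledKNNPred = ScaledKNNClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
accuracy_score(spamTesting[names[-1]], ScaledKNNPred)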
Support Vector Machines
In [23]:
# SVC defaults to the RBF kernel
SVMRBFClf = SVC()
In [24]:
SVMRBFPred = SVMRBFClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [25]:
accuracy_score(spamTesting[names[-1]], SVMRBFPred)
Out[25]:
In [26]:
print(classification_report(spamTesting[names[-1]], SVMRBFPred,target_names=['Not Spam','Spam']))
Not great performance either.
Naive Bayes
In [27]:
GaussClf = GaussianNB()
In [28]:
GaussPred = GaussClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [29]:
accuracy_score(spamTesting[names[-1]], GaussPred)
Out[29]:
In [30]:
print(classification_report(spamTesting[names[-1]], GaussPred,target_names=['Not Spam','Spam']))
Naive Bayes does not perform well either.
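GaussianNB assumes each feature is normally distributed within a class, which these skewed, zero-heavy frequency features are not. One alternative worth sketching (not run in the original notebook; the `MultiNB*` names are ours) is `MultinomialNB`, which is designed for non-negative count/frequency data:
In [ ]:
from sklearn.naive_bayes import MultinomialNB

# All spambase features are non-negative, as MultinomialNB requires.
MultiNBClf = MultinomialNB()
MultiNBPred = MultiNBClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
accuracy_score(spamTesting[names[-1]], MultiNBPred)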
Decision Trees
In [31]:
DecTreeClf = DecisionTreeClassifier(random_state=620)
In [32]:
DecTreePred = DecTreeClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [33]:
accuracy_score(spamTesting[names[-1]], DecTreePred)
Out[33]:
In [34]:
print(classification_report(spamTesting[names[-1]], DecTreePred,target_names=['Not Spam','Spam']))
The decision tree is decent, though not as good as the Random Forest.
So the Random Forest is the best model for spam detection, but most models seem to have poor precision on spam and poor recall on non-spam.
We can combine all the models into one voting model to try to increase the accuracy.
In [38]:
VotingHardClf = VotingClassifier(
    estimators=[('RF', RndForClf), ('Ada', AdaBooClf), ('KNN', KNNClf),
                ('SVMRBF', SVMRBFClf), ('NB', GaussClf), ('DecTree', DecTreeClf)],
    voting='hard')
In [39]:
VotingHardPred = VotingHardClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
In [40]:
accuracy_score(spamTesting[names[-1]], VotingHardPred)
Out[40]:
In [41]:
print(classification_report(spamTesting[names[-1]], VotingHardPred,target_names=['Not Spam','Spam']))
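With `voting='hard'`, each classifier casts one vote for a class. A natural companion to sketch (not run here; the `VotingSoft*` names are ours) is `voting='soft'`, which averages predicted class probabilities instead; note that `SVC` must be built with `probability=True` so it exposes `predict_proba`:
In [ ]:
VotingSoftClf = VotingClassifier(
    estimators=[('RF', RndForClf), ('Ada', AdaBooClf), ('KNN', KNNClf),
                ('SVMRBF', SVC(probability=True)), ('NB', GaussClf),
                ('DecTree', DecTreeClf)],
    voting='soft')
VotingSoftPred = VotingSoftClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])
accuracy_score(spamTesting[names[-1]], VotingSoftPred)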