In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv("./spambase/spambase.data",header=None)

In [3]:
names = ["word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet", "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will", "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free", "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit", "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money", "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650", "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857", "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology", "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct", "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project", "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference", "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$", "char_freq_#", "capital_run_length_average", "capital_run_length_longest", "capital_run_length_total","SPAM"]

In [4]:
data.columns = names

In [5]:
data


Out[5]:
word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our word_freq_over word_freq_remove word_freq_internet word_freq_order word_freq_mail ... char_freq_; char_freq_( char_freq_[ char_freq_! char_freq_$ char_freq_# capital_run_length_average capital_run_length_longest capital_run_length_total SPAM
0 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.778 0.000 0.000 3.756 61 278 1
1 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 ... 0.000 0.132 0.000 0.372 0.180 0.048 5.114 101 1028 1
2 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 ... 0.010 0.143 0.000 0.276 0.184 0.010 9.821 485 2259 1
3 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 ... 0.000 0.137 0.000 0.137 0.000 0.000 3.537 40 191 1
4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 ... 0.000 0.135 0.000 0.135 0.000 0.000 3.537 40 191 1
5 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85 0.00 0.00 ... 0.000 0.223 0.000 0.000 0.000 0.000 3.000 15 54 1
6 0.00 0.00 0.00 0 1.92 0.00 0.00 0.00 0.00 0.64 ... 0.000 0.054 0.000 0.164 0.054 0.000 1.671 4 112 1
7 0.00 0.00 0.00 0 1.88 0.00 0.00 1.88 0.00 0.00 ... 0.000 0.206 0.000 0.000 0.000 0.000 2.450 11 49 1
8 0.15 0.00 0.46 0 0.61 0.00 0.30 0.00 0.92 0.76 ... 0.000 0.271 0.000 0.181 0.203 0.022 9.744 445 1257 1
9 0.06 0.12 0.77 0 0.19 0.32 0.38 0.00 0.06 0.00 ... 0.040 0.030 0.000 0.244 0.081 0.000 1.729 43 749 1
10 0.00 0.00 0.00 0 0.00 0.00 0.96 0.00 0.00 1.92 ... 0.000 0.000 0.000 0.462 0.000 0.000 1.312 6 21 1
11 0.00 0.00 0.25 0 0.38 0.25 0.25 0.00 0.00 0.00 ... 0.022 0.044 0.000 0.663 0.000 0.000 1.243 11 184 1
12 0.00 0.69 0.34 0 0.34 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.056 0.000 0.786 0.000 0.000 3.728 61 261 1
13 0.00 0.00 0.00 0 0.90 0.00 0.90 0.00 0.00 0.90 ... 0.000 0.000 0.000 0.000 0.000 0.000 2.083 7 25 1
14 0.00 0.00 1.42 0 0.71 0.35 0.00 0.35 0.00 0.71 ... 0.000 0.102 0.000 0.357 0.000 0.000 1.971 24 205 1
15 0.00 0.42 0.42 0 1.27 0.00 0.42 0.00 0.00 1.27 ... 0.000 0.063 0.000 0.572 0.063 0.000 5.659 55 249 1
16 0.00 0.00 0.00 0 0.94 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.428 0.000 0.000 4.652 31 107 1
17 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 1.975 0.370 0.000 35.461 95 461 1
18 0.00 0.00 0.55 0 1.11 0.00 0.18 0.00 0.00 0.00 ... 0.000 0.182 0.000 0.455 0.000 0.000 1.320 4 70 1
19 0.00 0.63 0.00 0 1.59 0.31 0.00 0.00 0.31 0.00 ... 0.000 0.275 0.000 0.055 0.496 0.000 3.509 91 186 1
20 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.729 0.000 0.729 0.000 0.000 3.833 9 23 1
21 0.05 0.07 0.10 0 0.76 0.05 0.15 0.02 0.55 0.00 ... 0.042 0.101 0.016 0.250 0.046 0.059 2.569 66 2259 1
22 0.00 0.00 0.00 0 2.94 0.00 0.00 0.00 0.00 0.00 ... 0.404 0.404 0.000 0.809 0.000 0.000 4.857 12 34 1
23 0.00 0.00 0.00 0 1.16 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.133 0.000 0.667 0.000 0.000 1.131 5 69 1
24 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.196 0.000 0.392 0.196 0.000 5.466 22 82 1
25 0.05 0.07 0.10 0 0.76 0.05 0.15 0.02 0.55 0.00 ... 0.042 0.101 0.016 0.250 0.046 0.059 2.565 66 2258 1
26 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.196 0.000 0.392 0.196 0.000 5.466 22 82 1
27 0.00 0.00 0.00 0 0.00 0.00 1.66 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.368 0.000 0.000 2.611 12 47 1
28 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.352 0.000 0.352 0.000 0.000 4.000 11 36 1
29 0.00 0.00 0.00 0 0.65 0.00 0.65 0.00 0.00 0.00 ... 0.000 0.459 0.000 0.091 0.000 0.000 2.687 66 129 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4571 0.00 0.00 0.46 0 0.23 0.23 0.00 0.00 0.00 0.00 ... 0.000 0.082 0.000 0.082 0.000 0.000 1.256 5 98 0
4572 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.254 0.000 0.000 0.000 0.000 1.000 1 13 0
4573 0.00 0.00 0.18 0 0.18 0.18 0.00 0.00 0.00 0.00 ... 0.033 0.033 0.000 0.099 0.000 0.000 1.489 11 137 0
4574 0.29 0.00 0.29 0 0.00 0.00 0.00 0.00 0.00 0.29 ... 0.000 0.107 0.000 0.000 0.000 0.000 1.220 6 61 0
4575 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 1.38 ... 0.000 0.213 0.000 0.000 0.000 0.000 1.720 11 43 0
4576 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.131 0.000 0.000 0.000 0.000 1.488 5 64 0
4577 0.00 0.00 1.20 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.200 3 24 0
4578 0.00 0.00 0.40 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.145 0.000 0.000 0.000 1.372 5 70 0
4579 0.27 0.05 0.10 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.607 0.064 0.036 0.055 0.000 0.202 3.766 43 1789 0
4580 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.571 5 11 0
4581 0.00 0.00 0.00 0 0.00 0.51 0.00 0.00 0.00 0.00 ... 0.000 0.091 0.000 0.091 0.000 0.000 1.586 4 46 0
4582 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.266 3 19 0
4583 0.00 0.00 1.23 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.406 0.000 0.000 0.000 1.666 13 70 0
4584 0.00 0.00 0.45 0 0.00 0.22 0.00 0.00 0.00 0.00 ... 0.000 0.082 0.000 0.041 0.000 0.000 1.500 7 123 0
4585 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.625 0.000 0.000 0.000 0.000 1.375 4 11 0
4586 0.00 0.00 0.00 0 0.36 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.112 0.000 0.000 0.000 0.056 1.793 21 174 0
4587 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.125 0.000 0.000 0.125 0.000 1.272 4 28 0
4588 0.00 0.00 3.03 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.111 2 10 0
4589 0.00 0.00 0.00 0 0.54 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.000 1 22 0
4590 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.185 0.000 0.000 0.000 0.092 2.468 11 79 0
4591 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.000 1 8 0
4592 0.00 0.00 1.25 0 2.50 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.111 0.000 0.000 0.000 0.000 1.285 4 27 0
4593 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 1.052 0.000 0.000 1.000 1 6 0
4594 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.630 0.000 0.000 0.000 0.000 1.727 5 19 0
4595 0.00 0.00 1.19 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.000 0.000 0.000 1.000 1 24 0
4596 0.31 0.00 0.62 0 0.00 0.31 0.00 0.00 0.00 0.00 ... 0.000 0.232 0.000 0.000 0.000 0.000 1.142 3 88 0
4597 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.353 0.000 0.000 1.555 4 14 0
4598 0.30 0.00 0.30 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.102 0.718 0.000 0.000 0.000 0.000 1.404 6 118 0
4599 0.96 0.00 0.00 0 0.32 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.057 0.000 0.000 0.000 0.000 1.147 5 78 0
4600 0.00 0.00 0.65 0 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.000 0.000 0.000 0.125 0.000 0.000 1.250 5 40 0

4601 rows × 58 columns

We will first split the data into training and testing sets using an 80/20 split.


In [6]:
# With test_size=0.8, train_test_split returns (20% split, 80% split);
# the larger 80% split is used below for training.
spamTesting, spamTrain = train_test_split(
    data, test_size=0.8, random_state=5)

In [7]:
len(spamTesting)


Out[7]:
920

In [8]:
len(spamTrain)


Out[8]:
3681

Random Forests


In [9]:
RndForClf = RandomForestClassifier(n_jobs=2, n_estimators=100, max_features="sqrt", random_state=620)

In [10]:
RndForPred = RndForClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [11]:
accuracy_score(spamTesting[names[-1]], RndForPred)


Out[11]:
0.94891304347826089

In [12]:
print(classification_report(spamTesting[names[-1]], RndForPred,target_names=['Not Spam','Spam']))


             precision    recall  f1-score   support

   Not Spam       0.96      0.96      0.96       580
       Spam       0.94      0.92      0.93       340

avg / total       0.95      0.95      0.95       920

An accuracy of roughly 95%, with precision, recall, and F1-score all at 0.95, is quite high, but how does it compare to other methods?
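
Before moving on, it helps to see where those errors fall. The sketch below (assuming the RndForPred predictions above) prints the confusion matrix, whose off-diagonal entries are the misclassified messages.


In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes (Not Spam, Spam), columns the predicted classes;
# the off-diagonal cells count the misclassified messages.
print(confusion_matrix(spamTesting[names[-1]], RndForPred))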

AdaBoost


In [13]:
AdaBooClf = AdaBoostClassifier()

In [14]:
AdaBooPred = AdaBooClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [17]:
accuracy_score(spamTesting[names[-1]], AdaBooPred)


Out[17]:
0.93260869565217386

In [18]:
print(classification_report(spamTesting[names[-1]], AdaBooPred,target_names=['Not Spam','Spam']))


             precision    recall  f1-score   support

   Not Spam       0.96      0.93      0.95       580
       Spam       0.89      0.93      0.91       340

avg / total       0.93      0.93      0.93       920

The scores are only slightly worse than those of the Random Forest.
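
The AdaBoost defaults were used here; one way the gap might be narrowed (a sketch only, with an illustrative parameter grid that is not part of the original analysis) is a small grid search over the number of estimators and the learning rate.


In [ ]:
from sklearn.model_selection import GridSearchCV

# Illustrative grid -- these values were not tuned in the original analysis.
AdaGrid = GridSearchCV(AdaBoostClassifier(random_state=620),
                       param_grid={'n_estimators': [50, 100, 200],
                                   'learning_rate': [0.5, 1.0]},
                       cv=5)
AdaGrid.fit(spamTrain[names[0:57]], spamTrain[names[-1]])
AdaGrid.best_params_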

K-nearest neighbors


In [19]:
KNNClf = KNeighborsClassifier(7)

In [20]:
KNNPred = KNNClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [21]:
accuracy_score(spamTesting[names[-1]], KNNPred)


Out[21]:
0.80869565217391304

In [22]:
print(classification_report(spamTesting[names[-1]], KNNPred,target_names=['Not Spam','Spam']))


             precision    recall  f1-score   support

   Not Spam       0.86      0.83      0.85       580
       Spam       0.73      0.76      0.75       340

avg / total       0.81      0.81      0.81       920

Very poor performance compared to the previous two. Also, so far the methods seem better at identifying non-spam than spam, as shown by the lower precision, recall, and F1 scores for the Spam class.
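
One likely culprit is feature scale: k-nearest neighbors works on raw Euclidean distances, and columns such as capital_run_length_total are orders of magnitude larger than the word frequencies. A minimal sketch (same split, standardized features) would be:


In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean / unit variance before computing distances.
KNNScaledClf = make_pipeline(StandardScaler(), KNeighborsClassifier(7))
KNNScaledClf.fit(spamTrain[names[0:57]], spamTrain[names[-1]])
accuracy_score(spamTesting[names[-1]], KNNScaledClf.predict(spamTesting[names[0:57]]))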

Support Vector Machines


In [23]:
#RBF
SVMRBFClf = SVC()

In [24]:
SVMRBFPred = SVMRBFClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [25]:
accuracy_score(spamTesting[names[-1]], SVMRBFPred)


Out[25]:
0.83369565217391306

In [26]:
print(classification_report(spamTesting[names[-1]], SVMRBFPred,target_names=['Not Spam','Spam']))


             precision    recall  f1-score   support

   Not Spam       0.90      0.83      0.86       580
       Spam       0.74      0.84      0.79       340

avg / total       0.84      0.83      0.84       920

Not great performance either.
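
As with KNN, the RBF kernel works on distances, so the unscaled features are probably part of the problem here too. The same standardization trick can be sketched for the SVC (again a sketch, not part of the original run):


In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

SVMScaledClf = make_pipeline(StandardScaler(), SVC())
SVMScaledClf.fit(spamTrain[names[0:57]], spamTrain[names[-1]])
accuracy_score(spamTesting[names[-1]], SVMScaledClf.predict(spamTesting[names[0:57]]))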

Naive Bayes


In [27]:
GaussClf = GaussianNB()

In [28]:
GaussPred = GaussClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [29]:
accuracy_score(spamTesting[names[-1]], GaussPred)


Out[29]:
0.81630434782608696

In [30]:
print(classification_report(spamTesting[names[-1]], GaussPred,target_names=['Not Spam','Spam']))


             precision    recall  f1-score   support

   Not Spam       0.98      0.73      0.83       580
       Spam       0.67      0.97      0.80       340

avg / total       0.87      0.82      0.82       920

Also not great overall performance, although Naive Bayes has the highest Spam recall so far (0.97), at the cost of much lower Spam precision.
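
GaussianNB assumes each feature is normally distributed within a class, which is a poor match for these skewed, non-negative frequency columns. A variant worth trying (a sketch only) is MultinomialNB, which is designed for frequency-style features:


In [ ]:
from sklearn.naive_bayes import MultinomialNB

MultiNBClf = MultinomialNB()
MultiNBClf.fit(spamTrain[names[0:57]], spamTrain[names[-1]])
accuracy_score(spamTesting[names[-1]], MultiNBClf.predict(spamTesting[names[0:57]]))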

Decision Trees


In [31]:
DecTreeClf = DecisionTreeClassifier(random_state=620)

In [32]:
DecTreePred = DecTreeClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [33]:
accuracy_score(spamTesting[names[-1]], DecTreePred)


Out[33]:
0.90434782608695652

In [34]:
print(classification_report(spamTesting[names[-1]], DecTreePred, target_names=['Not Spam','Spam']))



The Decision Tree is decent, though not as good as the Random Forest.
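
All of these comparisons rest on a single 80/20 split. A more robust comparison (sketched below for the Decision Tree, but applicable to any of the classifiers) would cross-validate on the full dataset:


In [ ]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over 5 folds of the full dataset.
cross_val_score(DecTreeClf, data[names[0:57]], data[names[-1]], cv=5).mean()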

So the Random Forest is the best model for spam detection, but most models seem to have relatively poor precision on the Spam class and relatively poor recall on the Not Spam class.
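
One lever for shifting that balance (a sketch, assuming the fitted RndForClf above; the 0.7 threshold is arbitrary) is to classify on the predicted Spam probability with a stricter cutoff than the default 0.5, trading some Spam recall for Spam precision.


In [ ]:
# Column 1 of predict_proba is the estimated probability of the Spam class.
spamProb = RndForClf.predict_proba(spamTesting[names[0:57]])[:, 1]
# A higher threshold means fewer Not Spam messages get flagged as Spam.
print(classification_report(spamTesting[names[-1]], (spamProb >= 0.7).astype(int),
                            target_names=['Not Spam','Spam']))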

We can combine all of the models into a single voting classifier to try to increase the accuracy.


In [38]:
VotingHardClf = VotingClassifier(
    estimators=[('RF', RndForClf), ('Ada', AdaBooClf), ('KNN', KNNClf),
                ('SVMRBF', SVMRBFClf), ('NB', GaussClf), ('DecTree', DecTreeClf)],
    voting='hard')

In [39]:
VotingHardPred = VotingHardClf.fit(spamTrain[names[0:57]],spamTrain[names[-1]]).predict(spamTesting[names[0:57]])

In [40]:
accuracy_score(spamTesting[names[-1]], VotingHardPred)


Out[40]:
0.94239130434782614

In [41]:
print(classification_report(spamTesting[names[-1]], VotingHardPred,target_names=['Not Spam','Spam']))


             precision    recall  f1-score   support

   Not Spam       0.95      0.96      0.95       580
       Spam       0.93      0.92      0.92       340

avg / total       0.94      0.94      0.94       920
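
A soft-voting variant could also be tried (a sketch only: it averages the predicted class probabilities rather than counting votes, and the SVC needs probability=True so it can supply them).


In [ ]:
VotingSoftClf = VotingClassifier(
    estimators=[('RF', RndForClf), ('Ada', AdaBooClf), ('KNN', KNNClf),
                ('SVMRBF', SVC(probability=True)), ('NB', GaussClf),
                ('DecTree', DecTreeClf)],
    voting='soft')
VotingSoftClf.fit(spamTrain[names[0:57]], spamTrain[names[-1]])
accuracy_score(spamTesting[names[-1]], VotingSoftClf.predict(spamTesting[names[0:57]]))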


In [ ]: