IS620 - Document Classification

Daina Bouquin


In [1]:
# Imports. NOTE(review): nltk is imported but never used in any visible
# cell — confirm it is needed before keeping it.
import nltk
import numpy as np
import pandas as pd
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# pull in the spam dataset — 58 columns including the binary target
# `spamclass` (see the dtypes listing below).
# NOTE(review): assumes spambase.csv sits in the notebook's working directory.
spam = pd.read_csv("spambase.csv")

Summaries


In [2]:
# Summary stats for all 58 columns (wide output is truncated to "..."
# by pandas display options).
spam.describe()


Out[2]:
word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our word_freq_over word_freq_remove word_freq_internet word_freq_order word_freq_mail ... char_freq_; char_freq_( char_freq_[ char_freq_! char_freq_$ char_freq_# capital_run_length_average capital_run_length_longest capital_run_length_total spamclass
count 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 ... 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000
mean 0.104553 0.213015 0.280656 0.065425 0.312223 0.095901 0.114208 0.105295 0.090067 0.239413 ... 0.038575 0.139030 0.016976 0.269071 0.075811 0.044238 5.191515 52.172789 283.289285 0.394045
std 0.305358 1.290575 0.504143 1.395151 0.672513 0.273824 0.391441 0.401071 0.278616 0.644755 ... 0.243471 0.270355 0.109394 0.815672 0.245882 0.429342 31.729449 194.891310 606.347851 0.488698
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.588000 6.000000 35.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.065000 0.000000 0.000000 0.000000 0.000000 2.276000 15.000000 95.000000 0.000000
75% 0.000000 0.000000 0.420000 0.000000 0.380000 0.000000 0.000000 0.000000 0.000000 0.160000 ... 0.000000 0.188000 0.000000 0.315000 0.052000 0.000000 3.706000 43.000000 266.000000 1.000000
max 4.540000 14.280000 5.100000 42.810000 10.000000 5.880000 7.270000 11.110000 5.260000 18.180000 ... 4.385000 9.752000 4.081000 32.478000 6.003000 19.829000 1102.500000 9989.000000 15841.000000 1.000000

8 rows × 58 columns


In [3]:
# Variable types: all features are float64 except the capital-run-length
# counts and the target `spamclass`, which are int64 (see output below).
print(spam.dtypes)


word_freq_make                float64
word_freq_address             float64
word_freq_all                 float64
word_freq_3d                  float64
word_freq_our                 float64
word_freq_over                float64
word_freq_remove              float64
word_freq_internet            float64
word_freq_order               float64
word_freq_mail                float64
word_freq_receive             float64
word_freq_will                float64
word_freq_people              float64
word_freq_report              float64
word_freq_addresses           float64
word_freq_free                float64
word_freq_business            float64
word_freq_email               float64
word_freq_you                 float64
word_freq_credit              float64
word_freq_your                float64
word_freq_font                float64
word_freq_000                 float64
word_freq_money               float64
word_freq_hp                  float64
word_freq_hpl                 float64
word_freq_george              float64
word_freq_650                 float64
word_freq_lab                 float64
word_freq_labs                float64
word_freq_telnet              float64
word_freq_857                 float64
word_freq_data                float64
word_freq_415                 float64
word_freq_85                  float64
word_freq_technology          float64
word_freq_1999                float64
word_freq_parts               float64
word_freq_pm                  float64
word_freq_direct              float64
word_freq_cs                  float64
word_freq_meeting             float64
word_freq_original            float64
word_freq_project             float64
word_freq_re                  float64
word_freq_edu                 float64
word_freq_table               float64
word_freq_conference          float64
char_freq_;                   float64
char_freq_(                   float64
char_freq_[                   float64
char_freq_!                   float64
char_freq_$                   float64
char_freq_#                   float64
capital_run_length_average    float64
capital_run_length_longest      int64
capital_run_length_total        int64
spamclass                       int64
dtype: object

In [5]:
# Count spam and non-spam rows via the binary `spamclass` target.
# Summing a boolean mask counts the True values — avoids materializing a
# filtered copy of the frame just to take its length.
count_spam = int((spam.spamclass == 1).sum())
count_nonspam = int((spam.spamclass == 0).sum())

# print() with a single argument behaves identically under Python 2 and 3,
# unlike the original bare `print` statement (a SyntaxError in Python 3).
print("Spam: %d" % count_spam)
print("Non-spam: %d" % count_nonspam)


Spam: 1813
Non-spam: 2788

Split data into three datasets: training, validation & testing


In [10]:
# Target partition sizes: 70% train / 15% validation / 15% test.
percTrain = 0.7
percVal = 0.15
percTest = 0.15

N = len(spam)
# Truncate the train and validation counts, then give the test split
# whatever remains so the three sizes always sum exactly to N.
trainNum = int(N * percTrain)
valNum = int(N * percVal)
testNum = N - (trainNum + valNum)

In [11]:
# Sanity-check the split targets: the three partitions must cover all rows.
# Parenthesized single-arg print() works the same under Python 2 and 3,
# unlike the original bare `print` statements (SyntaxError in Python 3).
print("Training target: %d" % trainNum)
print("Validation target: %d" % valNum)
print("Testing target: %d" % testNum)
print("Total: %d" % (trainNum + valNum + testNum))


Training target: 3220
Validation target: 690
Testing target: 691
Total: 4601

In [12]:
# Split it up.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in sklearn.model_selection with the
# same signature.
from sklearn.model_selection import train_test_split

# First carve the test set off the full frame, then carve the validation
# set out of the remainder; fixed random_state keeps both splits reproducible.
trainSet, testSet = train_test_split(spam, test_size=testNum, random_state=8)
trainSet, valSet = train_test_split(trainSet, test_size=valNum, random_state=88)

In [30]:
# Confirm the realized split sizes match the targets computed above.
# Parenthesized single-arg print() works the same under Python 2 and 3,
# unlike the original bare `print` statements (SyntaxError in Python 3).
print("Training set: %d" % len(trainSet))
print("Validation set: %d" % len(valSet))
print("Testing set: %d" % len(testSet))
print("Total: %d" % (len(trainSet) + len(valSet) + len(testSet)))


Training set: 3220
Validation set: 690
Testing set: 691
Total: 4601

In [31]:
# Random forest - train
from sklearn import ensemble

# NOTE(review): trainSetVars, trainSetClass, and model_summary() are not
# defined in any visible cell — they presumably come from cells that were
# deleted or executed out of order (execution counts jump from In[12] to
# In[30]). Restart-and-Run-All will fail here; the feature/target split
# and the model_summary helper need to be restored.
rf = ensemble.RandomForestClassifier(criterion="entropy", random_state=88)
rf_fit = rf.fit(trainSetVars, trainSetClass)

# In-sample predictions: the near-perfect scores in the output below
# reflect training-set fit, not generalization.
rf_train = rf_fit.predict(trainSetVars)
model_summary(trainSetClass, rf_train)


True positives: 1248
False positives: 3
True negatives: 1955
False negatives: 14

             precision    recall  f1-score   support

       Spam       1.00      0.99      0.99      1262
   Not spam       0.99      1.00      1.00      1958

avg / total       0.99      0.99      0.99      3220


In [32]:
# Random forest - test set
# Held-out evaluation on the 691-row test split; model_summary (defined
# out of view) prints the confusion counts and classification report
# shown below (0.93 avg precision/recall/f1).
rf_test = rf_fit.predict(testSetVars)
model_summary(testSetClass, rf_test)


True positives: 248
False positives: 16
True negatives: 396
False negatives: 31

             precision    recall  f1-score   support

       Spam       0.94      0.89      0.91       279
   Not spam       0.93      0.96      0.94       412

avg / total       0.93      0.93      0.93       691


In [33]:
# Random forest feature importance (top features by rf_fit.feature_importances_)
# NOTE(review): featImp is defined out of view; the FutureWarning below shows
# it calls the deprecated DataFrame.sort(columns=...) — it should use
# sort_values(by=...) instead.
featImp(rf_fit, testSetVars)


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
Out[33]:
Var Imp
51 char_freq_! 0.133634
52 char_freq_$ 0.110833
55 capital_run_length_longest 0.078106
15 word_freq_free 0.066308
54 capital_run_length_average 0.065711
6 word_freq_remove 0.053635
24 word_freq_hp 0.041705
56 capital_run_length_total 0.036781
25 word_freq_hpl 0.034489
18 word_freq_you 0.033800

Results

The random forest reaches roughly 93% accuracy on the held-out test set ((248 + 396) / 691 ≈ 0.932, matching the 0.93 avg f1-score above), and the most important feature is char_freq_! (the frequency of the "!" character), followed by char_freq_$.