IS620 - Document Classification

Daina Bouquin


In [1]:
# Imports. NOTE(review): nltk is imported but never used in any visible
# cell — confirm it is needed before keeping it.
import nltk
import numpy as np
import pandas as pd
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# pull in the spam dataset — 58 columns including the binary target
# `spamclass` (see the dtypes listing below).
# NOTE(review): assumes spambase.csv sits in the notebook's working directory.
spam = pd.read_csv("spambase.csv")

Summaries


In [2]:
# Summary stats for all 58 columns (wide output is truncated to "..."
# by pandas display options).
spam.describe()


Out[2]:
word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our word_freq_over word_freq_remove word_freq_internet word_freq_order word_freq_mail ... char_freq_; char_freq_( char_freq_[ char_freq_! char_freq_$ char_freq_# capital_run_length_average capital_run_length_longest capital_run_length_total spamclass
count 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 ... 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000 4601.000000
mean 0.104553 0.213015 0.280656 0.065425 0.312223 0.095901 0.114208 0.105295 0.090067 0.239413 ... 0.038575 0.139030 0.016976 0.269071 0.075811 0.044238 5.191515 52.172789 283.289285 0.394045
std 0.305358 1.290575 0.504143 1.395151 0.672513 0.273824 0.391441 0.401071 0.278616 0.644755 ... 0.243471 0.270355 0.109394 0.815672 0.245882 0.429342 31.729449 194.891310 606.347851 0.488698
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.588000 6.000000 35.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.065000 0.000000 0.000000 0.000000 0.000000 2.276000 15.000000 95.000000 0.000000
75% 0.000000 0.000000 0.420000 0.000000 0.380000 0.000000 0.000000 0.000000 0.000000 0.160000 ... 0.000000 0.188000 0.000000 0.315000 0.052000 0.000000 3.706000 43.000000 266.000000 1.000000
max 4.540000 14.280000 5.100000 42.810000 10.000000 5.880000 7.270000 11.110000 5.260000 18.180000 ... 4.385000 9.752000 4.081000 32.478000 6.003000 19.829000 1102.500000 9989.000000 15841.000000 1.000000

8 rows × 58 columns


In [3]:
# Variable types: all features are float64 except the capital-run-length
# counts and the target `spamclass`, which are int64 (see output below).
print(spam.dtypes)


word_freq_make                float64
word_freq_address             float64
word_freq_all                 float64
word_freq_3d                  float64
word_freq_our                 float64
word_freq_over                float64
word_freq_remove              float64
word_freq_internet            float64
word_freq_order               float64
word_freq_mail                float64
word_freq_receive             float64
word_freq_will                float64
word_freq_people              float64
word_freq_report              float64
word_freq_addresses           float64
word_freq_free                float64
word_freq_business            float64
word_freq_email               float64
word_freq_you                 float64
word_freq_credit              float64
word_freq_your                float64
word_freq_font                float64
word_freq_000                 float64
word_freq_money               float64
word_freq_hp                  float64
word_freq_hpl                 float64
word_freq_george              float64
word_freq_650                 float64
word_freq_lab                 float64
word_freq_labs                float64
word_freq_telnet              float64
word_freq_857                 float64
word_freq_data                float64
word_freq_415                 float64
word_freq_85                  float64
word_freq_technology          float64
word_freq_1999                float64
word_freq_parts               float64
word_freq_pm                  float64
word_freq_direct              float64
word_freq_cs                  float64
word_freq_meeting             float64
word_freq_original            float64
word_freq_project             float64
word_freq_re                  float64
word_freq_edu                 float64
word_freq_table               float64
word_freq_conference          float64
char_freq_;                   float64
char_freq_(                   float64
char_freq_[                   float64
char_freq_!                   float64
char_freq_$                   float64
char_freq_#                   float64
capital_run_length_average    float64
capital_run_length_longest      int64
capital_run_length_total        int64
spamclass                       int64
dtype: object

In [5]:
# Count spam and non-spam rows via the binary `spamclass` target.
# Summing a boolean mask counts the True values — avoids materializing a
# filtered copy of the frame just to take its length.
count_spam = int((spam.spamclass == 1).sum())
count_nonspam = int((spam.spamclass == 0).sum())

# print() with a single argument behaves identically under Python 2 and 3,
# unlike the original bare `print` statement (a SyntaxError in Python 3).
print("Spam: %d" % count_spam)
print("Non-spam: %d" % count_nonspam)


Spam: 1813
Non-spam: 2788

Split data into three datasets: training, validation & testing


In [10]:
# Target partition sizes: 70% train / 15% validation / 15% test.
percTrain = 0.7
percVal = 0.15
percTest = 0.15

N = len(spam)
# Truncate the train and validation counts, then give the test split
# whatever remains so the three sizes always sum exactly to N.
trainNum = int(N * percTrain)
valNum = int(N * percVal)
testNum = N - (trainNum + valNum)

In [11]:
# Sanity-check the split targets: the three partitions must cover all rows.
# Parenthesized single-arg print() works the same under Python 2 and 3,
# unlike the original bare `print` statements (SyntaxError in Python 3).
print("Training target: %d" % trainNum)
print("Validation target: %d" % valNum)
print("Testing target: %d" % testNum)
print("Total: %d" % (trainNum + valNum + testNum))


Training target: 3220
Validation target: 690
Testing target: 691
Total: 4601

In [12]:
# Split it up.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in sklearn.model_selection with the
# same signature.
from sklearn.model_selection import train_test_split

# First carve the test set off the full frame, then carve the validation
# set out of the remainder; fixed random_state keeps both splits reproducible.
trainSet, testSet = train_test_split(spam, test_size=testNum, random_state=8)
trainSet, valSet = train_test_split(trainSet, test_size=valNum, random_state=88)

In [30]:
# Confirm the realized split sizes match the targets computed above.
# Parenthesized single-arg print() works the same under Python 2 and 3,
# unlike the original bare `print` statements (SyntaxError in Python 3).
print("Training set: %d" % len(trainSet))
print("Validation set: %d" % len(valSet))
print("Testing set: %d" % len(testSet))
print("Total: %d" % (len(trainSet) + len(valSet) + len(testSet)))


Training set: 3220
Validation set: 690
Testing set: 691
Total: 4601

In [31]:
# Random forest - train
from sklearn import ensemble

# NOTE(review): trainSetVars, trainSetClass, and model_summary() are not
# defined in any visible cell — they presumably come from cells that were
# deleted or executed out of order (execution counts jump from In[12] to
# In[30]). Restart-and-Run-All will fail here; the feature/target split
# and the model_summary helper need to be restored.
rf = ensemble.RandomForestClassifier(criterion="entropy", random_state=88)
rf_fit = rf.fit(trainSetVars, trainSetClass)

# In-sample predictions: the near-perfect scores in the output below
# reflect training-set fit, not generalization.
rf_train = rf_fit.predict(trainSetVars)
model_summary(trainSetClass, rf_train)


True positives: 1248
False positives: 3
True negatives: 1955
False negatives: 14

             precision    recall  f1-score   support

       Spam       1.00      0.99      0.99      1262
   Not spam       0.99      1.00      1.00      1958

avg / total       0.99      0.99      0.99      3220


In [32]:
# Random forest - test set
# Held-out evaluation on the 691-row test split; model_summary (defined
# out of view) prints the confusion counts and classification report
# shown below (0.93 avg precision/recall/f1).
rf_test = rf_fit.predict(testSetVars)
model_summary(testSetClass, rf_test)


True positives: 248
False positives: 16
True negatives: 396
False negatives: 31

             precision    recall  f1-score   support

       Spam       0.94      0.89      0.91       279
   Not spam       0.93      0.96      0.94       412

avg / total       0.93      0.93      0.93       691


In [33]:
# Random forest feature importance (top features by rf_fit.feature_importances_)
# NOTE(review): featImp is defined out of view; the FutureWarning below shows
# it calls the deprecated DataFrame.sort(columns=...) — it should use
# sort_values(by=...) instead.
featImp(rf_fit, testSetVars)


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
Out[33]:
Var Imp
51 char_freq_! 0.133634
52 char_freq_$ 0.110833
55 capital_run_length_longest 0.078106
15 word_freq_free 0.066308
54 capital_run_length_average 0.065711
6 word_freq_remove 0.053635
24 word_freq_hp 0.041705
56 capital_run_length_total 0.036781
25 word_freq_hpl 0.034489
18 word_freq_you 0.033800

Results

The random forest reaches roughly 93% accuracy on the held-out test set ((248 + 396) / 691 ≈ 0.932, matching the 0.93 avg f1-score above), and the most important feature is char_freq_! (the frequency of the "!" character), followed by char_freq_$.