Spam Classifier


In [ ]:
# spam.py
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing

import random
import glob
import numpy
print "Imports Done !"

In [ ]:
# Functions for text processing
wordlemmatizer = WordNetLemmatizer()
commonwords = stopwords.words('english')

def NLTK_extractor(email):
    """Bag-of-words feature dict for nltk classifiers: {lemma: True} per non-stopword token."""
    features = {}
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(email)]
    for word in wordtokens:
        if word not in commonwords:
            features[word] = True
    return features

def word_extractor(email):
    """Rebuild the email as one space-joined string of lowercased, lemmatized, non-stopword tokens."""
    words = ""
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(email)]
    for word in wordtokens:
        if word not in commonwords:
            words += word + " "
    return words

def printScores(scores, nSPAM):
    # precision_recall_fscore_support returns (precision, recall, f-score, support)
    # with classes in sorted label order; use the support counts to work out
    # which column is spam (assumes the two classes have different test counts).
    if scores[3][0] == nSPAM:
        label1, label2 = 'Spam', 'Ham'
    else:
        label1, label2 = 'Ham', 'Spam'

    print("Precision %s = %f" % (label1, scores[0][0]))
    print("Precision %s = %f" % (label2, scores[0][1]))
    print("Recall %s = %f" % (label1, scores[1][0]))
    print("Recall %s = %f" % (label2, scores[1][1]))
    print("F-Measure %s = %f" % (label1, scores[2][0]))
    print("F-Measure %s = %f" % (label2, scores[2][1]))

In [ ]:
# Import ham/spam emails.

SEED = 448
random.seed(SEED)
hamtexts = []
spamtexts = []

for filename in glob.glob("enron-dataset/small-collection/ham/*.txt"):
    find = open(filename)
    hamtexts.append(find.read())
    find.close()
    
for filename in glob.glob("enron-dataset/small-collection/spam/*.txt"):
    find = open(filename)
    spamtexts.append(find.read())
    find.close()	

emails = [(email,'spam') for email in spamtexts]
emails += [(email,'ham') for email in hamtexts]
random.shuffle(emails)

In [ ]:
# Preparing train/test sets.
# Note the split: 10% of the shuffled examples train, the remaining 90% test.
examples = [(word_extractor(email), label) for (email, label) in emails]
size_test = int(len(examples) * 0.9)
train_set, test_set = examples[size_test:], examples[:size_test]
print("train_set size = %d, test_set size = %d" % (len(train_set), len(test_set)))
nspam_train = sum([example[1] == 'spam' for example in train_set])
nham_train = sum([example[1] == 'ham' for example in train_set])
nspam_test = sum([example[1] == 'spam' for example in test_set])
nham_test = sum([example[1] == 'ham' for example in test_set])
print("n_spam train_set = %d, n_ham train_set = %d (%f%%)" % (nspam_train, nham_train, 100.0 * nham_train / (nspam_train + nham_train)))
print("n_spam test_set = %d, n_ham test_set = %d  (%f%%)" % (nspam_test, nham_test, 100.0 * nham_test / (nspam_test + nham_test)))

In [ ]:
limitngrams = 1
# binary must be the boolean False; the original string 'False' is truthy,
# which silently turned on binary (0/1) counts.
count_vectorizer = CountVectorizer(ngram_range=(1, limitngrams), binary=False)
x = count_vectorizer.fit_transform([example[0] for example in train_set])
y = numpy.asarray([example[1] for example in train_set])

xt = count_vectorizer.transform([example[0] for example in test_set])
yt = numpy.asarray([example[1] for example in test_set])
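
A quick look at the fitted vectorizer helps confirm the feature space is what you expect; vocabulary_ maps each term to its column index:

In [ ]:
# The vocabulary size is the feature dimension of x and xt.
print("vocabulary size = %d" % len(count_vectorizer.vocabulary_))
print("train matrix: %s, test matrix: %s" % (x.shape, xt.shape))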

In [ ]:
classifier = MultinomialNB()
#classifier = MultinomialNB(class_prior=[0.5,0.5])
#classifier = BernoulliNB()

classifier.fit(x, y)

acc = classifier.score(xt, yt)
print("Classification Accuracy NB (TEST) : %f" % acc)
ypred = classifier.predict(xt)
scores = precision_recall_fscore_support(yt, ypred)
ntestSPAM = sum(yt == 'spam')
printScores(scores, ntestSPAM)
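
Precision and recall per class can also be read off a confusion matrix, which makes the two error types (ham flagged as spam vs. spam let through) explicit:

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predictions, in the order given by labels=.
print(confusion_matrix(yt, ypred, labels=['ham', 'spam']))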

In [ ]:
# Term-frequency normalization only (use_idf=False disables IDF weighting),
# followed by feature-wise scaling; with_mean=False keeps the matrices sparse.
tf_transformer = TfidfTransformer(use_idf=False).fit(x)
x_train_tfidf = tf_transformer.transform(x)
x_test_tfidf = tf_transformer.transform(xt)
scaler = preprocessing.StandardScaler(with_mean=False).fit(x_train_tfidf)

x_train_tfidf = scaler.transform(x_train_tfidf)
x_test_tfidf = scaler.transform(x_test_tfidf)
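
The three fitted steps (counts, term-frequency normalization, scaling) can also be bundled so new text is guaranteed to pass through exactly the same transformations; a sketch using sklearn's Pipeline, as an alternative packaging rather than what the cells below use:

In [ ]:
from sklearn.pipeline import Pipeline

# One object that replays count-vectorizing, TF normalization, and scaling.
pipe = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, limitngrams), binary=False)),
    ('tf', TfidfTransformer(use_idf=False)),
    ('scale', preprocessing.StandardScaler(with_mean=False)),
])
x_train_alt = pipe.fit_transform([example[0] for example in train_set])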

In [ ]:
model = LogisticRegression()
model = model.fit(x_train_tfidf, y)
acc = model.score(x_test_tfidf, yt)
print("Classification Accuracy LOGIT (TEST) : %f" % acc)

In [ ]:
text = 'you are the winner !! congrats. your prize is only a one transaction of distance from you'
# New text must go through the same chain: preprocess, count, TF-normalize, scale.
text_tfidf = scaler.transform(tf_transformer.transform(count_vectorizer.transform([word_extractor(text)])))

print(model.predict(text_tfidf))
print(model.predict_proba(text_tfidf))
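
predict_proba returns one column per class, ordered by model.classes_, so printing that mapping removes any ambiguity about which probability is spam:

In [ ]:
# Column i of predict_proba corresponds to model.classes_[i].
for label, p in zip(model.classes_, model.predict_proba(text_tfidf)[0]):
    print("P(%s) = %f" % (label, p))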

In [ ]:
# LDA needs dense input, hence .toarray(); this can be memory-hungry for
# large vocabularies.
model = LinearDiscriminantAnalysis()
model = model.fit(x_train_tfidf.toarray(), y)
acc = model.score(x_test_tfidf.toarray(), yt)
print("Classification Accuracy LDA (TEST) : %f" % acc)

In [ ]:
model = QuadraticDiscriminantAnalysis()
model = model.fit(x_train_tfidf.toarray(), y)
acc = model.score(x_test_tfidf.toarray(), yt)
print("Classification Accuracy QDA (TEST) : %f" % acc)