In [ ]:
# spam.py
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
import random
import os, glob, re
import numpy
print "Imports Done !"
In [ ]:
# Functions for text processing
wordlemmatizer = WordNetLemmatizer()
commonwords = stopwords.words('english')

def NLTK_extractor(email):
    # Bag-of-words feature dict for NLTK classifiers: lowercased, lemmatized,
    # stopwords removed.
    features = {}
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(email)]
    for word in wordtokens:
        if word not in commonwords:
            features[word] = True
    return features

def word_extractor(email):
    # Same preprocessing, but returns a space-joined string for the
    # scikit-learn vectorizers.
    words = ""
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(email)]
    for word in wordtokens:
        if word not in commonwords:
            words += word + " "
    return words

def printScores(scores, nSPAM):
    # precision_recall_fscore_support orders its columns by sorted label name;
    # the support counts (scores[3]) are used here to attach the right label.
    if scores[3][0] == nSPAM:
        label1, label2 = 'Spam', 'Ham'
    else:
        label1, label2 = 'Ham', 'Spam'
    print("Precision %s = %f" % (label1, scores[0][0]))
    print("Precision %s = %f" % (label2, scores[0][1]))
    print("Recall    %s = %f" % (label1, scores[1][0]))
    print("Recall    %s = %f" % (label2, scores[1][1]))
    print("F-Measure %s = %f" % (label1, scores[2][0]))
    print("F-Measure %s = %f" % (label2, scores[2][1]))
In [ ]:
# Import ham/spam emails.
SEED = 448
random.seed(SEED)
hamtexts = []
spamtexts = []
# Read as text, silently dropping any undecodable bytes in the raw emails.
for filename in glob.glob("enron-dataset/small-collection/ham/*.txt"):
    with open(filename, errors='ignore') as find:
        hamtexts.append(find.read())
for filename in glob.glob("enron-dataset/small-collection/spam/*.txt"):
    with open(filename, errors='ignore') as find:
        spamtexts.append(find.read())
emails = [(email, 'spam') for email in spamtexts]
emails += [(email, 'ham') for email in hamtexts]
random.shuffle(emails)
In [ ]:
# Preparing train/test sets: 10% of the shuffled data for training, the
# remaining 90% for testing.
examples = [(word_extractor(email), label) for (email, label) in emails]
size_test = int(len(examples) * 0.9)
train_set, test_set = examples[size_test:], examples[:size_test]
print("train_set size = %d, test_set size = %d" % (len(train_set), len(test_set)))
nspam_train = sum(example[1] == 'spam' for example in train_set)
nham_train = sum(example[1] == 'ham' for example in train_set)
nspam_test = sum(example[1] == 'spam' for example in test_set)
nham_test = sum(example[1] == 'ham' for example in test_set)
print("n_spam train_set = %d, n_ham train_set = %d (%f%% ham)" % (nspam_train, nham_train, 100.0 * nham_train / (nspam_train + nham_train)))
print("n_spam test_set = %d, n_ham test_set = %d (%f%% ham)" % (nspam_test, nham_test, 100.0 * nham_test / (nspam_test + nham_test)))
In [ ]:
# Unigram term counts; binary=False keeps raw counts rather than 0/1 presence.
limitngrams = 1
count_vectorizer = CountVectorizer(ngram_range=(1, limitngrams), binary=False)
x = count_vectorizer.fit_transform([example[0] for example in train_set])
y = numpy.asarray([example[1] for example in train_set])
xt = count_vectorizer.transform([example[0] for example in test_set])
yt = numpy.asarray([example[1] for example in test_set])
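In [ ]:
# Optional inspection (not in the original): vocabulary size and a few terms,
# confirming the vectorizer was fitted on the training set only.
print("Vocabulary size = %d" % len(count_vectorizer.vocabulary_))
print(sorted(count_vectorizer.vocabulary_)[:10])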
In [ ]:
# Multinomial Naive Bayes on the raw term counts.
classifier = MultinomialNB()
#classifier = MultinomialNB(class_prior=[0.5, 0.5])
#classifier = BernoulliNB()
classifier.fit(x, y)
acc = classifier.score(xt, yt)
print("Classification Accuracy NB (TEST) : %f" % acc)
ypred = classifier.predict(xt)
scores = precision_recall_fscore_support(yt, ypred)
ntestSPAM = sum(yt == 'spam')
printScores(scores, ntestSPAM)
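In [ ]:
# A minimal sketch (added): the confusion matrix behind the NB scores above.
# Rows are true labels, columns predictions, in the order given by `labels`.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(yt, ypred, labels=['ham', 'spam']))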
In [ ]:
# Term-frequency normalisation (use_idf=False), then per-feature scaling;
# with_mean=False keeps the matrices sparse.
tf_transformer = TfidfTransformer(use_idf=False).fit(x)
x_train_tfidf = tf_transformer.transform(x)
x_test_tfidf = tf_transformer.transform(xt)
scaler = preprocessing.StandardScaler(with_mean=False).fit(x_train_tfidf)
x_train_tfidf = scaler.transform(x_train_tfidf)
x_test_tfidf = scaler.transform(x_test_tfidf)
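In [ ]:
# Side note (an added variant, not in the original): use_idf=False above gives
# plain term-frequency normalisation. This sketch prepares the idf-weighted
# alternative, with its own scaler, so it could be swapped into the models below.
idf_transformer = TfidfTransformer(use_idf=True).fit(x)
x_train_idf = idf_transformer.transform(x)
x_test_idf = idf_transformer.transform(xt)
idf_scaler = preprocessing.StandardScaler(with_mean=False).fit(x_train_idf)
x_train_idf = idf_scaler.transform(x_train_idf)
x_test_idf = idf_scaler.transform(x_test_idf)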
In [ ]:
model = LogisticRegression()
model = model.fit(x_train_tfidf, y)
acc = model.score(x_test_tfidf, yt)
print("Classification Accuracy LOGIT (TEST) : %f" % acc)
In [ ]:
# Classify a new message end to end: the same extractor, vectorizer, tf and
# scaling transforms as the training pipeline.
text = 'you are the winner !! congrats. your prize is only a one transaction of distance from you'
text_tfidf = scaler.transform(tf_transformer.transform(count_vectorizer.transform([word_extractor(text)])))
print(model.predict(text_tfidf))
print(model.predict_proba(text_tfidf))
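In [ ]:
# Counter-example (an addition): a more ordinary message run through the same
# pipeline; printing classes_ shows which column of predict_proba is which.
text2 = 'please review the attached report before our meeting tomorrow'
text2_tfidf = scaler.transform(tf_transformer.transform(count_vectorizer.transform([word_extractor(text2)])))
print(model.classes_)
print(model.predict(text2_tfidf))
print(model.predict_proba(text2_tfidf))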
In [ ]:
# LDA needs dense input, hence toarray(); fine for a small collection, but
# memory-hungry for large vocabularies.
model = LinearDiscriminantAnalysis()
model = model.fit(x_train_tfidf.toarray(), y)
acc = model.score(x_test_tfidf.toarray(), yt)
print("Classification Accuracy LDA (TEST) : %f" % acc)
In [ ]:
model = QuadraticDiscriminantAnalysis()
model = model.fit(x_train_tfidf.toarray(), y)
acc = model.score(x_test_tfidf.toarray(), yt)
print("Classification Accuracy QDA (TEST) : %f" % acc)