In [3]:
import os
import pickle
import math
import gc
from Parser import *
from Classifier import *
from DictionaryUtils import *
In [4]:
class PickledDataProcessing:
    # Expand the dictionary/frequency tables from a given pickled file,
    # merging every mergeAfter messages so the raw-text buffer stays bounded.
    @staticmethod
    def crawlMails(dictionary, frequency, dump, breakAfter = -1, mergeAfter = 10000):
        i = 0
        j = 0
        data = pickle.load(open(dump, "rb"))
        files = []
        for text in data:
            if i == breakAfter:
                break
            files.append(ParserDictionary.stripHeaders(text).lower())
            i += 1
            j += 1
            if j == mergeAfter:
                dictionary, frequency = DictionaryMI.createDictionary(dictionary, frequency, files)
                files = []
                j = 0
        # Fold in whatever remains in the buffer after the loop.
        dictionary, frequency = DictionaryMI.createDictionary(dictionary, frequency, files)
        return dictionary, frequency

    # Load up to count mails from the given pickled file and return them,
    # parsed with the given parser, as classifier input.
    @staticmethod
    def loadMails(parser, pickledFile, count = -1):
        msgs = []
        ctr = 0
        for i in pickle.load(open(pickledFile, "rb")):
            if ctr == count:
                break
            msgs.append(parser.parseEmail(i))
            ctr += 1
        return msgs

    # Build a dictionary (word list) and frequency table covering all words
    # from the given pickled datasets.
    @staticmethod
    def extractDictionary(pickledData, amount = 2000):
        d, f = [], []
        for x in pickledData:
            d, f = PickledDataProcessing.crawlMails(d, f, x, breakAfter = amount)
        return d, f
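crawlMails folds each chunk of mergeAfter stripped messages into the running dictionary and then drops the buffer, so peak memory is bounded by one chunk rather than the whole corpus. A minimal self-contained sketch of the same pattern, with a collections.Counter standing in for DictionaryMI's dictionary/frequency pair (the Counter and the split-on-whitespace tokenization are illustrative assumptions, not the project's API):
In [ ]:
from collections import Counter

def crawlInChunks(texts, mergeAfter = 10000):
    # Merge every mergeAfter texts into the running word counts, then drop
    # the raw-text buffer so it never grows past one chunk.
    counts = Counter()
    buffer = []
    for text in texts:
        buffer.append(text.lower())
        if len(buffer) == mergeAfter:
            for t in buffer:
                counts.update(t.split())
            buffer = []
    for t in buffer:
        counts.update(t.split())
    return counts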
In [5]:
# Evaluate a classifier on one saved dataset: print the share of messages
# it classifies as positive (spam).
def evaluateDataset(classifier, parser, pickledDataset, amount = -1):
    i = 0
    ctr = 0
    dataset = pickle.load(open(pickledDataset, "rb"))
    for x in dataset:
        if classifier.evaluate(parser.parseEmail(x)) > 0:
            i += 1
        ctr += 1
        if amount == ctr:
            break
    if amount == -1:
        print "Dataset length: " + str(len(dataset))
    else:
        print "Dataset length: " + str(min(len(dataset), amount))
    print str(100.0 * i / ctr) + "% of the given dataset was classified as positive!\n"

# Run the classifier over all saved datasets, collecting garbage between
# runs to stay under the memory limit set below.
def testClassifier(classifier, parser, label = "", amount = -1):
    print "***************" + label + "***************"
    print "Spamassassin HAM"
    evaluateDataset(classifier, parser, "../ham.p", amount)
    gc.collect()
    print "Spamassassin SPAM"
    evaluateDataset(classifier, parser, "../spam.p", amount)
    gc.collect()
    print "Enron HAM"
    evaluateDataset(classifier, parser, "../Enron/ham.p", amount)
    gc.collect()
    print "Enron SPAM"
    evaluateDataset(classifier, parser, "../Enron/spam.p", amount)
    gc.collect()
    print "SPAM dataset"
    #evaluateDataset(classifier, parser, "../SPAM/01.p")
    gc.collect()
    print "Classifier evaluation DONE"
    print "***************" + label + "***************"
In [6]:
import sys
import resource
# Cap the process address space at 5 GB; setrlimit expects integer limits.
soft, hard = 5 * 10**9, 5 * 10**9
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
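With RLIMIT_AS in place, an allocation that would push the process past the cap fails with a MemoryError instead of swapping the machine; a quick stdlib-only check of the active limit and the failure mode:
In [ ]:
print "address-space limit (soft, hard):", resource.getrlimit(resource.RLIMIT_AS)
try:
    block = bytearray(10 * 10**9)  # deliberately larger than the 5 GB cap
except MemoryError:
    print "allocation beyond the cap raises MemoryError, as intended"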
In [10]:
def buildTesting(pHam, pSpam, learningDataset = 500, label = "", dictLength = 250):
    dHam, fHam = PickledDataProcessing.extractDictionary([pHam], learningDataset)
    dSpam, fSpam = PickledDataProcessing.extractDictionary([pSpam], learningDataset)
    print "Frequencies/Dictionaries loaded"
    optimizedDictionary = DictionaryMI.filterDictionary(dSpam, fSpam, dHam, fHam)[0:dictLength]
    parser = ParserDictionary(optimizedDictionary)
    spam = PickledDataProcessing.loadMails(parser, pSpam, learningDataset)
    ham = PickledDataProcessing.loadMails(parser, pHam, learningDataset)
    print "Prepared spam/ham"
    #classifier = LogisticClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    classifier = SVMClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    #testClassifier(classifier, parser, label, 10000)
    return classifier, parser
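filterDictionary presumably ranks candidate words by how strongly their presence separates spam from ham (the MI in DictionaryMI suggests mutual information). A self-contained sketch of such a ranking over raw document lists, with add-one smoothing; the tokenization and smoothing choices here are assumptions, not the project's exact method:
In [ ]:
import math
from collections import Counter

def rankByMutualInformation(spamDocs, hamDocs):
    # Score each word by I(W;C), where W is word presence/absence in a
    # document and C is the document's class, estimated from document counts.
    nSpam, nHam = len(spamDocs), len(hamDocs)
    dfSpam = Counter(w for d in spamDocs for w in set(d.lower().split()))
    dfHam = Counter(w for d in hamDocs for w in set(d.lower().split()))
    scores = {}
    for w in set(dfSpam) | set(dfHam):
        # Smoothed 2x2 contingency table of (present?, class) document counts.
        cells = {(1, 0): dfSpam[w] + 1, (1, 1): dfHam[w] + 1,
                 (0, 0): nSpam - dfSpam[w] + 1, (0, 1): nHam - dfHam[w] + 1}
        total = float(sum(cells.values()))
        pW = {v: (cells[(v, 0)] + cells[(v, 1)]) / total for v in (0, 1)}
        pC = {c: (cells[(0, c)] + cells[(1, c)]) / total for c in (0, 1)}
        scores[w] = sum((cnt / total) * math.log((cnt / total) / (pW[v] * pC[c]))
                        for (v, c), cnt in cells.items())
    return sorted(scores, key=scores.get, reverse=True)
Taking the top dictLength words from such a ranking mirrors the [:dictLength] slice applied to filterDictionary's output above.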
In [30]:
#buildTesting("../ham.p", "../spam.p", 100, "Spamassassin dataset", 250)
#buildTesting("../Enron/ham.p", "../Enron/spam.p", 100, "Enron dataset", 250)
In [13]:
def buildTesting(pHam, pSpam, learningDataset = 500, dictLength = 250):
    def openPickledMails(filename, processFn, count = 500):
        mails = pickle.load(open(filename, "rb"))[:count]
        return map(processFn, mails)
    # createDictionary takes (dictionary, frequency, files); start from empty tables.
    dHam, fHam = DictionaryMI.createDictionary([], [], openPickledMails(pHam, ParserDictionary.stripHeaders, learningDataset))
    dSpam, fSpam = DictionaryMI.createDictionary([], [], openPickledMails(pSpam, ParserDictionary.stripHeaders, learningDataset))
    print "Frequencies/Dictionaries loaded"
    optimizedDictionary = DictionaryMI.filterDictionary(dSpam, fSpam, dHam, fHam)[:dictLength]
    parser = ParserDictionary(optimizedDictionary)
    spam = PickledDataProcessing.loadMails(parser, pSpam, learningDataset)
    ham = PickledDataProcessing.loadMails(parser, pHam, learningDataset)
    print "Prepared spam/ham"
    classifier = LogisticClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    #classifier = SVMClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    testClassifier(classifier, parser, "label", 10000)
    return classifier, parser
In [15]:
classifier, parser = buildTesting("../Enron/ham.p", "../Enron/spam.p")
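The returned pair can then score an individual message; a short sketch, assuming parseEmail accepts raw message text and evaluate returns a positive score for spam, as in evaluateDataset above (the sample text is made up):
In [ ]:
raw = "Subject: limited offer\n\nBuy cheap meds online now"
features = parser.parseEmail(raw)
print "classified as:", ("spam" if classifier.evaluate(features) > 0 else "ham")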
In [ ]: