In [3]:
import os
import pickle
import math
import gc

from Parser import *
from Classifier import *
from DictionaryUtils import *

In [4]:
class PickledDataProcessing:
    """Helpers for turning pickled e-mail corpora into classifier inputs."""

    @staticmethod
    def crawlMails(dictionary, frequency, dump, breakAfter = -1, mergeAfter = 10000):
        """Expand (dictionary, frequency) from the pickled corpus at `dump`.

        Each message is header-stripped and lower-cased, then merged into
        the dictionary in batches of `mergeAfter` to bound memory use.
        `breakAfter` caps how many messages are processed (-1 = all).
        Returns the updated (dictionary, frequency) pair.
        """
        # SECURITY: pickle.load can execute arbitrary code -- only feed
        # this trusted corpus files.
        with open(dump, "rb") as fh:  # original leaked the file handle
            data = pickle.load(fh)
        files = []
        processed = 0
        for text in data:
            # The original broke only *after* appending, so it consumed
            # breakAfter+1 messages; stop before exceeding the cap.
            if breakAfter != -1 and processed >= breakAfter:
                break
            files.append(ParserDictionary.stripHeaders(text).lower())
            processed += 1
            if len(files) >= mergeAfter:
                # Merge a full batch and drop it to keep memory bounded
                # (original merged every mergeAfter+1 due to an off-by-one).
                dictionary, frequency = DictionaryMI.createDictionary(dictionary, frequency, files)
                files = []
        # Merge whatever is left over (possibly an empty batch).
        dictionary, frequency = DictionaryMI.createDictionary(dictionary, frequency, files)
        return dictionary, frequency

    @staticmethod
    def loadMails(parser, pickledFile, count = -1):
        """Load up to `count` mails (-1 = all) from `pickledFile`, parse
        each with `parser.parseEmail`, and return them as a list."""
        msgs = []
        with open(pickledFile, "rb") as fh:  # original leaked the file handle
            data = pickle.load(fh)
        for raw in data:
            # The original counter logic returned count+1 messages
            # (visible as "Dataset length: 501" for count=500); stop at
            # exactly `count`.
            if count != -1 and len(msgs) >= count:
                break
            msgs.append(parser.parseEmail(raw))
        return msgs

    @staticmethod
    def extractDictionary(pickledData, amount = 2000):
        """Build one (dictionary, frequency) pair covering every dataset
        listed in `pickledData`, reading at most `amount` messages from
        each file."""
        d, f = [], []
        for dump in pickledData:
            d, f = PickledDataProcessing.crawlMails(d, f, dump, breakAfter = amount)
        return d, f

In [5]:
#Evaluate the quality of a given classifier on several saved datasets.
def evaluateDataset(classifier, parser, pickledDataset, amount = -1):
    """Print how much of the pickled dataset `classifier` labels positive.

    Scores at most `amount` messages (-1 = the whole dataset), printing
    the dataset length and the integer percentage classified positive.
    An evaluate() result > 0 counts as positive.
    """
    positives = 0
    ctr = 0
    with open(pickledDataset, "rb") as fh:  # original leaked the file handle
        dataset = pickle.load(fh)
    for msg in dataset:
        if classifier.evaluate(parser.parseEmail(msg)) > 0:
            positives += 1
        ctr += 1
        if amount == ctr:
            break
    # print(...) with a single argument behaves identically on Python 2 and 3.
    if amount == -1:
        print("Dataset length: " + str(len(dataset)))
    else:
        print("Dataset length: " + str(min(len(dataset), amount)))
    if ctr == 0:
        # Original raised ZeroDivisionError on an empty dataset.
        print("Dataset is empty!\n\r")
    else:
        # // keeps the Python 2 floor-division behaviour ("8%", "52%" in
        # the recorded output) under Python 3; typo "datased" fixed.
        print(str(100 * positives // ctr) + "% of given dataset was classified as positive!\n\r")
    
def testClassifier(classifier, parser, label = "", amount = -1):
    # Run the classifier over every saved corpus (Spamassassin + Enron,
    # ham and spam) and print the positive-classification rate for each.
    # `amount` caps how many messages per dataset are scored (-1 = all).
    # NOTE(review): dataset paths are hard-coded relative to the
    # notebook's working directory -- the pickled corpora must exist there.
    print "***************"+ label +"***************"
    print "Spamassassin HAM"
    evaluateDataset(classifier, parser, "../ham.p", amount)
    gc.collect()  # datasets are large; reclaim memory between runs
    print "Spamassassin SPAM"
    evaluateDataset(classifier, parser, "../spam.p", amount)
    gc.collect()
    print "Enron HAM"
    evaluateDataset(classifier, parser, "../Enron/ham.p", amount)
    gc.collect()
    print "Enron SPAM"
    evaluateDataset(classifier, parser, "../Enron/spam.p", amount)
    gc.collect()
    print "SPAM dataset"
    # NOTE(review): the "SPAM dataset" header above is printed but its
    # evaluation below is commented out, so the label appears with no
    # result (visible in the recorded output).
    #evaluateDataset(classifier, parser, "../SPAM/01.p")
    gc.collect()
    print "Classifier evaluation DONE"
    print "***************"+ label +"***************"

In [6]:
import sys
import resource

# Cap the process address space at 5 GB so a runaway corpus load fails
# fast instead of swapping. rlimit values must be integers (bytes);
# Python 3's resource.setrlimit rejects the float literals the original
# used, so use int literals.
soft, hard = 5 * 10**9, 5 * 10**9
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))

In [10]:
def buildTesting(pHam, pSpam, learningDataset = 500, label = "", dictLength = 250):
    """Train an SVM spam classifier on the given pickled ham/spam corpora.

    Builds per-class dictionaries from the first `learningDataset` mails
    of each corpus, keeps the `dictLength` most informative words, parses
    the training mails against that dictionary and fits the classifier.
    Returns the (classifier, parser) pair.

    NOTE(review): a later cell redefines buildTesting with a different
    signature (no `label`); whichever cell ran last wins -- fragile under
    Restart & Run All.
    """
    dHam, fHam = PickledDataProcessing.extractDictionary([pHam], learningDataset)
    dSpam, fSpam = PickledDataProcessing.extractDictionary([pSpam], learningDataset)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Frequencies/Dictionaries loaded")

    # Keep only the most discriminative words (local typo "optimzied" fixed).
    optimizedDictionary = DictionaryMI.filterDictionary(dSpam, fSpam, dHam, fHam)[0:dictLength]
    parser = ParserDictionary(optimizedDictionary)

    spam = PickledDataProcessing.loadMails(parser, pSpam, learningDataset)
    ham = PickledDataProcessing.loadMails(parser, pHam, learningDataset)
    print("Prepared spam/ham")

    # Labels: 1 = spam, 0 = ham.
    #classifier = LogisticClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    classifier = SVMClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    #testClassifier(classifier, parser, label, 10000)
    return classifier, parser

In [30]:
#buildTesting("../ham.p", "../spam.p", 100, "Spamassassin dataset", 250)
#buildTesting("../Enron/ham.p", "../Enron/spam.p", 100, "Enron dataset", 250)

In [13]:
def buildTesting(pHam, pSpam, learningDataset = 500, dictLength = 250):
    """Train a LogisticClassifier on the given pickled ham/spam corpora
    and immediately evaluate it with testClassifier over all corpora.
    Returns the (classifier, parser) pair.

    NOTE(review): this redefines the buildTesting from an earlier cell
    with a different signature (no `label`) -- whichever cell ran last
    wins, which is fragile under Restart & Run All.
    """
    def openPickledMails(filename, processFn, count = 500):
        # Load at most `count` raw mails and apply processFn to each.
        with open(filename, "rb") as fh:  # original leaked the file handle
            mails = pickle.load(fh)[:count]
        # list(...) preserves Python 2 list semantics under Python 3,
        # where map() returns a lazy iterator.
        return list(map(processFn, mails))

    # NOTE(review): argument order here (texts, [], []) contradicts
    # crawlMails, which calls createDictionary(dictionary, frequency,
    # texts) -- confirm which order DictionaryMI actually expects.
    dHam, fHam = DictionaryMI.createDictionary(openPickledMails(pHam, ParserDictionary.stripHeaders, learningDataset), [], [])
    dSpam, fSpam = DictionaryMI.createDictionary(openPickledMails(pSpam, ParserDictionary.stripHeaders, learningDataset), [], [])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Frequencies/Dictionaries loaded")

    # Keep only the most discriminative words (local typo "optimzied" fixed).
    optimizedDictionary = DictionaryMI.filterDictionary(dSpam, fSpam, dHam, fHam)[:dictLength]
    parser = ParserDictionary(optimizedDictionary)

    spam = PickledDataProcessing.loadMails(parser, pSpam, learningDataset)
    ham = PickledDataProcessing.loadMails(parser, pHam, learningDataset)
    print("Prepared spam/ham")

    # Labels: 1 = spam, 0 = ham.
    classifier = LogisticClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    #classifier = SVMClassifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    testClassifier(classifier, parser, "label", 10000)
    return classifier, parser

In [15]:
# Train and evaluate on the Enron corpora using the defaults
# (500 training mails per class, 250-word dictionary).
classifier, parser = buildTesting("../Enron/ham.p", "../Enron/spam.p")


Frequencies/Dictionaries loaded
Prepared spam/ham
***************label***************
Spamassassin HAM
Dataset length: 2752
8% of given datased was classified as positive!

Spamassassin SPAM
Dataset length: 501
98% of given datased was classified as positive!

Enron HAM
Dataset length: 10000
32% of given datased was classified as positive!

Enron SPAM
Dataset length: 4502
52% of given datased was classified as positive!

SPAM dataset
Classifier evaluation DONE
***************label***************

In [ ]: