In [1]:
from glob2 import glob

In [2]:
from email.parser import Parser

In [3]:
import os

In [4]:
def extractEmailBody(filename):
    # Parse the raw file and return only the email body.
    # Note: get_payload() returns a list of parts for multipart messages;
    # the Enron maildir messages are plain text, so a string comes back.
    with open(filename, 'r') as fin:
        mail = fin.read()
        msg = Parser().parsestr(mail)
        return msg.get_payload()

In [5]:
def removeQuotedText(body):
    # Remove everything after 'Original Message'
    cutIdx = body.find('-----Original Message')
    if cutIdx!=-1:
        body = body[:cutIdx]
    return body

In [6]:
def saveEmailBody(filename, body):
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, 'w') as fout:
        fout.write(body)

In [7]:
basedir = 'enron_mail'
savedir = 'enron_mail_clean'
# Enron maildir filenames end with a trailing dot (e.g. '1.'),
# hence the '*.' pattern; glob2's '**' matches directories recursively
docs = glob(basedir + '/**/*.')

In [20]:
for doc in docs:
    try:
        body = extractEmailBody(doc)
        body = removeQuotedText(body)
        newDoc = doc.replace(basedir, savedir)
        saveEmailBody(newDoc, body)
    except Exception as e:
        print('Error with doc: %s (%s)' % (doc, e))

In [8]:
docs = glob(savedir + '/**/*.')

In [9]:
from nltk.tokenize import RegexpTokenizer

In [27]:
vocab = set()

# Tokens: words, dollar amounts, or any other run of non-whitespace
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

for doc in docs:
    with open(doc, 'r') as fin:
        mail = fin.read()
        tokens = tokenizer.tokenize(mail)
        tokens = [ t.lower() for t in tokens ]
        vocab.update(tokens)

In [10]:
import gensim

In [29]:
def docList(docs):
    for doc in docs:
        with open(doc, 'r') as fin:
            mail = fin.read()
            tokens = tokenizer.tokenize(mail)
            tokens = [ t.lower() for t in tokens ]
            yield tokens

In [51]:
dic = gensim.corpora.Dictionary.from_documents(docList(docs))

In [54]:
dic.save('enron_vocab.gensim.dic')

In [63]:
dic = gensim.corpora.Dictionary.load('enron_vocab.gensim.dic')

In [74]:
# Load
# dic = gensim.corpora.Dictionary.load('enron_vocab.gensim.dic')

# Remove ignore tokens: 
# dic.filter_tokens(bad_ids=[277, 339])

In [59]:
# Notice that texcatlipocatl does not appear in the corpus
email = 'hello all this is a sample email from texcatlipocatl'.split()
emailIds = dic.doc2bow(email)
# dic.dfs[id] is the document frequency: the number of emails containing the word
for id, count in emailIds:
    print('Word  %-10s (id=%6d) appears %6d times in corpus' % ('"' + dic[id] + '"', id, dic.dfs[id]))


Word  "from"     (id=   111) appears 194178 times in corpus
Word  "a"        (id=   129) appears 294818 times in corpus
Word  "this"     (id=   148) appears 245405 times in corpus
Word  "all"      (id=   187) appears 123167 times in corpus
Word  "is"       (id=   277) appears 273785 times in corpus
Word  "email"    (id=   763) appears  53633 times in corpus
Word  "sample"   (id=   912) appears   2641 times in corpus
Word  "hello"    (id= 13515) appears   6611 times in corpus

In [66]:
stopwords = 'all this is a from'.split()
stopwordIds = [ id for id, count in dic.doc2bow(stopwords) ]
# filter_tokens() reassigns token ids, which is why the ids
# printed below differ from those in the previous cell
dic.filter_tokens(bad_ids=stopwordIds)

email = 'hello all this is a sample email from texcatlipocatl'.split()
emailIds = dic.doc2bow(email)
for id, count in emailIds:
    print('Word  %-10s (id=%6d) appears %6d times in corpus' % ('"' + dic[id] + '"', id, dic.dfs[id]))


Word  "email"    (id= 17171) appears  53633 times in corpus
Word  "sample"   (id=350196) appears   2641 times in corpus
Word  "hello"    (id=830234) appears   6611 times in corpus

In [11]:
dic = gensim.corpora.Dictionary.load('enron_vocab.gensim.dic')

In [25]:
dic.dfs[0], dic.dfs[343881]


Out[25]:
(49726, 4)

In [29]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [40]:
# Document frequencies sorted in decreasing order, on a log-log scale
# (a roughly straight line indicates a Zipf-like distribution)
x = sorted(dic.dfs.values(), reverse=True)
loglog(x)


Out[40]:
[<matplotlib.lines.Line2D at 0x7f262e17c550>]

In [21]:
# Markers that signal the start of quoted text in email bodies:

# -----Original Message-----
# "James Robinson (CES)" <cypress@neosoft.com> on 08/05/99 04:31:16 PM
# ---------------------- Forwarded by Louis Soldano/ET&S/Enron on 06/02/99 01:12 PM ---------------------------
# > 
# Nested messages may need msg = Parser().parsestr(msg).get_payload() applied N times
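
A possible generalization of removeQuotedText that cuts at the earliest of several such markers. This is only a sketch: the regexes are assumptions derived from the examples above and have not been tested against the corpus.

In [ ]:
import re

# Hypothetical marker patterns, based on the examples in the previous cell
QUOTE_MARKERS = [
    re.compile(r'-----Original Message'),
    re.compile(r'-{5,}\s*Forwarded by'),
    re.compile(r'^>', re.MULTILINE),   # '>'-quoted reply lines
]

def removeQuotedTextAll(body):
    # Truncate at the earliest match of any quoted-text marker
    cutIdx = len(body)
    for marker in QUOTE_MARKERS:
        m = marker.search(body)
        if m:
            cutIdx = min(cutIdx, m.start())
    return body[:cutIdx]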

Outline

Say we have $N=10$ topics (an arbitrary choice, and one I don't like). For each document (email) we can compute which topics make it up and in what proportion; a gensim sketch follows the list below:

  • Email 1: 10% Topic 1, 5% Topic 2, 15% Topic 3...
  • Email 2: 5% Topic 1, 25% Topic 2, 7% Topic 3...
  • Email 3: 30% Topic 1, 15% Topic 2, 5% Topic 3...
  • etc
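
A rough sketch of how this could look with gensim, reusing dic and docList from the cells above; num_topics=10 mirrors the arbitrary choice of $N$, and nothing here has been run against the corpus:

In [ ]:
# Sketch: train LDA and read off each email's topic proportions
corpus = [ dic.doc2bow(tokens) for tokens in docList(docs) ]
lda = gensim.models.LdaModel(corpus, id2word=dic, num_topics=10)

# Topic mixture of the first three emails, e.g. [(0, 0.10), (1, 0.05), ...]
for i, bow in enumerate(corpus[:3]):
    print('Email %d: %s' % (i + 1, lda.get_document_topics(bow)))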

Using this, can we compute a distance between documents?

Also, keep in mind that each topic is a probability distribution over words (see the sketch after this list), meaning that:

  • Topic 1: 5% "dog", 10% "cat", 7% "wolf"
  • Topic 2: 7% "car", 20% "train", 1% "boat"
  • Topic 3: 8% "dog", 15% "house", 13% "tree"
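
A quick way to inspect those distributions, using the lda model from the sketch above (show_topic is gensim's accessor for a topic's top words):

In [ ]:
# Sketch: top words and their probabilities for the first three topics
for topicId in range(3):
    print('Topic %d: %s' % (topicId + 1, lda.show_topic(topicId, topn=5)))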

If we can calculate the distance between two documents (using cosine similarity on their topic proportions, for instance), we can build a distance matrix from every email to every other email. From such a matrix we can construct a 'map' that places documents in a 2D space, with similar documents clustered together -- this is straightforward with multidimensional scaling (MDS).
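
A minimal sketch of that last step, assuming the lda model and corpus from the sketches above and using scikit-learn for the distance matrix and MDS. It is restricted to a small subset of emails, since a full pairwise matrix over the whole corpus would be enormous:

In [ ]:
from gensim.matutils import corpus2dense
from sklearn.metrics.pairwise import cosine_distances
from sklearn.manifold import MDS

# Sketch: place a subset of emails on a 2D 'map' by their topic mixtures
subset = [ lda.get_document_topics(bow) for bow in corpus[:100] ]
topicVecs = corpus2dense(subset, num_terms=lda.num_topics).T  # (docs, topics)
distMatrix = cosine_distances(topicVecs)

mds = MDS(n_components=2, dissimilarity='precomputed')
coords = mds.fit_transform(distMatrix)  # one (x, y) point per email
scatter(coords[:, 0], coords[:, 1])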