In [1]:
from glob2 import glob
In [2]:
from email.parser import Parser
In [3]:
import os
In [4]:
def extractEmailBody(filename):
    # Parse a raw email file and return its body (payload)
    with open(filename, 'r') as fin:
        mail = fin.read()
    msg = Parser().parsestr(mail)
    return msg.get_payload()
In [5]:
def removeQuotedText(body):
    # Remove everything after 'Original Message'
    cutIdx = body.find('-----Original Message')
    if cutIdx != -1:
        body = body[:cutIdx]
    return body
In [6]:
def saveEmailBody(filename, body):
    # Write the cleaned body, creating the target directory if needed
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, 'w') as fout:
        fout.write(body)
In [7]:
basedir = 'enron_mail'
savedir = 'enron_mail_clean'
# Enron maildir files have no extension and end in a dot, e.g. '1.', '2.'
docs = glob(basedir + '/**/*.')
In [20]:
for doc in docs:
    try:
        body = extractEmailBody(doc)
        body = removeQuotedText(body)
        newDoc = doc.replace(basedir, savedir)
        saveEmailBody(newDoc, body)
    except Exception as e:
        print "Error with doc: ", doc
In [8]:
docs = glob(savedir + '/**/*.')
In [9]:
from nltk.tokenize import RegexpTokenizer
In [27]:
vocab = set()
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
for doc in docs:
    with open(doc, 'r') as fin:
        mail = fin.read()
    tokens = tokenizer.tokenize(mail)
    tokens = [ t.lower() for t in tokens ]
    vocab.update(tokens)
In [10]:
import gensim
In [29]:
def docList(docs):
    # Generator yielding each document as a list of lowercased tokens
    for doc in docs:
        with open(doc, 'r') as fin:
            mail = fin.read()
        tokens = tokenizer.tokenize(mail)
        tokens = [ t.lower() for t in tokens ]
        yield tokens
In [51]:
dic = gensim.corpora.Dictionary.from_documents(docList(docs))
In [54]:
dic.save('enron_vocab.gensim.dic')
In [63]:
dic = gensim.corpora.Dictionary.load('enron_vocab.gensim.dic')
In [74]:
# Load
# dic = gensim.corpora.Dictionary.load('enron_vocab.gensim.dic')
# Remove ignore tokens:
# dic.filter_tokens(bad_ids=[277, 339])
In [59]:
# Notice that texcatlipocatl does not appear in the corpus
email = 'hello all this is a sample email from texcatlipocatl'.split()
emailIds = dic.doc2bow(email)
for id, count in emailIds:
    print 'Word %-10s (id=%6d) appears %6d times in corpus' % ('"' + dic[id] + '"', id, dic.dfs[id])
In [66]:
stopwords = 'all this is a from '.split()
stopwordIds = [ id for id,count in dic.doc2bow(stopwords) ]
dic.filter_tokens(bad_ids=stopwordIds)
email = 'hello all this is a sample email from texcatlipocatl'.split()
emailIds = dic.doc2bow(email)
for id, count in emailIds:
    print 'Word %-10s (id=%6d) appears %6d times in corpus' % ('"' + dic[id] + '"', id, dic.dfs[id])
In [11]:
dic = gensim.corpora.Dictionary.load('enron_vocab.gensim.dic')
In [25]:
dic.dfs[0], dic.dfs[343881]
Out[25]:
In [29]:
%pylab inline
In [40]:
x = dic.dfs.values()
x = sorted(x)
x = x[::-1]
loglog(x)
Out[40]:
[log-log plot of the sorted document frequencies]
In [21]:
# Markers at which quoted text begins in email bodies:
# -----Original Message-----
# "James Robinson (CES)" <cypress@neosoft.com> on 08/05/99 04:31:16 PM
# ---------------------- Forwarded by Louis Soldano/ET&S/Enron on 06/02/99 01:12 PM ---------------------------
# >
# msg = Parser().parsestr(msg).get_payload()   (applied N times to unwrap nested forwards)
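A possible generalization of removeQuotedText that cuts at any of these markers (a sketch only; the regular expressions are my guesses at how replies, forwards, and '>'-quoting look in this corpus, not verified against every message):
In [ ]:
import re

# Patterns for the quoted-text markers listed above; the exact regexes
# are assumptions about the corpus
QUOTE_MARKERS = [
    re.compile(r'-----\s*Original Message\s*-----'),
    re.compile(r'-+ Forwarded by .* on .* -+'),
    re.compile(r'^.* on \d\d/\d\d/\d\d(\d\d)? \d\d:\d\d(:\d\d)? (AM|PM)\s*$', re.M),
    re.compile(r'^>', re.M),  # '>'-style quoting
]

def removeQuotedTextAll(body):
    # Cut the body at the earliest marker found, if any
    cut = len(body)
    for marker in QUOTE_MARKERS:
        m = marker.search(body)
        if m:
            cut = min(cut, m.start())
    return body[:cut]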
Say we have $N=10$ topics (although this number is arbitrary and I don't like that). For each document (email) we can calculate which topics make it up and in what proportion.
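As a concrete sketch using gensim's LdaModel (assuming the dictionary and token generator defined above; none of this was run here):
In [ ]:
# A sketch: train a 10-topic LDA model and inspect one document's topic mix
corpus = [ dic.doc2bow(tokens) for tokens in docList(docs) ]
lda = gensim.models.LdaModel(corpus, id2word=dic, num_topics=10)
# List of (topic id, proportion) pairs, e.g. [(3, 0.71), (7, 0.22)]
print lda[corpus[0]]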
Using this, can we compute a distance between documents?
Also, keep in mind that each topic is itself a probability distribution over words.
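For example, gensim's show_topic lists a topic's most probable words (continuing the hypothetical lda model from the sketch above):
In [ ]:
# Most probable words of topic 0, as word/probability pairs
# (tuple order depends on the gensim version)
print lda.show_topic(0)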
If we can calculate the distance between two documents (using cosine similarity, for instance), we can build a distance matrix from every email to every other email. We can then use that distance matrix to construct a 'map' that places the documents in a 2D space, clustering similar documents together; this is easy using MDS (multidimensional scaling).
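A sketch of that pipeline, assuming scikit-learn is available for the cosine distances and the MDS embedding (parameter choices are illustrative, and it again relies on the hypothetical lda model and corpus from above):
In [ ]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.manifold import MDS

# Dense document-by-topic matrix from the hypothetical lda model above
X = gensim.matutils.corpus2dense(lda[corpus], num_terms=10).T
D = cosine_distances(X)  # n_docs x n_docs distance matrix
coords = MDS(n_components=2, dissimilarity='precomputed').fit_transform(D)
scatter(coords[:, 0], coords[:, 1])  # scatter comes from %pylab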