Code adapted from the Enron email example in *Mining the Social Web* (the original "here" hyperlink was lost when this notebook was exported).
In [1]:
import pandas as pd
import sys
from urllib.request import urlopen
import time
import os
import envoy
import mailbox
import email
import quopri
import json
from bs4 import BeautifulSoup
from dateutil.parser import parse
import re
from time import asctime
from dateutil.parser import parse
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
Download raw data
In [2]:
# Source archive for the Enron email corpus (CMU mirror, 2011-04-02 snapshot).
URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
# NOTE(review): hardcoded absolute local path — adjust for your machine before running.
DOWNLOAD_DIR = "/Users/yuwenwu/insight/cultivate/data/external/"
In [3]:
def download(url, download_dir):
    """Stream `url` into `download_dir` in 8 KB blocks.

    Parameters
    ----------
    url : str
        URL to fetch; the basename of its path becomes the local file name.
    download_dir : str
        Existing directory to write the file into.

    Returns
    -------
    str
        Full path of the downloaded file.
    """
    file_name = url.split('/')[-1]
    dest_path = os.path.join(download_dir, file_name)
    # Context managers guarantee both the connection and the output file are
    # closed even if the transfer fails partway (the original leaked both).
    with urlopen(url) as u, open(dest_path, 'wb') as f:
        meta = u.info()
        # Content-Length can be absent (e.g. chunked responses); guard for that.
        content_length = meta.get_all("Content-Length")
        file_size = int(content_length[0]) if content_length else None
        #print("Downloading: {} Bytes: {}".format(file_name, file_size))
        file_size_dl = 0
        block_sz = 8192
        last_update = time.time()
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            file_size_dl += len(buffer)
            f.write(buffer)
            if file_size:
                download_status = r"%10d MB [%3.2f%%]" % (
                    file_size_dl / 1000000.0, file_size_dl * 100.0 / file_size)
                # Trailing backspaces rewind the cursor so the next status
                # line overwrites this one in place.
                download_status = download_status + chr(8) * (len(download_status) + 1)
            # Throttle terminal updates to at most one every 5 seconds.
            if time.time() - last_update > 5:
                #print(download_status),
                sys.stdout.flush()
                last_update = time.time()
    return dest_path
# Extracts a gzipped tarfile. e.g. "$ tar xzf filename.tgz"
def tar_xzf(f, dest_dir=None):
    """Extract the gzipped tarball `f` into `dest_dir`.

    Parameters
    ----------
    f : str
        Path to the .tgz archive.
    dest_dir : str, optional
        Extraction directory; defaults to DOWNLOAD_DIR (backward compatible
        with the original single-argument call).
    """
    import subprocess
    if dest_dir is None:
        dest_dir = DOWNLOAD_DIR
    # Call out to the shell for a faster decompression.
    # A list argv (no shell=True) keeps paths with spaces or shell
    # metacharacters safe; the original envoy string interpolation did not.
    r = subprocess.run(["tar", "xzf", f, "-C", dest_dir],
                       capture_output=True, text=True)
    #print(r.stdout)
    #print(r.stderr)
# Fetch the archive and unpack it into DOWNLOAD_DIR (large download; slow).
f = download(URL, DOWNLOAD_DIR)
_ = tar_xzf(f)
Convert raw email data to mbox format.
In [16]:
MAILDIR = DOWNLOAD_DIR + 'enron_mail_20110402/maildir'
# Where to write the converted mbox
MBOX = DOWNLOAD_DIR + '/enron.mbox'
# Create a file handle that we'll be writing into...
mbox = open(MBOX, 'w')
# Walk the directories and process any folder named 'inbox'
for (root, dirs, file_names) in os.walk(MAILDIR):
    if root.split(os.sep)[-1].lower() not in ['_sent_mail', 'discussion_threads', 'inbox', 'sent_items']:
        continue
    # Process each message in 'inbox'
    for file_name in file_names:
        file_path = os.path.join(root, file_name)
        # BUG FIX: the original opened the message with mode 'wb', which
        # truncated (destroyed!) the source file and then passed a file
        # object — not a string — to re.search. Read the text instead.
        with open(file_path, 'r', errors='replace') as message_file:
            message_text = message_file.read()
        # Compute fields for the From_ line in a traditional mbox message.
        # Skip messages missing either header instead of crashing on
        # None.groups().
        from_match = re.search(r"From: ([^\n]+)", message_text)
        date_match = re.search(r"Date: ([^\n]+)", message_text)
        if from_match is None or date_match is None:
            continue
        _from = from_match.group(1)
        # Convert the date to the asctime representation for the From_ line
        _date = asctime(parse(date_match.group(1)).timetuple())
        msg = email.message_from_string(message_text)
        msg.set_unixfrom('From %s %s' % (_from, _date))
        _ = mbox.write(msg.as_string(unixfrom=True) + "\n\n")
mbox.close()
Convert the mbox file to JSON.
In [5]:
import warnings
# Suppress UserWarnings raised from the bs4 module while parsing message bodies.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
In [14]:
def clean_content(msg):
    """Decode a quoted-printable message body and strip HTML, returning plain text.

    Returns '' when BeautifulSoup cannot parse the payload.
    """
    # Decode message from "quoted printable" format
    msg = quopri.decodestring(msg)
    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg, 'lxml')
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the deliberate best-effort behavior
        # without masking interpreter-level signals.
        return ''
    return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise-unserializable iterables
    (e.g. generators) by materializing them as lists."""

    def default(self, o):
        # Fall back to list() for anything json can't handle natively.
        return list(o)
def jsonifyMessage(msg):
    """Convert an email Message into a JSON-serializable dict.

    Copies all headers, splits To/Cc/Bcc into address lists, collects the
    cleaned text of each non-multipart part under 'parts', and normalizes
    the Date header via pandas. Returns None when the message has no Date.
    """
    json_msg = {'parts': []}
    json_msg.update(msg.items())

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for header in ('To', 'Cc', 'Bcc'):
        value = json_msg.get(header)
        if not value:
            continue
        for junk in ('\n', '\t', '\r', ' '):
            value = value.replace(junk, '')
        json_msg[header] = value.split(',')

    for part in msg.walk():
        # Multipart containers carry no payload of their own; skip them.
        if part.get_content_maintype() == 'multipart':
            continue
        json_msg['parts'].append({
            'contentType': part.get_content_type(),
            'content': clean_content(part.get_payload(decode=False)),
        })

    # A message without a Date header is dropped by the caller.
    if 'Date' not in json_msg:
        return None
    json_msg['Date'] = str(pd.to_datetime(json_msg['Date']))
    return json_msg
# Read back the mbox written above. The original opened the relative path
# 'enron.mbox', which only worked when the working directory happened to be
# DOWNLOAD_DIR; use the same absolute path it was written to.
mbox = mailbox.mbox(MBOX)
OUT_FILE = DOWNLOAD_DIR + '/enron.mbox.json'

all_jsons = []
for message in mbox:
    json_msg = jsonifyMessage(message)
    # jsonifyMessage returns None for messages lacking a Date header.
    if json_msg is not None:
        all_jsons.append(json_msg)
In [15]:
# Serialize all messages to disk; the context manager guarantees the file is
# flushed and closed even if json.dumps raises partway through.
with open(OUT_FILE, 'w') as f:
    _ = f.write(json.dumps(all_jsons, cls=Encoder))