Mining the social web

Code adapted from the *Mining the Social Web* example notebooks (the original hyperlink was lost when this notebook was exported to text).


In [1]:
import pandas as pd
import sys
from urllib.request import urlopen
import time
import os
import envoy 
import mailbox
import email
import quopri
import json
from bs4 import BeautifulSoup
from dateutil.parser import parse
import re
from time import asctime
from dateutil.parser import parse

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Download raw data


In [2]:
# Source archive for the Enron email corpus (CMU mirror, 2011-04-02 snapshot).
URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
# NOTE(review): hardcoded absolute local path — point this at your own
# data directory before running.
DOWNLOAD_DIR = "/Users/yuwenwu/insight/cultivate/data/external/"

In [3]:
def download(url, download_dir):
    """Download `url` into `download_dir` in 8 KB chunks.

    Parameters
    ----------
    url : str
        HTTP(S) URL to fetch; the last path segment becomes the file name.
    download_dir : str
        Directory to write the downloaded file into.

    Returns
    -------
    str
        The local path of the downloaded file.

    Progress is flushed to stdout at most once every 5 seconds (the
    actual print statements are commented out).
    """
    file_name = url.split('/')[-1]
    file_path = os.path.join(download_dir, file_name)

    # Context managers guarantee the socket and the output file are closed
    # even if the transfer fails partway through (the original leaked both
    # on any exception).
    with urlopen(url) as u, open(file_path, 'wb') as f:
        meta = u.info()
        file_size = int(meta.get_all("Content-Length")[0])
        #print("Downloading: {} Bytes: {}".format(file_name, file_size))

        file_size_dl = 0
        block_sz = 8192
        last_update = time.time()
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break

            file_size_dl += len(buffer)
            f.write(buffer)
            download_status = r"%10d MB  [%3.2f%%]" % (
                file_size_dl / 1000000.0, file_size_dl * 100.0 / file_size)
            # chr(8) backspaces rewind the cursor so the status overwrites itself.
            download_status = download_status + chr(8) * (len(download_status) + 1)
            if time.time() - last_update > 5:
                #print(download_status),
                sys.stdout.flush()
                last_update = time.time()

    return file_path

# Extracts a gzipped tarfile. e.g. "$ tar xzf filename.tgz"

def tar_xzf(f):
    """Extract the gzipped tarball at path `f` into DOWNLOAD_DIR.

    Shells out to the system `tar` for speed (the archive holds a very
    large number of small files). Uses subprocess with an argument list —
    no shell involved — so paths containing spaces or shell
    metacharacters cannot break the command or inject anything; the old
    envoy.run() call interpolated the path into a single command string.
    """
    import subprocess  # local import: keeps the notebook's top import cell untouched
    r = subprocess.run(["tar", "xzf", f, "-C", DOWNLOAD_DIR],
                       capture_output=True, text=True)
    #print(r.stdout)
    #print(r.stderr)

# Fetch the archive and extract it into DOWNLOAD_DIR
# (network- and disk-heavy; expect this to take a while).
f = download(URL, DOWNLOAD_DIR)
_ = tar_xzf(f)

Convert raw email data to mbox format.


In [16]:
MAILDIR = DOWNLOAD_DIR + 'enron_mail_20110402/maildir' 

# Where to write the converted mbox
MBOX = DOWNLOAD_DIR + '/enron.mbox'

# Walk the maildir tree and convert every message in the selected mail
# folders into one traditional mbox file. The `with` block guarantees
# the output handle is closed even if conversion fails midway.
with open(MBOX, 'w') as mbox:

    for (root, dirs, file_names) in os.walk(MAILDIR):

        # Only process a few well-known folder names.
        if root.split(os.sep)[-1].lower() not in ['_sent_mail', 'discussion_threads', 'inbox', 'sent_items']:
            continue

        # Process each message in the folder
        for file_name in file_names:
            file_path = os.path.join(root, file_name)

            # BUG FIX: the original did open(file_path, 'wb'), which
            # truncates (destroys) the source message and yields a file
            # handle — re.search and email.message_from_string then fail.
            # Read the message contents as text instead; errors='replace'
            # tolerates the occasional non-UTF-8 byte in old mail.
            with open(file_path, 'r', errors='replace') as f_in:
                message_text = f_in.read()

            # Compute fields for the From_ line in a traditional mbox
            # message; skip messages missing either header rather than
            # crashing the whole walk with AttributeError.
            from_match = re.search(r"From: ([^\n]+)", message_text)
            date_match = re.search(r"Date: ([^\n]+)", message_text)
            if from_match is None or date_match is None:
                continue
            _from = from_match.group(1)

            # Convert the Date header to the asctime representation for
            # the From_ line.
            _date = asctime(parse(date_match.group(1)).timetuple())

            msg = email.message_from_string(message_text)
            msg.set_unixfrom('From %s %s' % (_from, _date))

            _ = mbox.write(msg.as_string(unixfrom=True) + "\n\n")

Convert the mbox archive to JSON.


In [5]:
import warnings
# Silence bs4 UserWarnings (presumably triggered by odd message bodies
# during HTML stripping — confirm) so the conversion output stays readable.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [14]:
def clean_content(msg):
    """Decode a quoted-printable message payload and strip any HTML.

    Parameters
    ----------
    msg : str or bytes
        Raw message payload, possibly quoted-printable encoded.

    Returns
    -------
    str
        The plain-text content, or '' if BeautifulSoup cannot parse it.
    """
    # Decode message from "quoted printable" format
    msg = quopri.decodestring(msg)

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    # `except Exception` (not a bare except) so KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        soup = BeautifulSoup(msg, 'lxml')
    except Exception:
        return ''
    # find_all(string=True) is the non-deprecated spelling of
    # findAll(text=True); the returned strings are joined into one blob.
    return ''.join(soup.find_all(string=True))

# There's a lot of data to process, and the Pythonic way to do it is with a 
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object 
# serialization.

class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise-unsupported iterables
    (e.g. generators) by materializing them into lists."""

    def default(self, o):
        # Anything json can't natively encode is assumed iterable.
        return list(o)
        
def jsonifyMessage(msg):
    """Convert an email.message.Message into a JSON-serializable dict.

    Parameters
    ----------
    msg : email.message.Message
        The parsed message to convert.

    Returns
    -------
    dict or None
        Header fields plus a 'parts' list of {'contentType', 'content'}
        dicts, or None when the message has no Date header or the date
        cannot be parsed — callers simply filter out None results.
    """
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        # Strip all whitespace/newlines, then split the address list.
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
                                 .replace(' ', '').split(',')

    # Flatten the MIME tree; multipart containers carry no payload of
    # their own, so only leaf parts are recorded.
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        json_part = {'contentType': part.get_content_type()}
        content = part.get_payload(decode=False)
        json_part['content'] = clean_content(content)

        json_msg['parts'].append(json_part)

    # Normalize the Date header. Drop messages with a missing or
    # unparseable date instead of letting pd.to_datetime crash the whole
    # conversion run (the original had no guard around the parse).
    if 'Date' not in json_msg:
        return None
    try:
        json_msg['Date'] = str(pd.to_datetime(json_msg['Date']))
    except (ValueError, TypeError):
        return None

    return json_msg

# BUG FIX: the mbox was written to DOWNLOAD_DIR + '/enron.mbox' (see MBOX
# in the conversion cell), but the original re-opened the relative path
# 'enron.mbox' from the current working directory.
mbox = mailbox.mbox(DOWNLOAD_DIR + '/enron.mbox')
OUT_FILE = DOWNLOAD_DIR + '/enron.mbox.json'

# Convert every message; jsonifyMessage returns None for messages it
# cannot handle, which we filter out here.
all_jsons = []
for message in mbox:
    json_msg = jsonifyMessage(message)
    if json_msg is not None:  # PEP 8: identity comparison, not != None
        all_jsons.append(json_msg)

In [15]:
f = open(OUT_FILE, 'w')
_ = f.write(json.dumps(all_jsons, cls=Encoder));
f.close()