In [1]:
import os, sys, email
import numpy as np 
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns; sns.set_style('whitegrid')
#import plotly
#plotly.offline.init_notebook_mode()
#import plotly.graph_objs as go
#import wordcloud

# Network analysis
#import networkx as nx
# NLP
#from nltk.tokenize.regexp import RegexpTokenizer

#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

In [3]:
# Read the raw e-mail CSV into a DataFrame.
emails_df = pd.read_csv('data/emails.csv')

# Print (n_rows, n_columns) as a quick sanity check on the load.
print(emails_df.shape)


(517401, 2)

In [4]:
## Helper functions
def get_text_from_email(msg):
    '''Return the concatenated payloads of every text/plain part of msg.

    msg is walked with msg.walk(); parts whose content type is not
    'text/plain' are skipped. Payloads are joined with no separator, and an
    empty string is returned when no text/plain part is found.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Split a comma-separated address header into a set of addresses.

    Parameters
    ----------
    line : str or None
        Raw header value such as 'a@x.com, b@y.com'. Falsy values (None or
        the empty string) mean the header is absent.

    Returns
    -------
    frozenset of str, or None
        The distinct addresses with surrounding whitespace removed. None is
        returned when line is falsy or contains no non-empty address (for
        example only commas and whitespace).
    '''
    if not line:
        return None
    # strip() removes any surrounding whitespace, including the newline/tab
    # left by folded header lines. Fragments that are empty after stripping
    # (stray, doubled or trailing commas) are not addresses, so drop them
    # instead of storing '' in the set.
    stripped = (piece.strip() for piece in line.split(','))
    addrs = frozenset(addr for addr in stripped if addr)
    return addrs or None

In [5]:
# Parse every raw message string into an email message object, then remove
# the raw text column from the frame in place.
messages = [email.message_from_string(raw) for raw in emails_df['message']]
emails_df.drop(columns='message', inplace=True)

# One column per header field; the header names come from the first message
# only, so a header that first message lacks does not get a column.
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [parsed[key] for parsed in messages]

# Plain-text body of each message.
emails_df['content'] = [get_text_from_email(parsed) for parsed in messages]

# Break the comma-separated address headers into individual addresses.
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# 'user' is the leading path component of 'file' (the text before the first '/').
emails_df['user'] = emails_df['file'].map(lambda path: path.split('/', 1)[0])

# The parsed message objects are not needed once the columns are built.
del messages

emails_df.head()


Out[5]:
file Message-ID Date From To Subject Mime-Version Content-Type Content-Transfer-Encoding X-From X-To X-cc X-bcc X-Folder X-Origin X-FileName content user
0 allen-p/_sent_mail/1. <18782981.1075855378110.JavaMail.evans@thyme> Mon, 14 May 2001 16:39:00 -0700 (PDT) (phillip.allen@enron.com) (tim.belden@enron.com) 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Tim Belden <Tim Belden/Enron@EnronXGate> \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Here is our forecast\n\n allen-p
1 allen-p/_sent_mail/10. <15464986.1075855378456.JavaMail.evans@thyme> Fri, 4 May 2001 13:51:00 -0700 (PDT) (phillip.allen@enron.com) (john.lavorato@enron.com) Re: 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen John J Lavorato <John J Lavorato/ENRON@enronXg... \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Traveling to have a business meeting takes the... allen-p
2 allen-p/_sent_mail/100. <24216240.1075855687451.JavaMail.evans@thyme> Wed, 18 Oct 2000 03:00:00 -0700 (PDT) (phillip.allen@enron.com) (leah.arsdall@enron.com) Re: test 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Leah Van Arsdall \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf test successful. way to go!!! allen-p
3 allen-p/_sent_mail/1000. <13505866.1075863688222.JavaMail.evans@thyme> Mon, 23 Oct 2000 06:13:00 -0700 (PDT) (phillip.allen@enron.com) (randall.gay@enron.com) 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Randall L Gay \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Randy,\n\n Can you send me a schedule of the s... allen-p
4 allen-p/_sent_mail/1001. <30922949.1075863688243.JavaMail.evans@thyme> Thu, 31 Aug 2000 05:07:00 -0700 (PDT) (phillip.allen@enron.com) (greg.piper@enron.com) Re: Hello 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Greg Piper \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Let's shoot for Tuesday at 11:45. allen-p

In [6]:
# Export the message id, subject and plain-text body as a tab-separated,
# UTF-8 encoded file.
emails_df[['Message-ID', 'Subject', 'content']].to_csv(
    'enron_subject.csv',
    sep='\t',
    encoding='utf-8',
)

In [ ]: