In [1]:
import os, sys, email,re
import pandas as pd
import numpy as np

np.random.seed(42)

import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [32]:
df = pd.read_csv('data/enron/emails.csv', header=0)
df.head()


Out[32]:
file message
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e...
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e...
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e...

In [33]:
list(df) # show all attributes


Out[33]:
['file', 'message']

In [34]:
print(df['message'][0])


Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 

In [35]:
df.shape


Out[35]:
(517401, 2)

In [36]:
## Helper functions
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs

In [37]:
# Parse the emails into a list email objects
messages = list(map(email.message_from_string, df['message']))
df.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    df[key] = [doc[key] for doc in messages]
# Parse content from emails
df['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
df['From'] = df['From'].map(split_email_addresses)
df['To'] = df['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
df['user'] = df['file'].map(lambda x:x.split('/')[0])
del messages

df.head()


Out[37]:
file Message-ID Date From To Subject Mime-Version Content-Type Content-Transfer-Encoding X-From X-To X-cc X-bcc X-Folder X-Origin X-FileName content user
0 allen-p/_sent_mail/1. <18782981.1075855378110.JavaMail.evans@thyme> Mon, 14 May 2001 16:39:00 -0700 (PDT) (phillip.allen@enron.com) (tim.belden@enron.com) 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Tim Belden <Tim Belden/Enron@EnronXGate> \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Here is our forecast\n\n allen-p
1 allen-p/_sent_mail/10. <15464986.1075855378456.JavaMail.evans@thyme> Fri, 4 May 2001 13:51:00 -0700 (PDT) (phillip.allen@enron.com) (john.lavorato@enron.com) Re: 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen John J Lavorato <John J Lavorato/ENRON@enronXg... \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Traveling to have a business meeting takes the... allen-p
2 allen-p/_sent_mail/100. <24216240.1075855687451.JavaMail.evans@thyme> Wed, 18 Oct 2000 03:00:00 -0700 (PDT) (phillip.allen@enron.com) (leah.arsdall@enron.com) Re: test 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Leah Van Arsdall \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf test successful. way to go!!! allen-p
3 allen-p/_sent_mail/1000. <13505866.1075863688222.JavaMail.evans@thyme> Mon, 23 Oct 2000 06:13:00 -0700 (PDT) (phillip.allen@enron.com) (randall.gay@enron.com) 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Randall L Gay \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Randy,\n\n Can you send me a schedule of the s... allen-p
4 allen-p/_sent_mail/1001. <30922949.1075863688243.JavaMail.evans@thyme> Thu, 31 Aug 2000 05:07:00 -0700 (PDT) (phillip.allen@enron.com) (greg.piper@enron.com) Re: Hello 1.0 text/plain; charset=us-ascii 7bit Phillip K Allen Greg Piper \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Let's shoot for Tuesday at 11:45. allen-p

Clean Data


In [38]:
# Parse datetime
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df.dtypes


Out[38]:
file                                 object
Message-ID                           object
Date                         datetime64[ns]
From                                 object
To                                   object
Subject                              object
Mime-Version                         object
Content-Type                         object
Content-Transfer-Encoding            object
X-From                               object
X-To                                 object
X-cc                                 object
X-bcc                                object
X-Folder                             object
X-Origin                             object
X-FileName                           object
content                              object
user                                 object
dtype: object

What should be the index? Message-ID or Date timestamp


In [39]:
# Set index and drop columns with two few values
df = df.set_index('Message-ID')\
    .drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)

In [40]:
df.head()


Out[40]:
Date From To Subject X-From X-To X-cc X-bcc X-Folder X-Origin X-FileName content user
Message-ID
<18782981.1075855378110.JavaMail.evans@thyme> 2001-05-14 23:39:00 (phillip.allen@enron.com) (tim.belden@enron.com) Phillip K Allen Tim Belden <Tim Belden/Enron@EnronXGate> \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Here is our forecast\n\n allen-p
<15464986.1075855378456.JavaMail.evans@thyme> 2001-05-04 20:51:00 (phillip.allen@enron.com) (john.lavorato@enron.com) Re: Phillip K Allen John J Lavorato <John J Lavorato/ENRON@enronXg... \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Traveling to have a business meeting takes the... allen-p
<24216240.1075855687451.JavaMail.evans@thyme> 2000-10-18 10:00:00 (phillip.allen@enron.com) (leah.arsdall@enron.com) Re: test Phillip K Allen Leah Van Arsdall \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf test successful. way to go!!! allen-p
<13505866.1075863688222.JavaMail.evans@thyme> 2000-10-23 13:13:00 (phillip.allen@enron.com) (randall.gay@enron.com) Phillip K Allen Randall L Gay \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Randy,\n\n Can you send me a schedule of the s... allen-p
<30922949.1075863688243.JavaMail.evans@thyme> 2000-08-31 12:07:00 (phillip.allen@enron.com) (greg.piper@enron.com) Re: Hello Phillip K Allen Greg Piper \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Let's shoot for Tuesday at 11:45. allen-p

Exploratory Analysis

Who was sending to whom


In [4]:
## Save as csv
df.to_csv('data/enron/enron_cleaned.csv')


-----------------------------------------------
NameError     Traceback (most recent call last)
<ipython-input-4-cafb11133188> in <module>()
      1 ## Save as csv
----> 2 df.to_csv('data/enron/enron_cleaned.csv')

NameError: name 'df' is not defined

In [2]:
df = pd.read_csv('data/enron/enron_cleaned.csv', parse_dates=[1])
df.head()


Out[2]:
Message-ID Date From To Subject X-From X-To X-cc X-bcc X-Folder X-Origin X-FileName content user
0 <18782981.1075855378110.JavaMail.evans@thyme> 2001-05-14 23:39:00 frozenset({'phillip.allen@enron.com'}) frozenset({'tim.belden@enron.com'}) NaN Phillip K Allen Tim Belden <Tim Belden/Enron@EnronXGate> NaN NaN \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Here is our forecast\n\n allen-p
1 <15464986.1075855378456.JavaMail.evans@thyme> 2001-05-04 20:51:00 frozenset({'phillip.allen@enron.com'}) frozenset({'john.lavorato@enron.com'}) Re: Phillip K Allen John J Lavorato <John J Lavorato/ENRON@enronXg... NaN NaN \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Allen-P pallen (Non-Privileged).pst Traveling to have a business meeting takes the... allen-p
2 <24216240.1075855687451.JavaMail.evans@thyme> 2000-10-18 10:00:00 frozenset({'phillip.allen@enron.com'}) frozenset({'leah.arsdall@enron.com'}) Re: test Phillip K Allen Leah Van Arsdall NaN NaN \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf test successful. way to go!!! allen-p
3 <13505866.1075863688222.JavaMail.evans@thyme> 2000-10-23 13:13:00 frozenset({'phillip.allen@enron.com'}) frozenset({'randall.gay@enron.com'}) NaN Phillip K Allen Randall L Gay NaN NaN \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Randy,\n\n Can you send me a schedule of the s... allen-p
4 <30922949.1075863688243.JavaMail.evans@thyme> 2000-08-31 12:07:00 frozenset({'phillip.allen@enron.com'}) frozenset({'greg.piper@enron.com'}) Re: Hello Phillip K Allen Greg Piper NaN NaN \Phillip_Allen_Dec2000\Notes Folders\'sent mail Allen-P pallen.nsf Let's shoot for Tuesday at 11:45. allen-p

In [3]:
df.dtypes


Out[3]:
Message-ID            object
Date          datetime64[ns]
From                  object
To                    object
Subject               object
X-From                object
X-To                  object
X-cc                  object
X-bcc                 object
X-Folder              object
X-Origin              object
X-FileName            object
content               object
user                  object
dtype: object

In [4]:
len(df.From.unique()) # how many unique senders


Out[4]:
20328

In [5]:
len(df.To.unique()) # how many unique senders


Out[5]:
55420

In [6]:
G = nx.from_pandas_dataframe(df, 'From', 'To')

In [ ]:
spring_lay = nx.spring_layout(G)

In [ ]:
nx.draw_networkx_nodes(G, spring_lay, node_size=20)
nx.draw_networkx_edges(G, spring_lay, alpha=0.4)

Email Body Content


In [16]:
def clean(text):
    stop = set(stopwords.words('english'))
    stop.update(("to","cc","subject","http","from","sent","aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    porter= PorterStemmer()
    
    text=text.rstrip()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    stop_free = " ".join([i for i in text.lower().split() if((i not in stop) and (not i.isdigit()))])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    #stem = " ".join(porter.stem(token) for token in normalized.split())
    
    return normalized

In [17]:
analysis_df = df[['Date', 'From', 'To', 'Date','content']].dropna().copy()
analysis_df.shape


Out[17]:
(495554, 5)

In [23]:
analysis_df.head()


Out[23]:
Date From To Date content
0 Mon, 14 May 2001 16:39:00 -0700 (PDT) (phillip.allen@enron.com) (tim.belden@enron.com) Mon, 14 May 2001 16:39:00 -0700 (PDT) Here is our forecast\n\n
1 Fri, 4 May 2001 13:51:00 -0700 (PDT) (phillip.allen@enron.com) (john.lavorato@enron.com) Fri, 4 May 2001 13:51:00 -0700 (PDT) Traveling to have a business meeting takes the...
2 Wed, 18 Oct 2000 03:00:00 -0700 (PDT) (phillip.allen@enron.com) (leah.arsdall@enron.com) Wed, 18 Oct 2000 03:00:00 -0700 (PDT) test successful. way to go!!!
3 Mon, 23 Oct 2000 06:13:00 -0700 (PDT) (phillip.allen@enron.com) (randall.gay@enron.com) Mon, 23 Oct 2000 06:13:00 -0700 (PDT) Randy,\n\n Can you send me a schedule of the s...
4 Thu, 31 Aug 2000 05:07:00 -0700 (PDT) (phillip.allen@enron.com) (greg.piper@enron.com) Thu, 31 Aug 2000 05:07:00 -0700 (PDT) Let's shoot for Tuesday at 11:45.

In [ ]:
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="From", data=df)
plt.xticks(rotation=45)

In [ ]:
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="To", data=df)
plt.xticks(rotation=45)

In [ ]:


In [ ]:


In [ ]:


In [29]:
df['Date'] = pd.to_datetime(analysis_df.Date)


------------------------------
ValueErrorTraceback (most recent call last)
<ipython-input-29-62ecd159cce1> in <module>()
----> 1 df['Date'] = pd.to_datetime(analysis_df.Date)

/Users/carrie/anaconda/envs/mlbook/lib/python3.5/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
    510         result = Series(values, index=arg.index, name=arg.name)
    511     elif isinstance(arg, (ABCDataFrame, MutableMapping)):
--> 512         result = _assemble_from_unit_mappings(arg, errors=errors)
    513     elif isinstance(arg, ABCIndexClass):
    514         result = _convert_listlike(arg, box, format, name=arg.name)

/Users/carrie/anaconda/envs/mlbook/lib/python3.5/site-packages/pandas/core/tools/datetimes.py in _assemble_from_unit_mappings(arg, errors)
    567     arg = DataFrame(arg)
    568     if not arg.columns.is_unique:
--> 569         raise ValueError("cannot assemble with duplicate keys")
    570 
    571     # replace passed unit with _unit_map

ValueError: cannot assemble with duplicate keys

In [19]:
test = analysis_df.set_index('Date')
test.head()


Out[19]:
From To content
Date
(Mon, 14 May 2001 16:39:00 -0700 (PDT), Mon, 14 May 2001 16:39:00 -0700 (PDT)) (phillip.allen@enron.com) (tim.belden@enron.com) Here is our forecast\n\n
(Fri, 4 May 2001 13:51:00 -0700 (PDT), Fri, 4 May 2001 13:51:00 -0700 (PDT)) (phillip.allen@enron.com) (john.lavorato@enron.com) Traveling to have a business meeting takes the...
(Wed, 18 Oct 2000 03:00:00 -0700 (PDT), Wed, 18 Oct 2000 03:00:00 -0700 (PDT)) (phillip.allen@enron.com) (leah.arsdall@enron.com) test successful. way to go!!!
(Mon, 23 Oct 2000 06:13:00 -0700 (PDT), Mon, 23 Oct 2000 06:13:00 -0700 (PDT)) (phillip.allen@enron.com) (randall.gay@enron.com) Randy,\n\n Can you send me a schedule of the s...
(Thu, 31 Aug 2000 05:07:00 -0700 (PDT), Thu, 31 Aug 2000 05:07:00 -0700 (PDT)) (phillip.allen@enron.com) (greg.piper@enron.com) Let's shoot for Tuesday at 11:45.

In [ ]: