In [1]:
import os, sys, email, re
import pandas as pd
import numpy as np
np.random.seed(42)
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [32]:
df = pd.read_csv('data/enron/emails.csv', header=0)
df.head()
Out[32]:
In [33]:
list(df) # show all attributes
Out[33]:
In [34]:
print(df['message'][0])
In [35]:
df.shape
Out[35]:
In [36]:
## Helper functions
def get_text_from_email(msg):
    '''Extract the plain-text content from a parsed email object.'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append(part.get_payload())
    return ''.join(parts)

def split_email_addresses(line):
    '''Split a comma-separated header line into a frozenset of email addresses.'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs
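A quick sanity check of the two helpers on the first raw message (a minimal sketch; run it before the message column is dropped in the next cell):
In [ ]:
# Parse one raw message and exercise both helpers
sample = email.message_from_string(df['message'][0])
print(get_text_from_email(sample)[:200])    # first 200 characters of the body
print(split_email_addresses(sample['To']))  # frozenset of recipient addresses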
In [37]:
# Parse the raw messages into a list of email objects
messages = list(map(email.message_from_string, df['message']))
df.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
df[key] = [doc[key] for doc in messages]
# Parse content from emails
df['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
df['From'] = df['From'].map(split_email_addresses)
df['To'] = df['To'].map(split_email_addresses)
# Extract the root of 'file' as 'user'
df['user'] = df['file'].map(lambda x:x.split('/')[0])
del messages
df.head()
Out[37]:
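Before deciding which columns to keep, a quick look at how many values are missing in each parsed field (sketch):
In [ ]:
# Missing values per parsed field
df.isnull().sum()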
In [38]:
# Parse datetime
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df.dtypes
Out[38]:
What should be the index: Message-ID or the Date timestamp?
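One quick way to decide (sketch): Message-ID should be unique per email, while many emails share a timestamp.
In [ ]:
# Message-ID uniqueness vs Date uniqueness
print(df['Message-ID'].is_unique)
print(df['Date'].is_unique)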
In [39]:
# Set index and drop columns with too few distinct values
df = df.set_index('Message-ID')\
.drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)
In [40]:
df.head()
Out[40]:
In [4]:
## Save as csv
df.to_csv('data/enron/enron_cleaned.csv')
In [2]:
df = pd.read_csv('data/enron/enron_cleaned.csv', parse_dates=[1])
df.head()
Out[2]:
In [3]:
df.dtypes
Out[3]:
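dtypes shows that From and To came back from the CSV as plain strings (the frozenset reprs). A sketch of pulling the addresses back out with a regex; extract_addresses is a hypothetical helper, not part of the original notebook:
In [ ]:
# Recover a frozenset of addresses from the stored repr string, e.g.
# "frozenset({'phillip.allen@enron.com'})"
def extract_addresses(cell):
    if pd.isnull(cell):
        return None
    return frozenset(re.findall(r"[\w\.-]+@[\w\.-]+", cell))

extract_addresses(df['From'].iloc[0])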
In [4]:
len(df.From.unique()) # how many unique senders
Out[4]:
In [5]:
len(df.To.unique()) # how many unique recipient sets
Out[5]:
In [6]:
# networkx >= 2.0 renamed from_pandas_dataframe to from_pandas_edgelist
G = nx.from_pandas_edgelist(df, source='From', target='To')
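Before attempting a layout, it is worth checking how big the graph actually is (a quick sketch):
In [ ]:
# Basic size of the sender/recipient graph
print(G.number_of_nodes(), 'nodes')
print(G.number_of_edges(), 'edges')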
In [ ]:
spring_lay = nx.spring_layout(G)
In [ ]:
nx.draw_networkx_nodes(G, spring_lay, node_size=20)
nx.draw_networkx_edges(G, spring_lay, alpha=0.4)
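Drawing the full spring layout is slow and cluttered at this scale; as a lighter-weight sketch (assuming networkx 2.x), the busiest nodes can be read straight off the degree view:
In [ ]:
# Ten highest-degree nodes, i.e. addresses with the most distinct correspondents
sorted(G.degree, key=lambda kv: kv[1], reverse=True)[:10]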
In [16]:
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

def clean(text):
    '''Lower-case, strip stopwords/punctuation/digits, and lemmatize an email body.'''
    stop = set(stopwords.words('english'))
    stop.update(("to", "cc", "subject", "http", "from", "sent",
                 "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    porter = PorterStemmer()
    text = text.rstrip()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    stop_free = " ".join(i for i in text.lower().split() if (i not in stop) and (not i.isdigit()))
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    #stem = " ".join(porter.stem(token) for token in normalized.split())
    return normalized
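clean() relies on the NLTK stopword list and WordNet data; a minimal spot check on one email body (sketch; the downloads are one-time):
In [ ]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
clean(df['content'].iloc[0])[:200]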
In [17]:
analysis_df = df[['Date', 'From', 'To', 'content']].dropna().copy()
analysis_df.shape
Out[17]:
In [23]:
analysis_df.head()
Out[23]:
In [ ]:
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="From", data=df)
plt.xticks(rotation=45)
In [ ]:
fig, ax = plt.subplots(figsize=(25, 10))
sns.countplot(ax=ax, x="To", data=df)
plt.xticks(rotation=45)
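An unrestricted countplot over every sender is unreadable at this scale; a sketch of a top-20 variant built from value_counts:
In [ ]:
# Restrict to the 20 most frequent senders
top_senders = df['From'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(ax=ax, x=top_senders.index, y=top_senders.values)
plt.xticks(rotation=45)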
In [29]:
# Make sure Date is a datetime before using it as an index
analysis_df['Date'] = pd.to_datetime(analysis_df['Date'])
In [19]:
test = analysis_df.set_index('Date')
test.head()
Out[19]:
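With a DatetimeIndex in place, email volume over time is a one-liner (sketch; the raw Enron dates include a few implausible years that may stretch the x-axis):
In [ ]:
# Monthly email counts
test['content'].resample('M').count().plot(figsize=(15, 5))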