notebook.community

Edit and run



In [1]:

    
import os, sys, email
import numpy as np 
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns; sns.set_style('whitegrid')
#import plotly
#plotly.offline.init_notebook_mode()
#import plotly.graph_objs as go
#import wordcloud

# Network analysis
#import networkx as nx
# NLP
#from nltk.tokenize.regexp import RegexpTokenizer

#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))



In [3]:

    
# Load the e-mail CSV into a DataFrame, then report its (rows, columns) shape
emails_df = pd.read_csv('data/emails.csv')
print(emails_df.shape)









    



(517401, 2)



In [4]:

    
## Helper functions
def get_text_from_email(msg):
    '''Concatenate the payloads of every text/plain part of an email object.

    The message is traversed with ``msg.walk()``; the payload of each part
    whose content type is exactly 'text/plain' is collected in traversal
    order and the pieces are joined with no separator. An empty string is
    returned when no part matches.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Turn a comma-separated address string into a frozenset of addresses.

    Every piece produced by splitting on ',' has its surrounding whitespace
    stripped before being added to the set. A falsy ``line`` (for example
    None or an empty string) yields None instead of a set.
    '''
    if not line:
        return None
    return frozenset(piece.strip() for piece in line.split(','))



In [5]:

    
# Parse each raw message string into an email message object
messages = [email.message_from_string(raw) for raw in emails_df['message']]
emails_df.drop(columns=['message'], inplace=True)

# Add one column per header field; the field names come from the first message
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [parsed[key] for parsed in messages]

# Plain-text body of every message
emails_df['content'] = [get_text_from_email(parsed) for parsed in messages]

# Replace the comma-separated address strings with sets of addresses
for address_column in ('From', 'To'):
    emails_df[address_column] = emails_df[address_column].map(split_email_addresses)

# The root (first path component) of 'file' becomes 'user'
emails_df['user'] = emails_df['file'].map(lambda path: path.split('/')[0])

# The parsed message objects are not needed past this point
del messages

emails_df.head()









    Out[5]:






  
    
      
      file
      Message-ID
      Date
      From
      To
      Subject
      Mime-Version
      Content-Type
      Content-Transfer-Encoding
      X-From
      X-To
      X-cc
      X-bcc
      X-Folder
      X-Origin
      X-FileName
      content
      user
    
  
  
    
      0
      allen-p/_sent_mail/1.
      <18782981.1075855378110.JavaMail.evans@thyme>
      Mon, 14 May 2001 16:39:00 -0700 (PDT)
      (phillip.allen@enron.com)
      (tim.belden@enron.com)
      
      1.0
      text/plain; charset=us-ascii
      7bit
      Phillip K Allen
      Tim Belden <Tim Belden/Enron@EnronXGate>
      
      
      \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...
      Allen-P
      pallen (Non-Privileged).pst
      Here is our forecast\n\n
      allen-p
    
    
      1
      allen-p/_sent_mail/10.
      <15464986.1075855378456.JavaMail.evans@thyme>
      Fri, 4 May 2001 13:51:00 -0700 (PDT)
      (phillip.allen@enron.com)
      (john.lavorato@enron.com)
      Re:
      1.0
      text/plain; charset=us-ascii
      7bit
      Phillip K Allen
      John J Lavorato <John J Lavorato/ENRON@enronXg...
      
      
      \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...
      Allen-P
      pallen (Non-Privileged).pst
      Traveling to have a business meeting takes the...
      allen-p
    
    
      2
      allen-p/_sent_mail/100.
      <24216240.1075855687451.JavaMail.evans@thyme>
      Wed, 18 Oct 2000 03:00:00 -0700 (PDT)
      (phillip.allen@enron.com)
      (leah.arsdall@enron.com)
      Re: test
      1.0
      text/plain; charset=us-ascii
      7bit
      Phillip K Allen
      Leah Van Arsdall
      
      
      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
      Allen-P
      pallen.nsf
      test successful.  way to go!!!
      allen-p
    
    
      3
      allen-p/_sent_mail/1000.
      <13505866.1075863688222.JavaMail.evans@thyme>
      Mon, 23 Oct 2000 06:13:00 -0700 (PDT)
      (phillip.allen@enron.com)
      (randall.gay@enron.com)
      
      1.0
      text/plain; charset=us-ascii
      7bit
      Phillip K Allen
      Randall L Gay
      
      
      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
      Allen-P
      pallen.nsf
      Randy,\n\n Can you send me a schedule of the s...
      allen-p
    
    
      4
      allen-p/_sent_mail/1001.
      <30922949.1075863688243.JavaMail.evans@thyme>
      Thu, 31 Aug 2000 05:07:00 -0700 (PDT)
      (phillip.allen@enron.com)
      (greg.piper@enron.com)
      Re: Hello
      1.0
      text/plain; charset=us-ascii
      7bit
      Phillip K Allen
      Greg Piper
      
      
      \Phillip_Allen_Dec2000\Notes Folders\'sent mail
      Allen-P
      pallen.nsf
      Let's shoot for Tuesday at 11:45.
      allen-p



In [6]:

    
# Write the message id, subject and body columns to a tab-separated UTF-8 file
export_columns = ['Message-ID', 'Subject', 'content']
emails_df[export_columns].to_csv('enron_subject.csv', sep='\t', encoding='utf-8')



In [ ]:

	file	Message-ID	Date	From	To	Subject	Mime-Version	Content-Type	Content-Transfer-Encoding	X-From	X-To	X-Folder	X-Origin	X-FileName	content	user
0	allen-p/_sent_mail/1.	<18782981.1075855378110.JavaMail.evans@thyme>	Mon, 14 May 2001 16:39:00 -0700 (PDT)	(phillip.allen@enron.com)	(tim.belden@enron.com)		1.0	text/plain; charset=us-ascii	7bit	Phillip K Allen	Tim Belden <Tim Belden/Enron@EnronXGate>	\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...	Allen-P	pallen (Non-Privileged).pst	Here is our forecast\n\n	allen-p
1	allen-p/_sent_mail/10.	<15464986.1075855378456.JavaMail.evans@thyme>	Fri, 4 May 2001 13:51:00 -0700 (PDT)	(phillip.allen@enron.com)	(john.lavorato@enron.com)	Re:	1.0	text/plain; charset=us-ascii	7bit	Phillip K Allen	John J Lavorato <John J Lavorato/ENRON@enronXg...	\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...	Allen-P	pallen (Non-Privileged).pst	Traveling to have a business meeting takes the...	allen-p
2	allen-p/_sent_mail/100.	<24216240.1075855687451.JavaMail.evans@thyme>	Wed, 18 Oct 2000 03:00:00 -0700 (PDT)	(phillip.allen@enron.com)	(leah.arsdall@enron.com)	Re: test	1.0	text/plain; charset=us-ascii	7bit	Phillip K Allen	Leah Van Arsdall	\Phillip_Allen_Dec2000\Notes Folders\'sent mail	Allen-P	pallen.nsf	test successful. way to go!!!	allen-p
3	allen-p/_sent_mail/1000.	<13505866.1075863688222.JavaMail.evans@thyme>	Mon, 23 Oct 2000 06:13:00 -0700 (PDT)	(phillip.allen@enron.com)	(randall.gay@enron.com)		1.0	text/plain; charset=us-ascii	7bit	Phillip K Allen	Randall L Gay	\Phillip_Allen_Dec2000\Notes Folders\'sent mail	Allen-P	pallen.nsf	Randy,\n\n Can you send me a schedule of the s...	allen-p
4	allen-p/_sent_mail/1001.	<30922949.1075863688243.JavaMail.evans@thyme>	Thu, 31 Aug 2000 05:07:00 -0700 (PDT)	(phillip.allen@enron.com)	(greg.piper@enron.com)	Re: Hello	1.0	text/plain; charset=us-ascii	7bit	Phillip K Allen	Greg Piper	\Phillip_Allen_Dec2000\Notes Folders\'sent mail	Allen-P	pallen.nsf	Let's shoot for Tuesday at 11:45.	allen-p