In [37]:
# Make the extracted Enron maildir the working directory, then echo it back.
# The corpus walk further down starts from '.', so it depends on this cell.
# NOTE(review): this is an absolute path on one particular machine — change it
# (or move it into a single configurable constant) before running elsewhere.
%cd "C:\Users\eshatiw\Desktop\enron\enron_mail_20110402\maildir"
%pwd


C:\Users\eshatiw\Desktop\enron\enron_mail_20110402\maildir
Out[37]:
u'C:\\Users\\eshatiw\\Desktop\\enron\\enron_mail_20110402\\maildir'

In [38]:
#%cd "C:\Users\eshatiw\Documents\IPython Notebooks\a"

# Running tallies keyed by e-mail address. Each value is a plain integer:
# the number of messages counted so far for that address.
email_dict_to = dict()    # recipient address -> messages addressed to it
email_dict_from = dict()  # sender address -> messages sent from it

In [50]:
def find_emails(email_dict_to, email_dict_from, input_file):
    """Tally the sender and the recipients of a single message file.

    The file is read in full, tabs and newlines are removed so that a
    recipient list spread over several lines becomes one run of text, and
    the sender and recipient list are extracted with two regular
    expressions. The matching counters are then incremented in place.

    Args:
        email_dict_to: dict mapping recipient address -> message count.
            Mutated in place.
        email_dict_from: dict mapping sender address -> message count.
            Mutated in place.
        input_file: path of the message file to read.

    Returns:
        None. When either pattern fails to match, the file is skipped and
        neither dict is modified.

    Raises:
        Whatever ``open``/``read`` raise for an unreadable file; those errors
        are not caught here.
    """
    with open(input_file, "r") as f:
        text = f.read()
    text = text.replace("\t", "").replace("\n", "")

    # NOTE(review): both patterns are applied to the whole file contents, not
    # just the header section, so a match can in principle come from quoted
    # text in the body — confirm that is acceptable for this analysis.
    from_match = re.search(r"From: ([\w@,. \-'<>]*)To", text)
    to_match = re.search(r"To: ([\w@,. \-'<>]*)Subject", text)

    # Explicit "no match" check instead of a bare ``except:``, which would
    # also hide unrelated failures (e.g. a missing ``re`` import) and swallow
    # KeyboardInterrupt during a long directory walk.
    if from_match is None or to_match is None:
        return

    from_email = from_match.group(1)

    # Splitting on "," leaves a leading space on every address after the
    # first, so all spaces are removed from each entry.
    recipients = [addr.replace(" ", "") for addr in to_match.group(1).split(",")]

    email_dict_from[from_email] = email_dict_from.get(from_email, 0) + 1

    for email in recipients:
        # An empty recipient list or a trailing comma yields "" entries;
        # those are not addresses and must not be counted.
        if not email:
            continue
        email_dict_to[email] = email_dict_to.get(email, 0) + 1

In [51]:
import os
import re

# Visit every file beneath the current working directory and feed it to
# find_emails, which updates the two tally dicts in place.
rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
    for fname in fileList:
        message_path = os.path.join(dirName, fname)
        find_emails(email_dict_to, email_dict_from, message_path)

In [52]:
from collections import OrderedDict
from operator import itemgetter

# Recipients ordered from most to fewest messages received. Kept as an
# OrderedDict so the full ranking stays available after this cell.
sorted_to = OrderedDict(sorted(email_dict_to.items(), key=itemgetter(1), reverse=True))

# Show the 15 most frequent recipients. list(...)[:15] and the single-argument
# print(...) form give the same output on Python 2 and also work on Python 3,
# where dict.keys() can no longer be sliced and print is a function.
for s in list(sorted_to)[:15]:
    print("%s %s" % (s, sorted_to[s]))


richard.shapiro@enron.com 15164
jeff.dasovich@enron.com 14199
tana.jones@enron.com 12807
steven.kean@enron.com 12757
sara.shackleton@enron.com 11414
james.steffes@enron.com 10365
mark.taylor@enron.com 9791
pete.davis@enron.com 9281
susan.mara@enron.com 9056
paul.kaufman@enron.com 8531
louise.kitchen@enron.com 8327
tim.belden@enron.com 7971
john.lavorato@enron.com 7441
sally.beck@enron.com 7311
gerald.nemec@enron.com 6707

In [53]:
from collections import OrderedDict
from operator import itemgetter

# Senders ordered from most to fewest messages sent. Kept as an OrderedDict
# so the full ranking stays available after this cell.
sorted_from = OrderedDict(sorted(email_dict_from.items(), key=itemgetter(1), reverse=True))

# Show the 15 most frequent senders. list(...)[:15] and the single-argument
# print(...) form give the same output on Python 2 and also work on Python 3,
# where dict.keys() can no longer be sliced and print is a function.
for s in list(sorted_from)[:15]:
    print("%s %s" % (s, sorted_from[s]))


kay.mann@enron.com 16721
vince.kaminski@enron.com 14329
jeff.dasovich@enron.com 11386
pete.davis@enron.com 9149
sara.shackleton@enron.com 8756
chris.germany@enron.com 8740
enron.announcements@enron.com 8521
tana.jones@enron.com 8447
john.arnold@enron.com 7049
steven.kean@enron.com 6220
kate.symes@enron.com 5435
matthew.lenhart@enron.com 5256
eric.bass@enron.com 5154
debra.perlingiere@enron.com 4355
sally.beck@enron.com 4331

In [ ]: