In [6]:
# Change the working directory to the maildir folder; the directory walk in a
# later cell starts from '.', so it scans whatever directory is current here.
# NOTE(review): this is a machine-specific absolute path - adjust it before
# running the notebook on another machine.
%cd "C:\Users\eshatiw\Desktop\enron\enron_mail_20110402\maildir"
%pwd


C:\Users\eshatiw\Desktop\enron\enron_mail_20110402\maildir
Out[6]:
u'C:\\Users\\eshatiw\\Desktop\\enron\\enron_mail_20110402\\maildir'

In [7]:
# Tallies filled in by find_emails. Each key is a domain string captured from
# a header (it starts with "@", e.g. "@enron.com") and each value is the
# integer number of messages counted for that domain.
email_dict_from = dict()  # domains captured from "From:" headers
email_dict_to = dict()    # domains captured from "To:" headers

In [8]:
def find_emails(email_dict_to, email_dict_from, input_file):
    """Tally the sender and recipient e-mail domains found in one message file.

    The file is read whole and all tabs and newlines are removed, so the
    header lines run together. The domain (everything from "@" onward) is
    then captured from a "From:" address that is immediately followed by
    "To", and from a "To:" address that is immediately followed by "Subject".

    Args:
        email_dict_to: dict mapping recipient domain -> count; updated in place.
        email_dict_from: dict mapping sender domain -> count; updated in place.
        input_file: path of the message file to scan.

    Returns:
        None. When either header fails to match, the file is skipped and
        neither dict is modified.

    Note:
        The patterns accept only word characters before the "@" and only word
        characters and dots after it. Addresses with "." or "-" in the local
        part, and "To:" headers listing several recipients, therefore do not
        match and the file is skipped.
    """
    with open(input_file, "r") as f:
        text = f.read()
    # Collapse the message onto one line so a header value is directly
    # followed by the name of the next header.
    text = text.replace("\t", "").replace("\n", "")

    # Raw strings keep "\w" as a regex escape instead of a string escape.
    from_match = re.search(r"From: [\w]*([@][\w.]*)To", text)
    to_match = re.search(r"To: [\w]*([@][\w.]*)Subject", text)

    # Test for "no match" explicitly rather than with a bare except, so that
    # unrelated errors (KeyboardInterrupt, a missing import, ...) still surface.
    if from_match is None or to_match is None:
        return

    from_email = from_match.group(1)
    # The captured text holds only "@", word characters and dots, so today this
    # yields a single entry; the split/strip is kept for comma-separated lists.
    to_list = [t.replace(" ", "") for t in to_match.group(1).split(",")]

    email_dict_from[from_email] = email_dict_from.get(from_email, 0) + 1
    for email in to_list:
        email_dict_to[email] = email_dict_to.get(email, 0) + 1

In [9]:
import os
import re

rootDir = '.'

# Empty both tallies (in place) before walking, so that re-running this cell
# does not add a second pass on top of the counts from the previous run.
email_dict_to.clear()
email_dict_from.clear()

# Visit every file below the current working directory and tally its headers.
for dirName, subdirList, fileList in os.walk(rootDir):
    for fname in fileList:
        find_emails(email_dict_to, email_dict_from, os.path.join(dirName, fname))

In [15]:
from collections import OrderedDict
from operator import itemgetter

# Recipient domains ordered from most to least frequently counted.
sorted_to = OrderedDict(sorted(email_dict_to.items(), key=itemgetter(1), reverse=True))

# Print the 20 most frequent recipient domains with their counts.
# list(...) and the single-argument print(...) form run on both Python 2 and
# Python 3 and produce the same "domain count" lines as before.
for s in list(sorted_to)[:20]:
    print("%s %s" % (s, sorted_to[s]))


@enron.com 10320
@ect.enron.com 1869
@carrfut.com 1259
@caiso.com 632
@list.intcx.com 607
@aol.com 527
@nyiso.com 484
@wordsmith.org 454
@yahoogroups.com 434
@pira.com 387
@svmg.org 339
@haas.berkeley.edu 338
@egroups.com 308
@nisource.com 298
@nymex.com 283
@hotmail.com 230
@industrialinfo.com 224
@cera.com 158
@isda.org 157
@mailman.enron.com 140

In [16]:
from collections import OrderedDict
from operator import itemgetter

# Sender domains ordered from most to least frequently counted.
sorted_from = OrderedDict(sorted(email_dict_from.items(), key=itemgetter(1), reverse=True))

# Print the 20 most frequent sender domains with their counts.
# list(...) and the single-argument print(...) form run on both Python 2 and
# Python 3 and produce the same "domain count" lines as before.
for s in list(sorted_from)[:20]:
    print("%s %s" % (s, sorted_from[s]))


@carrfut.com 1262
@aol.com 1088
@caiso.com 636
@enron.com 622
@intcx.com 613
@hotmail.com 528
@nyiso.com 527
@yahoo.com 459
@wordsmith.org 454
@pira.com 387
@svmg.org 341
@nisource.com 299
@cheatsheets.net 286
@nymex.com 284
@earthlink.net 281
@venturewire.com 270
@industrialinfo.com 224
@ccomad3.uu.commissioner.com 217
@datek.com 214
@cera.com 213

In [ ]: