In [6]:
# NOTE(review): hard-coded, machine-specific absolute path. The directory walk
# further down starts from '.', so it only finds the mail files if this
# directory change has been executed first in the same kernel session.
%cd "C:\Users\eshatiw\Desktop\enron\enron_mail_20110402\maildir"
# Echo the resulting working directory as a sanity check.
%pwd
Out[6]:
In [7]:
#%cd "C:\Users\eshatiw\Documents\IPython Notebooks\a"
# Tally tables that find_emails() updates in place: each key is the sender or
# recipient string that find_emails() extracts from a message, and each value
# is an integer count of how many times that string has been seen so far.
email_dict_from = dict()  # counts per sender
email_dict_to = dict()    # counts per recipient
In [8]:
def find_emails(email_dict_to, email_dict_from, input_file):
    """Tally the sender and the recipients of one mail file.

    Reads the ``From:`` and ``To:`` headers of ``input_file`` and increments
    the matching counters in the two dictionaries, which are modified in place.

    Args:
        email_dict_to: dict mapping recipient address -> count. One is added
            for every address listed in the ``To:`` header (an address that
            is listed twice is counted twice).
        email_dict_from: dict mapping sender address -> count. One is added
            for the address found in the ``From:`` header.
        input_file: path of the message file to read.

    Returns:
        None. A message without a usable ``From:`` address or without any
        usable ``To:`` address is skipped and leaves both dictionaries
        untouched.
    """
    # Imported locally so the function does not depend on some other cell
    # having imported these names before it is called.
    import io
    import re
    from email.utils import getaddresses

    # latin-1 maps every possible byte to a character, so reading cannot fail
    # on messages that contain non-ASCII bytes.
    with io.open(input_file, "r", encoding="latin-1") as f:
        text = f.read()

    # The headers end at the first blank line. Looking only at them keeps
    # "From:" / "To:" lines quoted in a forwarded or replied-to body from
    # being mistaken for the real headers.
    header_block = text.split("\n\n", 1)[0]
    # Long headers are folded onto continuation lines that start with a space
    # or tab; join them back into one logical line per header.
    header_block = re.sub(r"\n[ \t]+", " ", header_block)

    def header_value(name):
        # Anchoring at the start of a line keeps "X-From:" / "X-To:" from
        # matching when "From:" / "To:" is requested.
        match = re.search(r"^%s:(.*)$" % re.escape(name), header_block,
                          re.IGNORECASE | re.MULTILINE)
        return match.group(1).strip() if match else ""

    # getaddresses handles comma-separated lists, display names and angle
    # brackets; entries it cannot parse come back with an empty address.
    senders = [addr for _, addr in getaddresses([header_value("From")]) if addr]
    to_list = [addr for _, addr in getaddresses([header_value("To")]) if addr]
    if not senders or not to_list:
        return

    from_email = senders[0]
    email_dict_from[from_email] = email_dict_from.get(from_email, 0) + 1
    for email in to_list:
        email_dict_to[email] = email_dict_to.get(email, 0) + 1
In [9]:
import os
import re
# Visit every file below the current working directory and pass each one to
# find_emails, which updates the two tally dictionaries in place.
rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
    message_paths = [os.path.join(dirName, fname) for fname in fileList]
    for message_path in message_paths:
        find_emails(email_dict_to, email_dict_from, message_path)
In [15]:
# Print the 20 recipient keys with the highest counts, most frequent first,
# one "key count" pair per line.
from collections import OrderedDict
from operator import itemgetter

# Order the tally by count (the second element of each item), descending.
sorted_to = OrderedDict(sorted(email_dict_to.items(), key=itemgetter(1), reverse=True))
# list(...)[:20] and a single-argument print(...) give the same output on
# Python 2 and Python 3, so the cell runs unchanged on either.
for s, count in list(sorted_to.items())[:20]:
    print("%s %s" % (s, count))
In [16]:
# Print the 20 sender keys with the highest counts, most frequent first,
# one "key count" pair per line.
from collections import OrderedDict
from operator import itemgetter

# Order the tally by count (the second element of each item), descending.
sorted_from = OrderedDict(sorted(email_dict_from.items(), key=itemgetter(1), reverse=True))
# list(...)[:20] and a single-argument print(...) give the same output on
# Python 2 and Python 3, so the cell runs unchanged on either.
for s, count in list(sorted_from.items())[:20]:
    print("%s %s" % (s, count))
In [ ]: