In [ ]:
import os
import pickle
import re
import sys

The paths of all the emails from Sara are listed in from_sara, and likewise the emails from Chris are listed in from_chris. The actual documents are in the Enron email dataset, which you downloaded and unpacked. The extracted data is stored in lists and packed away in pickle files at the end.

In [ ]:
# Handles onto the two index files. Each line of an index file is later
# used as the path of one email, so both handles stay open for the
# processing loop that iterates over them.
from_sara = open('../text_learning/from_sara.txt', "r")
from_chris = open('../text_learning/from_chris.txt', "r")

# Accumulators for the processing loop: one label and one text per email.
from_data, word_data = [], []

In [ ]:
from nltk.stem.snowball import SnowballStemmer
import string

# NOTE(review): absolute, machine-specific path -- point this at your own
# copy of the lyrics directory before running the cell.
filePath = '/Users/omojumiller/mycode/hiphopathy/HipHopDataExploration/JayZ/'

# Read the whole lyrics file; the with-block closes the handle afterwards.
with open(filePath + "JayZ_American Gangster_American Gangster.txt", "r") as f:
    all_text = f.read()

# Split on the metadata marker; text without the marker stays in one piece.
content = all_text.split("X-FileName:")
stemmer = SnowballStemmer("english")
text_string = content

# Collect the words of every segment (the loop used to keep only the words
# of the last segment), then stem each of them.
words = []
for sentence in text_string:
    words.extend(sentence.split())
stemmed_words = [stemmer.stem(word) for word in words]

In [ ]:
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top, stem every word, and return a string
        that contains all the stemmed words (space-separated)

        f must support seek(0) and read(). The text used is what follows
        the first "X-FileName:" marker (up to the next marker, if any);
        when the marker is absent an empty string is returned.

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation (the regex form runs on Python 2 and 3,
        ### unlike the two-argument str.translate call)
        punctuation_class = "[%s]" % re.escape(string.punctuation)
        text_string = re.sub(punctuation_class, "", content[1])

        ### the stemmer is only needed when there is text to stem
        stemmer = SnowballStemmer("english")

        ### split the text string into individual words, stem each word,
        ### and join the stems with a single space between them
        words = ' '.join([stemmer.stem(word) for word in text_string.split()])
    return words

In [ ]:
### run the parser on the sample email; the with-block closes the file
### once the text has been extracted
with open("../text_learning/test_email.txt", "r") as ff:
    text = parseOutText(ff)
print(text)

temp_counter is a way to speed up development: there are thousands of emails from Sara and Chris, so running over all of them can take a long time. temp_counter lets you look at only the first 200 emails in the list so you can iterate on your modifications more quickly.

In [ ]:
### set max_emails to a small number (e.g. 200) while developing so that
### only the first emails are processed; None runs over the full dataset
max_emails = None
temp_counter = 0  ### number of emails processed so far

### words removed from every email's text with str.replace()
replaceWords = ["sara", "shackleton", "chris", "germani"]

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    ### 0 marks an email from Sara, 1 an email from Chris
    label = 0 if name == "sara" else 1

    for path in from_person:
        if max_emails is not None and temp_counter >= max_emails:
            break

        ### drop the line ending only (a final line without one is kept
        ### intact) and skip blank lines
        path = path.rstrip("\r\n")
        if not path:
            continue
        temp_counter += 1

        ### use parseOutText to extract the text from the opened email;
        ### the with-block closes each email file after it is read
        with open(os.path.join('..', path), "r") as email:
            text = parseOutText(email)

        for word in replaceWords:
            text = text.replace(word, '')

        ### keep text and label in step: one entry in each list per email
        word_data.append(text)
        from_data.append(label)

print("emails processed")

In [ ]:


  • Tf: term frequency
  • Idf: inverse document frequency

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

### TF-IDF over the collected email texts: English stop words are dropped
### and the text is lower-cased before counting
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
### NOTE(review): the right-hand side was missing; learning the vocabulary
### from word_data and building the document-term matrix in one call is
### assumed here -- confirm
bag_of_words = vectorizer.fit_transform(word_data)

How many different words are there?

In [ ]:

What is word number 34597 in your TfIdf?

In [ ]: