In [2]:
%matplotlib inline

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
vectorizer = CountVectorizer()

In [5]:
from nltk.corpus import stopwords

In [6]:
sw = stopwords.words('english')

In [7]:
len(sw)


Out[7]:
127

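As a quick illustration of what these stopwords are for, a minimal sketch (the sample sentence is made up) that drops them from a tokenized string in the same session:

sample = "the lawyers will not be responsive to any of the requests"
print [w for w in sample.split() if w not in sw]
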
In [8]:
from nltk.stem.snowball import SnowballStemmer

In [9]:
stemmer = SnowballStemmer('english')

In [10]:
stemmer.stem('unresponsive')


Out[10]:
u'unrespons'

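Related word forms generally collapse to the same stem, which is why stemming before vectorization shrinks the vocabulary; a small sketch (the word list is arbitrary):

for w in ['responsive', 'responsiveness', 'unresponsive']:
    print w, '->', stemmer.stem(w)
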
In [10]:
#%load ../ud120-projects/tools/parse_out_email_text.py

In [11]:
#%%writefile ../ud120-projects/tools/parse_out_email_text.py
#!/usr/bin/python

from nltk.stem.snowball import SnowballStemmer
import string

def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### project part 2: comment out the line below
        # words = text_string
        
        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        stemmer = SnowballStemmer('english')
        
        text_string = text_string.split()

        words = [stemmer.stem(word).strip() for word in text_string if word != ""]
        
        words = " ".join(words)
        
    return words

    

def main():
    ff = open("../ud120-projects/text_learning/test_email.txt", "r")
    text = parseOutText(ff)
    print text



if __name__ == '__main__':
    main()


hi everyon if you can read this messag your proper use parseouttext pleas proceed to the next part of the project

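The punctuation stripping above uses the two-argument Python 2 form of str.translate together with string.maketrans; under Python 3 the corresponding line inside parseOutText would look roughly like this (a sketch, not what was run above):

text_string = content[1].translate(str.maketrans("", "", string.punctuation))
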
In [12]:
#%load ../ud120-projects/text_learning/vectorize_text.py

In [74]:
#%%writefile ../ud120-projects/text_learning/vectorize_text.py
#!/usr/bin/python

import os
import pickle
import re
import sys
from nltk.stem.snowball import SnowballStemmer

sys.path.append( "../ud120-projects/tools/" )
from parse_out_email_text import parseOutText

"""
    starter code to process the emails from Sara and Chris to extract
    the features and get the documents ready for classification

    the list of all the emails from Sara are in the from_sara list
    likewise for emails from Chris (from_chris)

    the actual documents are in the Enron email dataset, which
    you downloaded/unpacked in Part 0 of the first mini-project

    the data is stored in lists and packed away in pickle files at the end

"""


from_sara  = open("../ud120-projects/text_learning/from_sara.txt", "r")
from_chris = open("../ud120-projects/text_learning/from_chris.txt", "r")

from_data = []
word_data = []

### temp_counter is a way to speed up the development--there are
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list
temp_counter = 0
text = ""

# sw = stopwords.words('english')

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### the temp_counter limit (only look at the first 200 emails) was
        ### used while developing; it is left commented out here so the loop
        ### runs over the full dataset
        #temp_counter += 1
        #if temp_counter < 200:
        path = os.path.join('../ud120-projects/', path[:-1])
        #print path
        email = open(path, "r")
        ### use parseOutText to extract the text from the opened email
        text = parseOutText(email)
        # print text
        ### use str.replace() to remove any instances of the signature words below
        remove_these = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]
        for word in remove_these:
            text = text.replace(word, "")

        # remove nltk stopwords:
        # text = ' '.join([word for word in text.split() if word not in sw])

        ### append the text to word_data, and a 0 to from_data if the email
        ### is from Sara, or a 1 if it is from Chris
        word_data.append(text)
        if name == 'sara':
            from_data.append(0)
        elif name == 'chris':
            from_data.append(1)

        #print word_data
        #print from_data
        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("../ud120-projects/text_learning/your_word_data.pkl", "w") )
pickle.dump( from_data, open("../ud120-projects/text_learning/your_email_authors.pkl", "w") )


### in Part 4, do TfIdf vectorization here


emails processed

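word_data and from_data are appended in lockstep, so their lengths should match; a quick sanity check in the same session:

print len(word_data), len(from_data)
assert len(word_data) == len(from_data)
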
In [75]:
s = pickle.load(open("../ud120-projects/text_learning/your_word_data.pkl"))
print s[152]


tjonesnsf stephani and sam need nymex calendar

In [76]:
s = pickle.load(open("../ud120-projects/text_learning/your_email_authors.pkl"))
print s[16000:16060]


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [43]:
#from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.feature_extraction.text import CountVectorizer

In [44]:
#vectorizer = CountVectorizer()
#bow = vectorizer.fit(word_data)
#bow = vectorizer.transform(word_data)

In [45]:
#transformer = TfidfTransformer()
#tfidf = transformer.fit_transform(bow)

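For reference, the two-step route sketched in the commented cells above (CountVectorizer followed by TfidfTransformer) builds the same kind of tf-idf matrix as the single TfidfVectorizer used below, assuming matching parameters; a minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer()
bow = count_vect.fit_transform(word_data)          # raw term counts per email
tfidf_two_step = transformer.fit_transform(bow)    # counts reweighted by tf-idf
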
In [65]:
#tfidf.toarray()  
word_data[152]


Out[65]:
u'tjonesnsf stephani and sam need nymex calendar'

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df=1, stop_words='english')
tfidf = vect.fit_transform(word_data)

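The fitted result is a sparse document-term matrix; its shape gives the number of emails and the size of the vocabulary (a quick check in the same session):

print tfidf.shape     # (number of emails, number of vocabulary terms)
print tfidf.getnnz()  # non-zero entries in the sparse matrix
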
In [67]:
# tfidf.toarray()
len(vect.get_feature_names())


Out[67]:
38757

In [68]:
vect.get_feature_names()[34597]


Out[68]:
u'stephaniethank'

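Going the other way, from a token to its column index, the fitted vectorizer exposes a vocabulary_ dict; given the output above, this lookup should return 34597:

print vect.vocabulary_['stephaniethank']
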
In [ ]: