In [10]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import random
Get the data set from here: http://www.aueb.gr/users/ion/data/enron-spam/
In [3]:
# Root folder of the extracted Enron-Spam data set (it holds folders such as
# enron1\ham). Set the ENRON_SPAM_DIR environment variable to point at your
# own copy; the original author's location is kept as the fallback so existing
# setups keep working unchanged.
rootdir = os.environ.get(
    "ENRON_SPAM_DIR",
    "C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam",
)
In [4]:
# Loop through all the directories, subdirectories and files in the above folder, and print them.
# For files, print the number of files.
In [6]:
# os.path.split() returns a (head, tail) pair, where tail is the last path
# component. With Windows path handling (backslash is a separator) the tail
# here is "ham", which is how a folder can be recognised as ham or spam.
os.path.split("C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1\\ham")
Out[6]:
In [7]:
# Same as before, but only print the ham and spam folders
In [8]:
ham_list = []
spam_list = []
# Exercise: same loop as before, but this time read each file and append its
# contents to the ham list or the spam list.
# You WILL get a Unicode error for the spam part - either Google it, or look at my solution.
# NOTE: until that loop is written both lists are still empty, so the two
# prints below raise IndexError.
print(ham_list[0])
print(spam_list[0])
In [11]:
# Turns a list of words into a dictionary of the form
# {Word1: True, Word2: True, Word3: True}.
def create_word_features(words, stop_words=None):
    """Build a bag-of-words feature dictionary from an iterable of words.

    Every distinct word becomes a key mapped to ``True``; repeated words
    collapse into a single key, and keys keep first-seen order.

    Args:
        words: Iterable of word strings (for example, the tokens of one email).
        stop_words: Optional iterable of words to leave out of the result,
            e.g. ``stopwords.words("english")``. Build that list once and
            pass it in rather than rebuilding it per call. ``None`` (the
            default) keeps every word, since stop-word removal is optional.

    Returns:
        dict: ``{word: True}`` for each kept word; empty for empty input.
    """
    # A set gives constant-time membership checks regardless of list length.
    excluded = frozenset(stop_words) if stop_words is not None else frozenset()
    return {word: True for word in words if word not in excluded}
create_word_features(["the", "quick", "brown", "quick", "a", "fox"])
Out[11]:
In [ ]:
# These rebind ham_list / spam_list, discarding whatever an earlier cell stored in them.
ham_list = []
spam_list = []
# Exercise: same file-reading loop as before, but this time:
# 1. Break the sentences into words using word_tokenize
# 2. Use the create_word_features() function you just wrote
# NOTE: until that loop is written both lists are still empty, so the two
# prints below raise IndexError.
print(ham_list[0])
print(spam_list[0])
In [ ]:
# Merge both lists into one, then shuffle so ham and spam entries are
# interleaved instead of sitting in two contiguous runs.
combined_list = ham_list + spam_list
# A dedicated, seeded generator makes the shuffle (and everything computed
# from it afterwards) identical on every Restart & Run All, without touching
# the global random state.
random.Random(42).shuffle(combined_list)
In [ ]:
# Split combined_list into a training section and a test section.
# Use 70% of the data for training and the remaining 30% for testing.
In [ ]:
# Exercise: create the Naive Bayes filter (NaiveBayesClassifier is imported at
# the top) from the training data.
# Then find its accuracy using the test data and store the result in `accuracy`.
# NOTE: `accuracy` is not defined in the cells shown here, so the print below
# raises NameError until that code is written. It is presumably a fraction
# between 0 and 1, since it is multiplied by 100 for display - confirm.
print("Accuracy is: ", accuracy * 100)
In [ ]:
# Ask the trained classifier for its 20 most informative features.
# `classifier` is not defined in the cells shown here; it is presumably the
# NaiveBayesClassifier trained in the previous exercise cell - confirm.
classifier.show_most_informative_features(20)
In [ ]:
# Classify the messages below as spam or ham
# Hint: 1. Break into words using word_tokenize
# 2. create_word_features
# 3. Use the classify function
msg1 = '''Hello th̓ere seُx master :-)
i need c0ck ri͏ght noِw ..͏. don't tell my hǔbbٚy.ٚ. ))
My sc͕rٞeٚe̻nname is Dorry.
My accֺo֔unt is h֯ere: http:nxusxbnd.GirlsBadoo.ru
C u late٘r!'''
msg2 = '''As one of our top customers we are providing 10% OFF the total of your next used book purchase from www.letthestoriesliveon.com. Please use the promotional code, TOPTENOFF at checkout. Limited to 1 use per customer. All books have free shipping within the contiguous 48 United States and there is no minimum purchase.
We have millions of used books in stock that are up to 90% off MRSP and add tens of thousands of new items every day. Don’t forget to check back frequently for new arrivals.'''
msg3 = '''To start off, I have a 6 new videos + transcripts in the members section. In it, we analyse the Enron email dataset, half a million files, spread over 2.5GB. It's about 1.5 hours of video.
I have also created a Conda environment for running the code (both free and member lessons). This is to ensure everyone is running the same version of libraries, preventing the Works on my machine problems. If you get a second, do you mind trying it here?'''