notebook.community

Edit and run



In [10]:

    
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import random

Get the data set from here: http://www.aueb.gr/users/ion/data/enron-spam/



In [3]:

    
# Root folder of the extracted Enron spam data set.
# Set the ENRON_SPAM_DIR environment variable to point at your own copy of the
# data; when it is not set, the original hard-coded location is used.
rootdir = os.environ.get(
    "ENRON_SPAM_DIR",
    "C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam",
)



In [4]:

    
# Loop through all the directories, sub directories and files in the above folder, and print them.

# For files, print number of files.









    



C:\Users\Shantnu\Desktop\Data Sources\Enron Spam ['enron1', 'enron2', 'enron3', 'enron4', 'enron5', 'enron6'] 0
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\ham [] 3672
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\ham [] 4361
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\spam [] 1496
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\ham [] 4012
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\spam [] 4500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\spam [] 3675
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\spam [] 4500



In [6]:

    
# os.path.split() returns (path up to the last component, last component), so the
# second item is the folder name ('ham' here) that identifies the kind of email.
os.path.split("C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1\\ham")









    Out[6]:





('C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1', 'ham')



In [7]:

    
# Same as before, but only print the ham and spam folders









    



C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\ham [] 3672
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\ham [] 4361
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\spam [] 1496
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\ham [] 4012
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\spam [] 4500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\spam [] 3675
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\spam [] 4500



In [8]:

    
# Lists that will hold the contents of the ham and spam files.
ham_list = []
spam_list = []

# Same as before, but this time read each file and append its contents to the ham or spam list.

# You WILL get a Unicode error for the spam part - either Google it, or look at my solution.

# Sanity check: show the first entry of each list.
# NOTE: both lists are still empty here, so these prints raise IndexError until
# the loop described above has been written.
print(ham_list[0])
print(spam_list[0])









    



Subject: christmas tree farm pictures

Subject: dobmeos with hgh my energy level has gone up ! stukm
introducing
doctor - formulated
hgh
human growth hormone - also called hgh
is referred to in medical science as the master hormone . it is very plentiful
when we are young , but near the age of twenty - one our bodies begin to produce
less of it . by the time we are forty nearly everyone is deficient in hgh ,
and at eighty our production has normally diminished at least 90 - 95 % .
advantages of hgh :
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swings
- new hair growth and color restore
read
more at this website
unsubscribe



In [11]:

    
# Write a function that, when passed a list of words, returns a dictionary of the form

# {Word1: True, Word2: True, Word3: True}

# Removing stop words is optional

def create_word_features(words):
    """Build a feature dictionary that maps every distinct word to ``True``.

    Repeated words collapse into a single key. Stop words are kept, since
    removing them is optional for this exercise.

    Args:
        words: Iterable of word tokens (strings).

    Returns:
        dict: ``{word: True}`` for each distinct word in ``words``; an empty
        dict when ``words`` is empty.
    """
    return {word: True for word in words}

create_word_features(["the", "quick", "brown", "quick", "a", "fox"])









    Out[11]:





{'a': True, 'brown': True, 'fox': True, 'quick': True, 'the': True}



In [ ]:

    
# Start again from empty lists; this pass is presumably meant to store word
# features instead of the raw file text.
ham_list = []
spam_list = []

# Same as before, but this time:

# 1. Break the sentences into words using word_tokenize
# 2. Use the create_word_features() function you just wrote

# NOTE(review): NaiveBayesClassifier.train expects (features, label) pairs, so each
# entry presumably needs its "ham"/"spam" label next to the feature dict - confirm.

# Sanity check: show the first entry of each list.
# NOTE: both lists are still empty here, so these prints raise IndexError until
# the loop described above has been written.
print(ham_list[0])
print(spam_list[0])



In [ ]:

    
# Gather every ham entry followed by every spam entry into one new list,
# then randomise the order in place so the two kinds are interleaved.
combined_list = []
combined_list.extend(ham_list)
combined_list.extend(spam_list)
random.shuffle(combined_list)



In [ ]:

    
# Create a test and train section.

# 70% of the data is training. 30% is test



In [ ]:

    
# Create the Naive Bayes filter

# Find the accuracy, using the test data

# NOTE: `accuracy` is not assigned in any cell shown above; the code written for
# the two steps in this cell must define it before this line runs. It is scaled
# by 100 here, so it is presumably a fraction between 0 and 1 - confirm.
print("Accuracy is: ", accuracy * 100)



In [ ]:

    
# Ask the classifier for its 20 most informative features.
# NOTE: `classifier` is presumably the Naive Bayes filter created in the previous cell - confirm.
classifier.show_most_informative_features(20)



In [ ]:

    
# Classify the messages below as spam or ham

# Hint: 1. Break into words using word_tokenize
# 2. create_word_features
# 3. Use the classify function

msg1 = '''Hello th̓ere seُx master :-)
i need c0ck ri͏ght noِw ..͏. don't tell my hǔbbٚy.ٚ. ))
My sc͕rٞeٚe̻nname is Dorry.
My accֺo֔unt is h֯ere: http:nxusxbnd.GirlsBadoo.ru
C u late٘r!'''


msg2 = '''As one of our top customers we are providing 10% OFF the total of your next used book purchase from www.letthestoriesliveon.com. Please use the promotional code, TOPTENOFF at checkout. Limited to 1 use per customer. All books have free shipping within the contiguous 48 United States and there is no minimum purchase.

We have millions of used books in stock that are up to 90% off MRSP and add tens of thousands of new items every day. Don’t forget to check back frequently for new arrivals.'''



msg3 = '''To start off, I have a 6 new videos + transcripts in the members section. In it, we analyse the Enron email dataset, half a million files, spread over 2.5GB. It's about 1.5 hours of  video.

I have also created a Conda environment for running the code (both free and member lessons). This is to ensure everyone is running the same version of libraries, preventing the Works on my machine problems. If you get a second, do you mind trying it here?'''