notebook.community

Edit and run



In [2]:

    
import pandas as pd

dataframe = pd.read_table("smsspamcollection/SMSSpamCollection",
                         sep="\t",
                         header=None,
                         names=["label","sms_message"])
dataframe.head()









    Out[2]:







  
    
      
      label
      sms_message
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
    
    
      1
      ham
      Ok lar... Joking wif u oni...
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      ham
      U dun say so early hor... U c already then say...
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...
    
  








    Out[2]:







  
    
      
      label
      sms_message
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
    
    
      1
      ham
      Ok lar... Joking wif u oni...
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      ham
      U dun say so early hor... U c already then say...
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...



In [3]:

    
dataframe["label"] = dataframe.label.map({"ham":1,"spam":0})

dataframe.head()









    Out[3]:







  
    
      
      label
      sms_message
    
  
  
    
      0
      1
      Go until jurong point, crazy.. Available only ...
    
    
      1
      1
      Ok lar... Joking wif u oni...
    
    
      2
      0
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      1
      U dun say so early hor... U c already then say...
    
    
      4
      1
      Nah I don't think he goes to usf, he lives aro...
    
  








    Out[3]:







  
    
      
      label
      sms_message
    
  
  
    
      0
      1
      Go until jurong point, crazy.. Available only ...
    
    
      1
      1
      Ok lar... Joking wif u oni...
    
    
      2
      0
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      1
      U dun say so early hor... U c already then say...
    
    
      4
      1
      Nah I don't think he goes to usf, he lives aro...



In [4]:

    
import string
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

#doc = [i.lower().translate(str.maketrans("","",string.punctuations)) 
#       for i in documents]   python 3
doc









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-09f16edf3440> in <module>()
      7 #doc = [i.lower().translate(str.maketrans("","",string.punctuations))
      8 #       for i in documents]   python 3
----> 9 doc

NameError: name 'doc' is not defined





    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-09f16edf3440> in <module>()
      7 #doc = [i.lower().translate(str.maketrans("","",string.punctuations))
      8 #       for i in documents]   python 3
----> 9 doc

NameError: name 'doc' is not defined



In [ ]:

    
doc = [i.translate(string.maketrans(" "," "),string.punctuation)
       for i in documents]
doc



In [ ]:

    
tokenz = [i.split(" ") for i in documents]
tokenz



In [ ]:

    
from collections import Counter

freq = [Counter(i) for i in tokenz]
freq



In [ ]:

    
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
count_vector.fit(documents)
count_vector.get_feature_names()



In [ ]:

    
doc_array = count_vector.transform(documents).toarray()
doc_array



In [ ]:

    
freq_mat = pd.DataFrame(doc_array,
                       columns = count_vector.get_feature_names())
freq_mat



In [ ]:

    
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataframe["sms_message"],
                                                             dataframe["label"],
                                                             random_state=1)

print "number of total set: {}".format(dataframe.shape[0])
print "number of traing set: {}".format(X_train.shape[0])
print "number of rows in test set: {}".format(X_test.shape[0])



In [ ]:

    
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.fit_transform(X_test)



In [ ]:

    
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(traning_data, y_train)



In [ ]:

    
prediction = naive_bayes.predict(testing_data)



In [ ]:

    
'''
Solution
'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))



In [ ]:



In [ ]:

	label	sms_message
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...