In [2]:
import pandas as pd

dataframe = pd.read_table("smsspamcollection/SMSSpamCollection",
                         sep="\t",
                         header=None,
                         names=["label","sms_message"])
dataframe.head()


Out[2]:
label sms_message
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
Out[2]:
label sms_message
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...

In [3]:
dataframe["label"] = dataframe.label.map({"ham":1,"spam":0})

dataframe.head()


Out[3]:
label sms_message
0 1 Go until jurong point, crazy.. Available only ...
1 1 Ok lar... Joking wif u oni...
2 0 Free entry in 2 a wkly comp to win FA Cup fina...
3 1 U dun say so early hor... U c already then say...
4 1 Nah I don't think he goes to usf, he lives aro...
Out[3]:
label sms_message
0 1 Go until jurong point, crazy.. Available only ...
1 1 Ok lar... Joking wif u oni...
2 0 Free entry in 2 a wkly comp to win FA Cup fina...
3 1 U dun say so early hor... U c already then say...
4 1 Nah I don't think he goes to usf, he lives aro...

In [4]:
import string
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

#doc = [i.lower().translate(str.maketrans("","",string.punctuations)) 
#       for i in documents]   python 3
doc


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-09f16edf3440> in <module>()
      7 #doc = [i.lower().translate(str.maketrans("","",string.punctuations))
      8 #       for i in documents]   python 3
----> 9 doc

NameError: name 'doc' is not defined
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-09f16edf3440> in <module>()
      7 #doc = [i.lower().translate(str.maketrans("","",string.punctuations))
      8 #       for i in documents]   python 3
----> 9 doc

NameError: name 'doc' is not defined

In [ ]:
doc = [i.translate(string.maketrans(" "," "),string.punctuation)
       for i in documents]
doc

In [ ]:
tokenz = [i.split(" ") for i in documents]
tokenz

In [ ]:
from collections import Counter

freq = [Counter(i) for i in tokenz]
freq

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
count_vector.fit(documents)
count_vector.get_feature_names()

In [ ]:
doc_array = count_vector.transform(documents).toarray()
doc_array

In [ ]:
freq_mat = pd.DataFrame(doc_array,
                       columns = count_vector.get_feature_names())
freq_mat

In [ ]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataframe["sms_message"],
                                                             dataframe["label"],
                                                             random_state=1)

print "number of total set: {}".format(dataframe.shape[0])
print "number of traing set: {}".format(X_train.shape[0])
print "number of rows in test set: {}".format(X_test.shape[0])

In [ ]:
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.fit_transform(X_test)

In [ ]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(traning_data, y_train)

In [ ]:
prediction = naive_bayes.predict(testing_data)

In [ ]:
'''
Solution
'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

In [ ]:


In [ ]: