In [2]:
import pandas as pd
# Load the SMS corpus: a tab-separated file without a header row, so the two
# column names are supplied explicitly.
sms_path = "smsspamcollection/SMSSpamCollection"
column_names = ["label", "sms_message"]
dataframe = pd.read_table(sms_path, sep="\t", header=None, names=column_names)
dataframe.head()
Out[2]:
Out[2]:
In [3]:
# Encode the target as integers with spam as the positive class (1).
# precision_score / recall_score / f1_score treat label 1 as the positive
# class by default, so this encoding makes those metrics describe how well
# spam is detected.
label_codes = {"ham": 0, "spam": 1}
# Only map while the column still holds the raw strings; re-running the cell
# on already-encoded integers would otherwise replace every label with NaN.
if dataframe["label"].isin(list(label_codes)).all():
    dataframe["label"] = dataframe["label"].map(label_codes)
dataframe.head()
Out[3]:
Out[3]:
In [4]:
import string

# Toy corpus used to walk through the bag-of-words steps by hand.
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

# Lower-case each sentence and drop punctuation characters. Filtering
# character by character works on both Python 2 and Python 3, unlike the
# version-specific str.translate / maketrans signatures.
punctuation_chars = set(string.punctuation)
doc = ["".join(ch for ch in sentence.lower() if ch not in punctuation_chars)
       for sentence in documents]
doc
In [ ]:
# Normalise the toy corpus: lower-case every sentence and remove punctuation.
# string.maketrans and two-argument str.translate exist only on Python 2, so
# the characters are filtered explicitly to keep the cell runnable on both
# Python 2 and Python 3. Lower-casing keeps "Hello" and "hello" as one token.
punctuation_chars = set(string.punctuation)
doc = ["".join(ch for ch in sentence.lower() if ch not in punctuation_chars)
       for sentence in documents]
doc
In [ ]:
# Tokenise the cleaned sentences in `doc` (not the raw `documents`), so the
# tokens carry no punctuation. split() without an argument also avoids empty
# tokens when words are separated by more than one space.
tokenz = [sentence.split() for sentence in doc]
tokenz
In [ ]:
from collections import Counter
# One Counter per tokenised sentence: token -> number of occurrences.
freq = list(map(Counter, tokenz))
freq
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary of the toy corpus. fit() returns the vectorizer
# itself, so construction and fitting are chained into one statement.
count_vector = CountVectorizer().fit(documents)
count_vector.get_feature_names()
In [ ]:
# Encode the toy corpus with the fitted vectorizer, then convert the result
# to a dense array so it can be displayed.
doc_term_matrix = count_vector.transform(documents)
doc_array = doc_term_matrix.toarray()
doc_array
In [ ]:
# Label the count matrix columns with the vectorizer's feature names so the
# table is readable.
vocabulary = count_vector.get_feature_names()
freq_mat = pd.DataFrame(data=doc_array, columns=vocabulary)
freq_mat
In [ ]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split messages and labels into train and test sets; random_state is fixed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataframe["sms_message"],
    dataframe["label"],
    random_state=1)

# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3, unlike the print statement.
print("number of total set: {}".format(dataframe.shape[0]))
print("number of traing set: {}".format(X_train.shape[0]))
print("number of rows in test set: {}".format(X_test.shape[0]))
In [ ]:
# Learn the vocabulary from the training messages only and encode them.
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
# Encode the test messages with the vocabulary learned above. transform()
# (not fit_transform()) is required here: refitting on the test set would
# build a different vocabulary, so the columns would no longer line up with
# the features the classifier is trained on.
testing_data = count_vector.transform(X_test)
In [ ]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the training count matrix.
# The fit call is the last expression, so the notebook displays the fitted
# estimator.
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
In [ ]:
# Predict labels for the held-out messages. The evaluation cell reads this
# result as `predictions`; `prediction` is kept as an alias for the original
# name so that either spelling resolves.
predictions = naive_bayes.predict(testing_data)
prediction = predictions
In [ ]:
# Solution: score the test-set predictions with the standard binary metrics.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# `prediction` is the array returned by naive_bayes.predict(testing_data).
# Each line is built with str.format before printing so the output is a plain
# line on both Python 2 and Python 3 (a multi-argument print(...) displays a
# tuple on Python 2).
metric_functions = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)
for metric_name, metric_fn in metric_functions:
    print('{} score: {}'.format(metric_name, metric_fn(y_test, prediction)))
In [ ]:
In [ ]: