In [1]:
import pandas as pd
import sklearn
In [3]:
# Load the SMS corpus: a header-less, tab-separated file with one message per
# row, where the first column is the label and the second the message text.
df = pd.read_csv(
    'https://raw.githubusercontent.com/sinanuozdemir/sfdat22/master/data/sms.tsv',
    sep='\t',
    header=None,
    names=['label', 'msg'],
)
df
Out[3]:
In [4]:
# How many messages carry each label? (shows the class balance)
df['label'].value_counts()
Out[4]:
In [13]:
# Class priors: the share of all messages that carry each label.
value_probablity = df['label'].value_counts() / df.shape[0]
spam_probability = value_probablity['spam']
ham_probability = value_probablity['ham']
print('spam probability: {}, ham probability: {}'.format(spam_probability, ham_probability))
In [17]:
# Hand-rolled Naive Bayes score for the "spam" class:
#   P(spam) * product over words of P(word appears in a message | spam)
# Each per-word factor is the fraction of spam messages whose text contains
# the word as a substring.
spams = df[df.label == 'spam']
sentence = 'send cash now'
spam_words_probability = 1
for word in sentence.split():
    # regex=False: match the token literally. By default str.contains treats
    # its argument as a regular expression, so a token such as "free?" or
    # "(win" would be misinterpreted or raise an error.
    word_probability = spams[spams.msg.str.contains(word, regex=False)].shape[0]/float(spams.shape[0])
    print("word {} probability: {}".format(word, word_probability))
    spam_words_probability *= word_probability
# Multiply in the class prior exactly once, after all word factors.
spam_words_probability *= spam_probability
print('spam words probability: {}'.format(spam_words_probability))
In [19]:
# Hand-rolled Naive Bayes score for the "ham" class:
#   P(ham) * product over words of P(word appears in a message | ham)
# Each per-word factor is the fraction of ham messages whose text contains
# the word as a substring.
hams = df[df.label == 'ham']
sentence = 'send cash now'
ham_words_probability = 1
for word in sentence.split():
    # regex=False: match the token literally. By default str.contains treats
    # its argument as a regular expression, so a token such as "free?" or
    # "(win" would be misinterpreted or raise an error.
    word_probability = hams[hams.msg.str.contains(word, regex=False)].shape[0]/float(hams.shape[0])
    print("word {} probability: {}".format(word, word_probability))
    ham_words_probability *= word_probability
# Multiply in the class prior exactly once, after all word factors.
ham_words_probability *= ham_probability
print('ham words probability: {}'.format(ham_words_probability))
In [20]:
# Pick the class with the larger score; ties fall to "NOT a spam".
verdict_template = (
    '{} is more likely a spam'
    if spam_words_probability > ham_words_probability
    else '{} is more likely NOT a spam'
)
print(verdict_template.format(sentence))
In [31]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Split messages/labels into train and test parts (seeded for repeatability).
x_train, x_test, y_train, y_test = train_test_split(df['msg'], df['label'], random_state=1)

# Learn the vocabulary from the training messages only, then encode both
# splits as document-term count matrices over that vocabulary.
vect = CountVectorizer()
train_dtm = vect.fit_transform(x_train)
test_dtm = vect.transform(x_test)

# Fit multinomial Naive Bayes on the training counts and label the test set.
nb = MultinomialNB().fit(train_dtm, y_train)
predicts = nb.predict(test_dtm)
predicts
Out[31]:
In [36]:
from sklearn import metrics
# Score the test-set predictions against the true labels.
accuracy = metrics.accuracy_score(y_test, predicts)
confusion = metrics.confusion_matrix(y_test, predicts)
print('accuracy: {}, confusion matrix: {}'.format(accuracy, confusion))
For details on how to read the confusion matrix (rows are the true labels, columns are the predicted labels), see: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
In [ ]: