In [89]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
In [90]:
# Grab and process the raw data.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
# Tab-separated file with no header row: column 0 is the label
# ('spam'/'ham'), column 1 is the message text. Passing names= to
# read_csv sets the column names in a single step.
sms_raw = pd.read_csv(data_path, sep="\t", header=None, names=["spam", "message"])
sms_raw.head()
Out[90]:
In [91]:
# Drop rows with a missing label. Reassignment instead of inplace=True:
# inplace has no performance benefit and makes hidden-state bugs more
# likely on notebook re-runs; the result is identical.
sms_raw = sms_raw.dropna(subset=['spam'])
In [92]:
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']
for key in keywords:
    # Use regex word boundaries (\b) so we match the whole word, not just a
    # substring. Padding with literal spaces (' key ') would miss a keyword
    # at the start or end of the message, or one followed by punctuation
    # (e.g. "free!"), which contradicts the intent of word-level matching.
    sms_raw[key] = sms_raw.message.str.contains(
        r'\b' + key + r'\b',
        case=False,
        regex=True,
    )
In [93]:
# Flag messages written entirely in upper case (a common spam trait).
sms_raw['allcaps'] = sms_raw.message.str.isupper()
In [94]:
# Convert the label column to boolean: True for spam, False for ham.
# isin() makes the cell idempotent: on a second run the values are already
# booleans, and True stays True instead of everything collapsing to False
# the way the plain string comparison (== 'spam') would.
sms_raw['spam'] = sms_raw['spam'].isin(['spam', True])
In [95]:
# Correlation heatmap of the boolean feature/label columns. Drop the
# free-text 'message' column explicitly: older pandas silently excluded
# non-numeric columns from .corr(), but pandas >= 2.0 raises on them,
# so relying on the implicit drop breaks on modern versions.
sns.heatmap(sms_raw.drop(columns=['message']).corr())
Out[95]:
In [96]:
# Assemble the model inputs: the keyword indicator columns plus the
# all-caps flag as features, and the boolean spam label as the target.
feature_columns = keywords + ['allcaps']
data = sms_raw[feature_columns]
target = sms_raw['spam']
In [97]:
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
# NOTE(review): we predict on the training data, so the numbers below are
# an optimistic in-sample estimate, not a generalization estimate.
y_pred = bnb.predict(data)

# Display our results. Compute the counts once instead of re-evaluating
# (target != y_pred).sum() three times as the original did.
n_points = data.shape[0]
n_errors = (target != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    n_points,
    n_errors
))
# Accuracy = 1 - errors/total; algebraically identical to the original
# 1 - errors / (errors + correct), since errors + correct == n_points.
print("Success rate of our model out of {} : {}".format(
    n_points,
    1 - n_errors / n_points
))
In [99]:
from sklearn.metrics import confusion_matrix

# Rows are actual classes, columns are predicted classes, with labels in
# sorted order (False/ham first): [[TN, FP], [FN, TP]].
conf_mat = confusion_matrix(target, y_pred)
conf_mat
Out[99]:
In [114]:
# Build your confusion matrix and calculate sensitivity and specificity here.
# (pandas is already imported at the top of the notebook.)
y_actu = pd.Series(data=target, name='Actual')
y_pred = pd.Series(data=y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print(df_confusion)

# Sensitivity (true positive rate): of the messages that really are spam,
# what fraction did we correctly flag?  TP / (TP + FN).
# .loc[True].sum() sums the whole actual-True row; the original
# iloc[1, 0:3] slice only worked by accident (there are just 2 columns).
true_positives = df_confusion.loc[True, True]
actual_positives = df_confusion.loc[True].sum()
print(true_positives / actual_positives)

# Specificity (true negative rate): of the messages that really are ham,
# what fraction did we correctly leave alone?  TN / (TN + FP).
true_negatives = df_confusion.loc[False, False]
actual_negatives = df_confusion.loc[False].sum()
print(true_negatives / actual_negatives)