In [89]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [90]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw.head()


Out[90]:
   spam                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

In [91]:
sms_raw.dropna(subset=['spam'], inplace=True)

In [92]:
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    # Add spaces around the key so we match the whole word rather than
    # a substring (e.g. 'free' but not 'freedom'). Note this misses
    # keywords at the start or end of a message, or next to punctuation.
    sms_raw[key] = sms_raw.message.str.contains(
        ' ' + key + ' ',
        case=False
    )
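
A more robust variant, as a sketch (not part of the original notebook): regex word boundaries catch keywords at the start or end of a message and next to punctuation, which the space-padding approach misses.

import re

for key in keywords:
    # \b is a word boundary, so 'free' matches in 'Free entry!' while
    # 'freedom' does not.
    sms_raw[key] = sms_raw.message.str.contains(
        r'\b' + re.escape(key) + r'\b',
        case=False,
        regex=True
    )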

In [93]:
sms_raw['allcaps'] = sms_raw.message.str.isupper()

In [94]:
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# Note: this cell isn't idempotent. Run it a second time and the column
# already holds booleans, so every value becomes False. So... don't.
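
If you'd rather not rely on run-once discipline, here is an idempotent sketch (not in the original): map the labels explicitly, and only while the column still holds strings.

# 'spam' -> True, 'ham' -> False. Guarding on dtype makes rerunning
# the cell a no-op once the column is already boolean.
if sms_raw['spam'].dtype == object:
    sms_raw['spam'] = sms_raw['spam'].map({'spam': True, 'ham': False})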

In [95]:
# Correlate only the boolean feature columns; the raw message text
# can't be passed to .corr().
sns.heatmap(sms_raw.drop(columns='message').corr())


Out[95]:
[Figure: correlation heatmap of the keyword and allcaps features]

In [96]:
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

In [97]:
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
print("Success rate of our model out of {} : {}".format(
    data.shape[0],
    1-((target != y_pred).sum()/((target != y_pred).sum()+(target == y_pred).sum()))
))


Number of mislabeled points out of a total 5572 points : 604
Success rate of our model out of 5572 : 0.8916008614501076
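
The success rate here is just accuracy, which sklearn can report directly; a sketch (note we're scoring on the same data we trained on, so this is an optimistic estimate):

from sklearn.metrics import accuracy_score

# Both reproduce the ~0.8916 success rate printed above.
print(bnb.score(data, target))
print(accuracy_score(target, y_pred))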

In [99]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)


Out[99]:
array([[4770,   55],
       [ 549,  198]], dtype=int64)
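
For boolean labels sklearn sorts the classes as [False, True], so rows are actual values, columns are predictions, and the layout is [[TN, FP], [FN, TP]]. A small sketch to unpack the cells:

# Row = actual class, column = predicted class:
# [[TN, FP],
#  [FN, TP]]
tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
print(tn, fp, fn, tp)  # 4770 55 549 198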

In [114]:
# Build your confusion matrix and calculate sensitivity and specificity here.
y_actu = pd.Series(data=target, name='Actual')
y_pred = pd.Series(data=y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print(df_confusion)

# Sensitivity: the share of actual spam we correctly flag,
# TP / (TP + FN).
a = df_confusion.loc[True, True]
b = df_confusion.loc[True].sum()
print(a/b)

# Specificity: the share of actual ham we correctly pass through,
# TN / (TN + FP).
c = df_confusion.loc[False, False]
d = df_confusion.loc[False].sum()
print(c/d)


Predicted  False  True 
Actual                 
False       4770     55
True         549    198
0.265060240964
0.988601036269
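
As a cross-check, sensitivity and specificity are just per-class recall, so sklearn's recall_score should reproduce the two numbers above; a sketch, assuming a scikit-learn version whose pos_label accepts boolean labels:

from sklearn.metrics import recall_score

# Sensitivity: recall on the positive (spam) class.
print(recall_score(target, y_pred, pos_label=True))   # ~0.2651
# Specificity: recall on the negative (ham) class.
print(recall_score(target, y_pred, pos_label=False))  # ~0.9886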