In [25]:
import pandas as pd
import re

data = pd.read_csv('data/smsspamcollection/SMSSpamCollection', 
            sep='\t', 
            header=None, 
            names=['Status', 'Sentence'], 
            true_values=['spam'], 
            false_values=['ham'])
data.head()


Out[25]:
Status Sentence
0 False Go until jurong point, crazy.. Available only ...
1 False Ok lar... Joking wif u oni...
2 True Free entry in 2 a wkly comp to win FA Cup fina...
3 False U dun say so early hor... U c already then say...
4 False Nah I don't think he goes to usf, he lives aro...
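Before building features it's worth a quick look at the class balance. A small sketch (using the Status column defined above, where True means spam):

# rough class balance: True = spam, False = ham
print(data['Status'].value_counts())
print("spam fraction: %0.3f" % data['Status'].mean())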

In [34]:
data['lower'] = [sentence.lower() for sentence in data['Sentence']]

In [27]:
data.head()


Out[27]:
Status Sentence lower
0 False Go until jurong point, crazy.. Available only ... go until jurong point, crazy.. available only ...
1 False Ok lar... Joking wif u oni... ok lar... joking wif u oni...
2 True Free entry in 2 a wkly comp to win FA Cup fina... free entry in 2 a wkly comp to win fa cup fina...
3 False U dun say so early hor... U c already then say... u dun say so early hor... u c already then say...
4 False Nah I don't think he goes to usf, he lives aro... nah i don't think he goes to usf, he lives aro...
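As a side note, the same lowercasing can be done with pandas' vectorized string methods; this one-liner should produce the same column as the list comprehension above:

# equivalent to the list comprehension above
data['lower'] = data['Sentence'].str.lower()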

Create the bag of words


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

train_data_features = vectorizer.fit_transform(data['lower'])

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

# let's look at the last few vocabulary entries
# (newer scikit-learn versions use get_feature_names_out() instead)
vectorizer.get_feature_names()[-5:]


Out[57]:
['zogtorius', 'zoom', 'zouk', 'zyada', 'èn']

In [59]:
len(data)


Out[59]:
5572
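As a sanity check, the bag-of-words matrix should have one row per SMS and one column per vocabulary term (at most 5000, because of max_features); a quick sketch:

# one row per SMS, one column per vocabulary term (capped at 5000)
print(train_data_features.shape)

For a larger corpus it would usually be better to skip the .toarray() call and keep the scipy sparse matrix returned by fit_transform, since most of its entries are zero.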

Calculate cosine distance

Here I compare the first sentence's vector to the vectors of all the other sentences.

The first sentence is not spam (Status = False). A message whose cosine distance to the first vector is less than 1 (i.e. it shares at least one word with it) is predicted as not spam. I count the true positives (distance < 1 and really not spam), the false positives (distance < 1 but actually spam), and the false negatives (distance of 1 but not spam).

I'll evaluate each method with the F1 metric: $$F1 = \dfrac{2 \cdot TP}{2 \cdot TP + FP + FN}$$
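For example, with hypothetical counts TP = 100, FP = 40 and FN = 60, the formula gives $$F1 = \dfrac{2 \cdot 100}{2 \cdot 100 + 40 + 60} = \dfrac{200}{300} \approx 0.667$$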


In [79]:
from scipy.spatial.distance import cosine

# cosine distance between the first message and every other message
cosines = {}
first_vector = train_data_features[0]
for i in range(1, len(data)):
    cosines[i] = cosine(first_vector, train_data_features[i])

# Treat "distance < 1" (at least one shared word with the first message,
# which is not spam) as a prediction of "not spam".
TP = 0  # predicted not spam, really not spam
FP = 0  # predicted not spam, actually spam
FN = 0  # predicted spam (distance == 1), really not spam
for i in range(1, len(data)):
    if cosines[i] < 1.0:
        if data['Status'][i]:   # spam
            FP += 1
        else:                   # not spam
            TP += 1
    elif not data['Status'][i]:
        FN += 1

F1 = 2 * TP / (2 * TP + FP + FN)
print("F1 = %0.4f" % F1)


F1 = 0.4611
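As an aside, the Python loop over scipy's cosine() can be replaced by one vectorized call. A sketch using scikit-learn's cosine_similarity (distance = 1 - similarity), which should reproduce the same counts as long as no message vector is all zeros:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity between message 0 and every other message, in one call
similarities = cosine_similarity(train_data_features[0:1], train_data_features[1:])[0]
distances = 1.0 - similarities

predicted_not_spam = distances < 1.0              # shares at least one word with message 0
actual_spam = data['Status'].values[1:].astype(bool)

TP = np.sum(predicted_not_spam & ~actual_spam)    # predicted not spam, really not spam
FP = np.sum(predicted_not_spam & actual_spam)     # predicted not spam, actually spam
FN = np.sum(~predicted_not_spam & ~actual_spam)   # predicted spam, really not spam
print("F1 = %0.4f" % (2 * TP / (2 * TP + FP + FN)))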

In [ ]: