In [25]:
import pandas as pd
import re
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named here; the 'spam' / 'ham' labels are mapped to True / False
# while parsing.
# NOTE(review): the messages are free text and may contain double quotes; with
# the parser's default quoting a stray quote can merge several lines into one
# row. Consider quoting=csv.QUOTE_NONE and verify the resulting row count.
_read_options = dict(
    sep='\t',
    header=None,
    names=['Status', 'Sentence'],
    true_values=['spam'],
    false_values=['ham'],
)
data = pd.read_csv('data/smsspamcollection/SMSSpamCollection', **_read_options)
data.head()
Out[25]:
In [34]:
# Lower-cased copy of every message, computed with the vectorized string
# accessor instead of a Python-level loop over the Series.
data['lower'] = data['Sentence'].str.lower()
In [27]:
# Preview the first five rows of the frame.
data.head(n=5)
Out[27]:
In [57]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model over the lower-cased messages; the vocabulary is capped
# at 5000 terms by `max_features`.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform returns a sparse matrix; convert it to a dense NumPy array
# (one row per message, one column per vocabulary term) because the later
# cells index it row by row.
train_data_features = vectorizer.fit_transform(data['lower']).toarray()

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when it exists and fall back to
# the old one on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Let's see what we have there: the last five vocabulary terms.
list(feature_names[-5:])
Out[57]:
In [59]:
# Number of messages (rows) in the corpus.
data.shape[0]
Out[59]:
Here I compare the first sentence's vector to all the other vectors.
The first vector's status is not spam (= False). I also count the true positives (vectors with cosine distance < 1 that are also not spam) and the false positives (cosine distance < 1, but marked as spam). False negatives are the non-spam vectors whose cosine distance is not below 1.
I'll evaluate each method with the F1 metric: $$F1=\dfrac{2 \cdot TP}{2 \cdot TP + FP + FN}$$
In [79]:
from scipy.spatial.distance import cosine
# Cosine distance (1 - cosine similarity) between the first message's count
# vector and every other message, keyed by row position.
first_vector = train_data_features[0]
cosines = {}
for i in range(1, len(data)):
    other_vector = train_data_features[i]
    if first_vector.any() and other_vector.any():
        cosines[i] = cosine(first_vector, other_vector)
    else:
        # The cosine distance is undefined when either vector is all zeros
        # (zero norm). NOTE(review): what scipy returns in that case appears
        # to differ between releases (nan vs. a clamped value) - verify.
        # Such a message shares no vocabulary term with the reference, so
        # record the "nothing in common" distance explicitly.
        cosines[i] = 1.0

# Labels by position (True = spam, False = ham, as parsed by read_csv).
# Positional access keeps the labels aligned with the rows of
# `train_data_features` even if `data` does not have the default 0..n-1 index.
is_spam = data['Status'].values

# The reference message is ham, so the "positive" prediction here is
# "distance < 1.0 => ham":
#   TP - predicted ham, actually ham
#   FP - predicted ham, actually spam
#   FN - not predicted ham, actually ham
TP = 0
FP = 0
FN = 0
for i in range(1, len(data)):
    predicted_ham = cosines[i] < 1.0
    if predicted_ham and is_spam[i]:
        FP += 1
    elif predicted_ham:
        TP += 1
    elif not is_spam[i]:
        FN += 1

# Float arithmetic regardless of interpreter version, and no ZeroDivisionError
# when there is nothing to score.
denominator = 2 * TP + FP + FN
F1 = 2.0 * TP / denominator if denominator else 0.0
print("F1 = %0.4f" % F1)
In [ ]: