In [25]:
import pandas as pd
import re

data = pd.read_csv('data/smsspamcollection/SMSSpamCollection', 
            sep='\t', 
            header=None, 
            names=['Status', 'Sentence'], 
            true_values=['spam'], 
            false_values=['ham'])
data.head()


Out[25]:
Status Sentence
0 False Go until jurong point, crazy.. Available only ...
1 False Ok lar... Joking wif u oni...
2 True Free entry in 2 a wkly comp to win FA Cup fina...
3 False U dun say so early hor... U c already then say...
4 False Nah I don't think he goes to usf, he lives aro...
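Before building features it's worth a quick look at the class balance. A small sketch (using the Status column defined above, where True means spam):

# rough class balance: True = spam, False = ham
print(data['Status'].value_counts())
print("spam fraction: %0.3f" % data['Status'].mean())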

In [34]:
data['lower'] = [sentence.lower() for sentence in data['Sentence']]

In [27]:
data.head()


Out[27]:
Status Sentence lower
0 False Go until jurong point, crazy.. Available only ... go until jurong point, crazy.. available only ...
1 False Ok lar... Joking wif u oni... ok lar... joking wif u oni...
2 True Free entry in 2 a wkly comp to win FA Cup fina... free entry in 2 a wkly comp to win fa cup fina...
3 False U dun say so early hor... U c already then say... u dun say so early hor... u c already then say...
4 False Nah I don't think he goes to usf, he lives aro... nah i don't think he goes to usf, he lives aro...
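As a side note, the same lowercasing can be done with pandas' vectorized string methods; this one-liner should produce the same column as the list comprehension above:

# equivalent to the list comprehension above
data['lower'] = data['Sentence'].str.lower()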

Create the bag of words


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

train_data_features = vectorizer.fit_transform(data['lower'])

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

# let's look at the last few vocabulary entries
# (newer scikit-learn versions use get_feature_names_out() instead)
vectorizer.get_feature_names()[-5:]


Out[57]:
['zogtorius', 'zoom', 'zouk', 'zyada', 'èn']

In [59]:
len(data)


Out[59]:
5572
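As a sanity check, the bag-of-words matrix should have one row per SMS and one column per vocabulary term (at most 5000, because of max_features); a quick sketch:

# one row per SMS, one column per vocabulary term (capped at 5000)
print(train_data_features.shape)

For a larger corpus it would usually be better to skip the .toarray() call and keep the scipy sparse matrix returned by fit_transform, since most of its entries are zero.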

Calculate cosine distance

Here I compare the first sentence's vector to the vectors of all the other sentences.

The first sentence is not spam (Status = False). A message whose cosine distance to the first vector is less than 1 (i.e. it shares at least one word with it) is predicted as not spam. I count the true positives (distance < 1 and really not spam), the false positives (distance < 1 but actually spam), and the false negatives (distance of 1 but not spam).

I'll evaluate each method with the F1 metric: $$F1 = \dfrac{2 \cdot TP}{2 \cdot TP + FP + FN}$$
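For example, with hypothetical counts TP = 100, FP = 40 and FN = 60, the formula gives $$F1 = \dfrac{2 \cdot 100}{2 \cdot 100 + 40 + 60} = \dfrac{200}{300} \approx 0.667$$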


In [79]:
from scipy.spatial.distance import cosine

# cosine distance between the first message and every other message
cosines = {}
first_vector = train_data_features[0]
for i in range(1, len(data)):
    cosines[i] = cosine(first_vector, train_data_features[i])

# Treat "distance < 1" (at least one shared word with the first message,
# which is not spam) as a prediction of "not spam".
TP = 0  # predicted not spam, really not spam
FP = 0  # predicted not spam, actually spam
FN = 0  # predicted spam (distance == 1), really not spam
for i in range(1, len(data)):
    if cosines[i] < 1.0:
        if data['Status'][i]:   # spam
            FP += 1
        else:                   # not spam
            TP += 1
    elif not data['Status'][i]:
        FN += 1

F1 = 2 * TP / (2 * TP + FP + FN)
print("F1 = %0.4f" % F1)


F1 = 0.4611
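As an aside, the Python loop over scipy's cosine() can be replaced by one vectorized call. A sketch using scikit-learn's cosine_similarity (distance = 1 - similarity), which should reproduce the same counts as long as no message vector is all zeros:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity between message 0 and every other message, in one call
similarities = cosine_similarity(train_data_features[0:1], train_data_features[1:])[0]
distances = 1.0 - similarities

predicted_not_spam = distances < 1.0              # shares at least one word with message 0
actual_spam = data['Status'].values[1:].astype(bool)

TP = np.sum(predicted_not_spam & ~actual_spam)    # predicted not spam, really not spam
FP = np.sum(predicted_not_spam & actual_spam)     # predicted not spam, actually spam
FN = np.sum(~predicted_not_spam & ~actual_spam)   # predicted spam, really not spam
print("F1 = %0.4f" % (2 * TP / (2 * TP + FP + FN)))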

In [ ]: