In [28]:
import pandas as pd
import numpy as np

# Tab-separated file: column 0 is the label (ham/spam), column 1 is the message text
corpus = pd.read_csv('smsspamcollection/SMSSpamCollection.txt', sep='\t', header=None)

In [7]:
corpus.head()


Out[7]:
0 1
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
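
The two classes are far from balanced, which is why F1 rather than accuracy is used as the cross-validation metric below. A quick sketch for checking the label distribution on the same corpus DataFrame (the collection is known to be heavily skewed toward ham, roughly 4825 ham vs 747 spam):

In [ ]:
corpus[0].value_counts()  # ham should dominate spam by a wide margin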

In [10]:
# Encode the labels: spam -> 1, ham -> 0
corpus[2] = corpus[0].map(lambda x: 1 if x == 'spam' else 0)
X = corpus[1].values  # raw message texts
y = corpus[2].values  # binary labels

In [12]:
X.shape, y.shape


Out[12]:
((5572,), (5572,))

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Bag-of-words: one column per unigram, values are raw token counts
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [20]:
X_vectorized


Out[20]:
<5572x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>
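
The 8713 columns correspond to the unigram vocabulary learned by fit_transform. A minimal sketch for inspecting it, using the standard vocabulary_ attribute (works across sklearn versions):

In [ ]:
len(vectorizer.vocabulary_)          # 8713 unique tokens, one column each
sorted(vectorizer.vocabulary_)[:10]  # a peek at the learned vocabulary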

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [27]:
# Baseline: unigram counts + logistic regression, 10-fold CV, F1 as the metric
estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)

In [30]:
np.mean(cv)


Out[30]:
0.9326402983610631
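
Note that fitting the vectorizer on the full corpus before cross-validation lets every fold see the vocabulary of its held-out messages. A stricter variant wraps both steps in a Pipeline so the vocabulary is refit inside each fold; a sketch (the score may differ slightly from the one above):

In [ ]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(CountVectorizer(), LogisticRegression(random_state=2))
np.mean(cross_val_score(pipe, X, y, scoring='f1', cv=10))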

In [31]:
# Refit on the full corpus to score a few hand-written messages
estimator = LogisticRegression(random_state=2)
estimator.fit(X_vectorized, y)


Out[31]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
msg = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [33]:
msg_vectorized = vectorizer.transform(msg)  # reuse the fitted unigram vocabulary
estimator.predict(msg_vectorized)


Out[33]:
array([1, 1, 0, 0, 0])
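
The hard 0/1 predictions hide how confident the model is; predict_proba exposes the underlying probabilities (standard LogisticRegression API, column 1 being the spam class):

In [ ]:
estimator.predict_proba(msg_vectorized)[:, 1]  # P(spam) for each message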

In [36]:
vectorizer = CountVectorizer(ngram_range=(2, 2))  # bigrams only
X_vectorized = vectorizer.fit_transform(X)

estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[36]:
0.82242206641871329

In [37]:
vectorizer = CountVectorizer(ngram_range=(3, 3))  # trigrams only
X_vectorized = vectorizer.fit_transform(X)

estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[37]:
0.72501615554673771

In [38]:
vectorizer = CountVectorizer(ngram_range=(1, 3))  # unigrams, bigrams and trigrams together
X_vectorized = vectorizer.fit_transform(X)

estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[38]:
0.92513825586488374
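
The three runs above differ only in ngram_range, so they collapse naturally into one loop; a sketch that reproduces In [36]-[38]:

In [ ]:
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    X_ng = CountVectorizer(ngram_range=ngram_range).fit_transform(X)
    scores = cross_val_score(LogisticRegression(random_state=2), X_ng, y,
                             scoring='f1', cv=10)
    print(ngram_range, np.mean(scores))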

In [39]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
vectorizer = CountVectorizer(ngram_range=(2, 2))  # repeat the n-gram sweep with naive Bayes
X_vectorized = vectorizer.fit_transform(X)

estimator = MultinomialNB()
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[40]:
0.64550151779854426

In [42]:
vectorizer = CountVectorizer(ngram_range=(3, 3))
X_vectorized = vectorizer.fit_transform(X)

estimator = MultinomialNB()
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[42]:
0.37871948524573595

In [43]:
vectorizer = CountVectorizer(ngram_range=(1, 3))
X_vectorized = vectorizer.fit_transform(X)

estimator = MultinomialNB()
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[43]:
0.88848596560610016
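
Naive Bayes degrades much faster than logistic regression on pure bigram/trigram features: higher-order n-grams are extremely sparse, so most documents share almost no features. To search over n-gram ranges and classifiers in one place, a Pipeline plus GridSearchCV sketch (step names 'vec' and 'clf' are arbitrary; this grid was not run in the original notebook):

In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('vec', CountVectorizer()), ('clf', MultinomialNB())])
param_grid = {
    'vec__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 3)],
    'clf': [LogisticRegression(random_state=2), MultinomialNB()],
}
grid = GridSearchCV(pipe, param_grid, scoring='f1', cv=10)
grid.fit(X, y)
grid.best_params_, grid.best_score_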

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
vectorizer = TfidfVectorizer()  # TF-IDF weights instead of raw counts
X_vectorized = vectorizer.fit_transform(X)

estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)


Out[46]:
0.85285995541724557
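
TF-IDF scores lower than raw counts here, plausibly because the rescaled features interact differently with the default regularization strength (C=1.0). One hypothetical follow-up is to loosen the regularization; the value C=10 below is illustrative, not a result from the original notebook:

In [ ]:
X_tfidf = TfidfVectorizer().fit_transform(X)
scores = cross_val_score(LogisticRegression(C=10, random_state=2), X_tfidf, y,
                         scoring='f1', cv=10)
np.mean(scores)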

In [ ]: