In [28]:
import pandas as pd
import numpy as np
# The SMS Spam Collection is tab-separated: column 0 holds the ham/spam label, column 1 the message text.
corpus = pd.read_csv('smsspamcollection/SMSSpamCollection.txt', sep='\t', header=None)
In [7]:
corpus.head()
Out[7]:
In [10]:
# Binary target: 1 for spam, 0 for ham. (.values replaces DataFrame.as_matrix(), which was removed from pandas.)
corpus[2] = corpus[0].map(lambda x: 1 if x == 'spam' else 0)
X = corpus[1].values
y = corpus[2].values
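The SMS Spam Collection is heavily skewed toward ham, which is why F1 rather than plain accuracy is used as the scoring metric below. A quick label-distribution check (a minimal sketch added here, not part of the original run):
In [ ]:
# Spam is the minority class, so F1 is a more informative score than accuracy.
corpus[0].value_counts(), y.mean()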
In [12]:
X.shape, y.shape
Out[12]:
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
In [19]:
vectorizer = CountVectorizer()  # unigram bag-of-words baseline
X_vectorized = vectorizer.fit_transform(X)
In [20]:
X_vectorized
Out[20]:
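CountVectorizer builds a vocabulary of unigrams from the messages and returns a sparse document-term matrix. One way to inspect what it produced (a sketch; get_feature_names_out requires scikit-learn >= 1.0, older versions expose get_feature_names instead):
In [ ]:
# Size of the fitted vocabulary and a few example feature names.
len(vectorizer.vocabulary_), vectorizer.get_feature_names_out()[:10]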
In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
In [27]:
estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
In [30]:
np.mean(cv)
Out[30]:
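Note that fitting the vectorizer on the full corpus before cross-validation lets vocabulary statistics leak between folds. A stricter setup refits the vectorizer inside each fold via a Pipeline (a sketch of that alternative, not what was run above):
In [ ]:
from sklearn.pipeline import make_pipeline

# The vectorizer is fit only on each fold's training split, so no vocabulary leaks into validation.
pipeline = make_pipeline(CountVectorizer(), LogisticRegression(random_state=2))
np.mean(cross_val_score(pipeline, X, y, scoring='f1', cv=10))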
In [31]:
estimator = LogisticRegression(random_state=2)
estimator.fit(X_vectorized, y)
Out[31]:
In [32]:
msg = [  # a few hand-written messages to sanity-check the fitted classifier
"FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
"Only 99$",
]
In [33]:
msg_vectorized = vectorizer.transform(msg)
estimator.predict(msg_vectorized)
Out[33]:
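Beyond the hard 0/1 labels, the fitted model can also report how confident it is about each message (a minimal sketch using the same vectorized inputs):
In [ ]:
# Column 1 of predict_proba is the estimated probability of the spam class.
estimator.predict_proba(msg_vectorized)[:, 1]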
In [36]:
vectorizer = CountVectorizer(ngram_range=(2, 2))  # bigram features only
X_vectorized = vectorizer.fit_transform(X)
estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[36]:
In [37]:
vectorizer = CountVectorizer(ngram_range=(3, 3))  # trigram features only
X_vectorized = vectorizer.fit_transform(X)
estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[37]:
In [38]:
vectorizer = CountVectorizer(ngram_range=(1, 3))  # unigrams, bigrams and trigrams together
X_vectorized = vectorizer.fit_transform(X)
estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[38]:
In [39]:
from sklearn.naive_bayes import MultinomialNB
In [40]:
# Repeat the n-gram comparison with multinomial naive Bayes instead of logistic regression.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X_vectorized = vectorizer.fit_transform(X)
estimator = MultinomialNB()
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[40]:
In [42]:
vectorizer = CountVectorizer(ngram_range=(3, 3))
X_vectorized = vectorizer.fit_transform(X)
estimator = MultinomialNB()
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[42]:
In [43]:
vectorizer = CountVectorizer(ngram_range=(1, 3))
X_vectorized = vectorizer.fit_transform(X)
estimator = MultinomialNB()
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[43]:
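The cells above repeat the same fit-and-score pattern for every n-gram range and estimator. A small helper makes the comparison less repetitive (a sketch; the evaluate function is introduced here purely for illustration):
In [ ]:
def evaluate(vectorizer, estimator):
    """Vectorize the raw texts and return the mean 10-fold F1 score."""
    features = vectorizer.fit_transform(X)
    return np.mean(cross_val_score(estimator, features, y, scoring='f1', cv=10))

for n in [(1, 1), (2, 2), (3, 3), (1, 3)]:
    print(n, evaluate(CountVectorizer(ngram_range=n), MultinomialNB()))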
In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [46]:
vectorizer = TfidfVectorizer()  # TF-IDF weighting instead of raw term counts
X_vectorized = vectorizer.fit_transform(X)
estimator = LogisticRegression(random_state=2)
cv = cross_val_score(estimator, X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)
Out[46]:
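TF-IDF weighting can also be combined with the n-gram ranges tried earlier; one such configuration as a sketch (the parameters are illustrative, not tuned):
In [ ]:
# TF-IDF over unigrams through trigrams with the same logistic regression baseline.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_vectorized = vectorizer.fit_transform(X)
cv = cross_val_score(LogisticRegression(random_state=2), X_vectorized, y, scoring='f1', cv=10)
np.mean(cv)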
In [ ]: