In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [ ]:
import os
# Each record in the file is "<label>\t<message>".  Read it as UTF-8 so the
# result does not depend on the platform's default encoding, and split on the
# first tab only so a message that itself contains a tab is kept whole.
with open(os.path.join("datasets", "smsspam", "SMSSpamCollection"), encoding="utf-8") as f:
    lines = [line.strip().split("\t", 1) for line in f]

# Skip blank or malformed lines that lack a label/message pair; indexing
# them below would raise IndexError.
lines = [fields for fields in lines if len(fields) == 2]

text = [x[1] for x in lines]
# Boolean target: True for "ham", False for every other label.
y = [x[0] == "ham" for x in lines]
In [ ]:
# Peek at the first ten message bodies (the second tab-separated field of each line).
text[:10]
In [ ]:
# First ten labels; True means the line was tagged "ham".
y[:10]
In [ ]:
# `text` was built with a list comprehension, so this is a plain Python list.
type(text)
In [ ]:
# `y` is likewise a plain Python list (of booleans).
type(y)
In [ ]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18.  Fall back to the old module
# only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Fixed random_state keeps the train/test partition reproducible between runs.
text_train, text_test, y_train, y_test = train_test_split(text, y, random_state=42)
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only, then encode both
# splits as token-count matrices using that same vocabulary.
vectorizer = CountVectorizer().fit(text_train)
X_train, X_test = (vectorizer.transform(docs) for docs in (text_train, text_test))
In [ ]:
# Number of distinct tokens the vectorizer learned from the training messages.
n_vocabulary_terms = len(vectorizer.vocabulary_)
print(n_vocabulary_terms)
In [ ]:
# Shape of the training matrix produced by vectorizer.transform(text_train).
X_train.shape
In [ ]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so the printout looks the same for a list or an array.
print([str(name) for name in feature_names[:20]])
In [ ]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# A slice from the middle of the vocabulary, converted to plain str for printing.
print([str(name) for name in feature_names[3000:3020]])
In [ ]:
# Report the shape of the training matrix, then of the test matrix.
for split_matrix in (X_train, X_test):
    print(split_matrix.shape)
We can now train a classifier, for instance a logistic regression classifier, which is a fast baseline for text classification tasks:
In [ ]:
from sklearn.linear_model import LogisticRegression
# Instantiate with default hyperparameters; nothing is fitted yet.
clf = LogisticRegression()
# Bare name as the cell's last expression, so the notebook shows the estimator's repr.
clf
In [ ]:
# Fit on the token-count matrix of the training messages; targets are True for "ham".
clf.fit(X_train, y_train)
We can now evaluate the classifier on the test set. Let's first use the built-in score function, which returns the rate of correct classification on the test set:
In [ ]:
# Score on the held-out test split.
clf.score(X_test, y_test)
We can also compute the score on the training set, to see how well we do there:
In [ ]:
# Score on the data the model was fitted on, for comparison with the test score.
clf.score(X_train, y_train)
In [ ]:
def visualize_coefficients(classifier, feature_names, n_top_features=25):
    """Plot the most negative and most positive coefficients of a linear model.

    Parameters
    ----------
    classifier : fitted estimator exposing ``coef_``
        Its coefficients are flattened with ``ravel()`` before ranking.
    feature_names : sequence of str
        Name of each feature, indexed like the flattened coefficients.
    n_top_features : int, default 25
        How many coefficients to show from each end of the ranking.
    """
    coef = classifier.coef_.ravel()
    ranking = np.argsort(coef)

    # Clamp explicitly: ``ranking[-0:]`` would select every feature instead
    # of none, and a negative count makes no sense.
    n_top = max(0, min(n_top_features, len(ranking)))
    negative_coefficients = ranking[:n_top]
    positive_coefficients = ranking[len(ranking) - n_top:]
    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])

    # Derive the bar count from the selection instead of hard-coding 50, so
    # any n_top_features (not only the default 25) yields matching lengths.
    n_bars = len(interesting_coefficients)
    selected = coef[interesting_coefficients]

    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in selected]
    plt.bar(np.arange(n_bars), selected, color=colors)

    feature_names = np.array(feature_names)
    # Ticks sit one unit to the right of each bar position; the rotated
    # labels are right-aligned (ha="right"), so their text ends at the tick.
    plt.xticks(
        np.arange(1, n_bars + 1),
        feature_names[interesting_coefficients],
        rotation=60,
        ha="right",
    )
In [ ]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
visualize_coefficients(clf, feature_names)
In [ ]:
# Repeat the experiment with min_df=2, which drops tokens that appear in
# fewer than two training messages, then refit and re-score the classifier.
vectorizer = CountVectorizer(min_df=2).fit(text_train)
X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

clf = LogisticRegression().fit(X_train, y_train)
# Training score first, then test score.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))
In [ ]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
visualize_coefficients(clf, feature_names)
In [ ]: