A good starting point for understanding recent work in sentiment analysis and text classification is Baselines and Bigrams: Simple, Good Sentiment and Topic Classification by Sida Wang and Christopher D. Manning. In this notebook, I'll implement the models described in that paper and try to reproduce their results on several datasets.
Accuracy (%) by model and dataset:

| Model | AthR | XGraph | BbCrypt | CR | IMDB | MPQA | RT-2k | RTs | subj |
|---|---|---|---|---|---|---|---|---|---|
| MNB-bigram | 85.13 | 91.19 | 99.40 | 79.97 | 86.59 | 86.27 | 85.85 | 79.03 | 93.56 |
| MNB-unigram | 84.99 | 89.96 | 99.29 | 79.76 | 83.55 | 85.29 | 83.45 | 77.94 | 92.58 |
| SVM-bigram | 83.73 | 86.17 | 97.68 | 80.85 | 89.16 | 86.72 | 87.40 | 77.72 | 91.74 |
| SVM-unigram | 82.61 | 85.14 | 98.29 | 79.02 | 86.95 | 86.15 | 86.25 | 76.23 | 90.84 |
| NBSVM-bigram | 87.66 | 90.68 | 99.50 | 81.75 | 91.22 | 86.32 | 89.45 | 79.38 | 93.18 |
| NBSVM-unigram | 87.94 | 91.19 | 99.70 | 80.45 | 88.29 | 85.25 | 87.80 | 78.05 | 92.40 |
The baselines and bigrams paper uses several standard datasets to run sentiment analysis experiments. In this section I'll show how to prepare these datasets for training and evaluating classifiers.
The RT-2k dataset consists of 2,000 full-length movie reviews and was introduced in Pang and Lee, 2004.
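The paper's RT-2k experiments use this corpus. Since this section only walks through the IMDB dataset in detail, here is a minimal loading sketch, assuming the standard `txt_sentoken/pos` and `txt_sentoken/neg` layout of the polarity v2.0 release; the directory path is a placeholder and `load_rt2k` is a helper name introduced here, not something from the paper or the notebook.
In [ ]:
import glob
import os

def load_rt2k(data_directory='path/to/review_polarity/txt_sentoken'):
    # assumes 1,000 positive reviews under pos/ and 1,000 negative reviews under neg/
    texts, labels = [], []
    for label, subdir in [(1, 'pos'), (0, 'neg')]:
        for path in sorted(glob.glob(os.path.join(data_directory, subdir, '*.txt'))):
            with open(path, encoding='utf-8', errors='ignore') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels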
In [ ]:
import numpy as np
import pandas as pd
IMDB: a large movie review dataset with 50,000 full-length reviews, introduced in Maas et al., 2011.
In [64]:
imdb_df = pd.read_csv('/home/data/sentiment-analysis-and-text-classification/baselines-and-bigrams/aclImdb/data-frame.csv')
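The data-frame.csv file above is a pre-built frame of the raw aclImdb download; its construction isn't shown in this notebook. The sketch below is one plausible way to assemble it, assuming the standard train/{pos,neg} and test/{pos,neg} layout, the 'review' and 'sentiment' column names used in the next cell, and integer 1/0 labels (an assumption). `build_imdb_frame` is a helper name introduced here.
In [ ]:
import glob
import os
import pandas as pd

def build_imdb_frame(data_directory):
    # walk train/ then test/, positives before negatives, one row per review
    rows = []
    for split in ('train', 'test'):
        for label_name, label in (('pos', 1), ('neg', 0)):
            pattern = os.path.join(data_directory, split, label_name, '*.txt')
            for path in sorted(glob.glob(pattern)):
                with open(path, encoding='utf-8') as f:
                    rows.append({'review': f.read(), 'sentiment': label})
    return pd.DataFrame(rows)

# imdb_df = build_imdb_frame('<path to aclImdb>')
# imdb_df.to_csv('data-frame.csv', index=False)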
In [66]:
# .loc slicing is end-inclusive, so stop at 24999 to get exactly 25,000 training rows
imdb_X_train = imdb_df.loc[:24999, 'review'].values
imdb_y_train = imdb_df.loc[:24999, 'sentiment'].values
imdb_X_test = imdb_df.loc[25000:, 'review'].values
imdb_y_test = imdb_df.loc[25000:, 'sentiment'].values
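The positional split above assumes the data frame stores the 25,000 training reviews before the 25,000 test reviews; a quick sanity check on shapes and label balance:
In [ ]:
print(imdb_X_train.shape, imdb_X_test.shape)
print(pd.Series(imdb_y_train).value_counts())
print(pd.Series(imdb_y_test).value_counts())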
In [67]:
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
In [71]:
imdb_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])
_ = imdb_clf.fit(imdb_X_train, imdb_y_train)
In [72]:
predicted = imdb_clf.predict(imdb_X_test)
np.mean(predicted == imdb_y_test)
Out[72]:
In [73]:
print(metrics.classification_report(imdb_y_test, predicted, target_names=['neg', 'pos']))
metrics.confusion_matrix(imdb_y_test, predicted)
Out[73]:
There are several implementations of NBSVM available. I'll follow the elegant scikit-learn-compatible implementation by Joshua Chin.
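For reference, NBSVM combines the two baselines from the paper. With feature count vectors $f^{(i)}$, smoothing parameter $\alpha$, and labels $y^{(i)} \in \{0, 1\}$ (the implementation's convention), the multinomial naive Bayes log-count ratio is

$$
p = \alpha + \sum_{i:\, y^{(i)} = 1} f^{(i)}, \qquad
q = \alpha + \sum_{i:\, y^{(i)} = 0} f^{(i)}, \qquad
r = \log \frac{p \,/\, \lVert p \rVert_1}{q \,/\, \lVert q \rVert_1}.
$$

A linear SVM is trained on the element-wise scaled features $r \circ f^{(k)}$, and its weight vector $w$ is interpolated toward the naive Bayes solution as $w' = (1 - \beta)\,\bar{w} + \beta\,w$, where $\bar{w}$ is the mean magnitude of $w$ and $\beta$ is the interpolation parameter. The implementation below follows this recipe, folding $r$ into the stored coefficients so predictions can be made directly on raw count features.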
In [ ]:
from scipy.sparse import spmatrix, coo_matrix
from sklearn.base import BaseEstimator
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin  # sklearn.linear_model._base in newer scikit-learn releases
from sklearn.svm import LinearSVC
In [32]:
class NBSVM(BaseEstimator, LinearClassifierMixin, SparseCoefMixin):

    def __init__(self, alpha=1, C=1, beta=0.25, fit_intercept=False):
        self.alpha = alpha
        self.C = C
        self.beta = beta
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        if len(self.classes_) == 2:
            coef_, intercept_ = self._fit_binary(X, y)
            self.coef_ = coef_
            self.intercept_ = intercept_
        else:
            # one-vs-rest: fit one binary NBSVM per class
            coef_, intercept_ = zip(*[
                self._fit_binary(X, y == class_)
                for class_ in self.classes_
            ])
            self.coef_ = np.concatenate(coef_)
            self.intercept_ = np.array(intercept_).flatten()
        return self

    def _fit_binary(self, X, y):
        # naive Bayes log-count ratio r and log class-prior ratio b
        p = np.asarray(self.alpha + X[y == 1].sum(axis=0)).flatten()
        q = np.asarray(self.alpha + X[y == 0].sum(axis=0)).flatten()
        r = np.log(p / np.abs(p).sum()) - np.log(q / np.abs(q).sum())
        b = np.log((y == 1).sum()) - np.log((y == 0).sum())

        # scale the features element-wise by r before training the SVM
        if isinstance(X, spmatrix):
            indices = np.arange(len(r))
            r_sparse = coo_matrix(
                (r, (indices, indices)),
                shape=(len(r), len(r))
            )
            X_scaled = X * r_sparse
        else:
            X_scaled = X * r

        lsvc = LinearSVC(
            C=self.C,
            fit_intercept=self.fit_intercept,
            max_iter=10000
        ).fit(X_scaled, y)

        # interpolate between the MNB weights (r) and the SVM weights,
        # folding r into the stored coefficients so that predictions
        # work directly on raw count features
        mean_mag = np.abs(lsvc.coef_).mean()
        coef_ = (1 - self.beta) * mean_mag * r + self.beta * (r * lsvc.coef_)
        intercept_ = (1 - self.beta) * mean_mag * b + self.beta * lsvc.intercept_

        return coef_, intercept_
In [108]:
imdb_nbsvm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', NBSVM())
])
_ = imdb_nbsvm.fit(imdb_X_train, imdb_y_train)
In [109]:
predicted = imdb_nbsvm.predict(imdb_X_test)
np.mean(predicted == imdb_y_test)
Out[109]:
In [110]:
print(metrics.classification_report(imdb_y_test, predicted, target_names=['neg', 'pos']))
metrics.confusion_matrix(imdb_y_test, predicted)
Out[110]:
In [133]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
imdb_nbsvm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', NBSVM())
])
param_distributions = {
    'vect__ngram_range': [(1, 2), (1, 3)],
    'vect__stop_words': [None],
    'clf__beta': uniform(0, 1),
    'clf__C': [1.0]
}
rsearch = RandomizedSearchCV(estimator=imdb_nbsvm, param_distributions=param_distributions, n_iter=25)
rsearch.fit(imdb_X_train, imdb_y_train);
In [139]:
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_params_)
In [143]:
imdb_nbsvm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', NBSVM(C=1.0, beta=0.53300408355730355))
])
imdb_nbsvm.fit(imdb_X_train, imdb_y_train);
In [144]:
predicted = imdb_nbsvm.predict(imdb_X_test)
In [145]:
np.mean(predicted == imdb_y_test)
Out[145]:
In [146]:
print(metrics.classification_report(imdb_y_test, predicted, target_names=['neg', 'pos']))
metrics.confusion_matrix(imdb_y_test, predicted)
Out[146]:
In [ ]:
imdb_nbsvm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('clf', NBSVM(C=0.3, beta=0.5))
])
In [56]:
# step-by-step check of the log-count ratio r computed in NBSVM._fit_binary
p = np.asarray(1.0 + X_train[y_train == 1].sum(axis=0)).flatten()
q = np.asarray(1.0 + X_train[y_train == 0].sum(axis=0)).flatten()
r = np.log(p / np.abs(p).sum()) - np.log(q / np.abs(q).sum())
In [59]:
print(X_train.shape)
print("p", p.shape)
print("q", q.shape)
print("r", r.shape)
indices = np.arange(len(r))
print(len(indices))
indices[:10]
Out[59]:
In [61]:
# quick check of how zip(*...) unpacks a list of (coef_, intercept_)-style pairs, as in NBSVM.fit
for x in zip(*[(1, [1, 2, 3]), (1, [1, 2, 3]), (1, [1, 2, 3]), (1, [1, 2, 3])]): print(x)
In [26]:
import glob
import os
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
def load_imdb(data_directory='/home/data/sentiment-analysis-and-text-classification/baselines-and-bigrams/aclImdb'):
    print("Vectorizing Training Text")
    train_pos = glob.glob(os.path.join(data_directory, 'train', 'pos', '*.txt'))
    train_neg = glob.glob(os.path.join(data_directory, 'train', 'neg', '*.txt'))
    # tokenize words and individual punctuation marks
    token_pattern = r'\w+|[%s]' % string.punctuation
    vectorizer = CountVectorizer(
        input='filename',
        ngram_range=(1, 3),
        token_pattern=token_pattern,
        binary=True
    )
    X_train = vectorizer.fit_transform(train_pos + train_neg)
    y_train = np.array([1] * len(train_pos) + [0] * len(train_neg))
    print("Vocabulary Size: %s" % len(vectorizer.vocabulary_))

    print("Vectorizing Testing Text")
    test_pos = glob.glob(os.path.join(data_directory, 'test', 'pos', '*.txt'))
    test_neg = glob.glob(os.path.join(data_directory, 'test', 'neg', '*.txt'))
    X_test = vectorizer.transform(test_pos + test_neg)
    y_test = np.array([1] * len(test_pos) + [0] * len(test_neg))

    return X_train, y_train, X_test, y_test
In [62]:
%time X_train, y_train, X_test, y_test = load_imdb()
In [30]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
In [33]:
mnbsvm = NBSVM()
mnbsvm.fit(X_train, y_train)
Out[33]:
In [34]:
print('Test Accuracy: %s' % mnbsvm.score(X_test, y_test))
In [41]:
X_test[0]
Out[41]:
In [42]:
11495 / (11495 + 987)
Out[42]:
In [45]:
predicted[:30]
Out[45]:
In [47]:
y_test[-30:]
Out[47]:
In [ ]: