In [ ]:
__author__ = 'Nick Dingwall and Christopher Potts'
The IMDB dataset is available here:
http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
It should be unpacked and placed in this directory.
Stanford's publicly released GloVe vectors are also required and should likewise be unpacked into this directory; this notebook uses `glove.6B.50d.txt` from the `glove.6B` release.
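As a quick sanity check, the following optional cell verifies that the unpacked corpus directory and the GloVe file used below are in place (adjust the paths if your copies live elsewhere):
In [ ]:
import os

for path in ['aclImdb', 'glove.6B.50d.txt']:
    assert os.path.exists(path), "Missing required data: {}".format(path)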
In [ ]:
import bootstrap
from collections import defaultdict
import glob
import json
import numpy as np
import os
import pandas as pd
import pickle
from mittens.tf_mittens import Mittens, GloVe
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
classification_report, accuracy_score,
confusion_matrix, f1_score)
import utils
In [ ]:
def load_texts(dirname):
"""Loads the raw 'unsup' texts and puts them into a `pd.Series`."""
texts = []
for filename in glob.glob(os.path.join(dirname, "*.txt")):
with open(filename) as f:
texts.append(f.read())
return pd.Series(texts)
In [ ]:
texts = load_texts(os.path.join('aclImdb', 'train', 'unsup'))
In [ ]:
X = utils.build_weighted_matrix(texts)
In [ ]:
print("Build a word x word matrix with dimensionality {:,} x {:,}".format(*X.shape))
In [ ]:
def load_labeled_data(dirname):
"""Tokenize the train or test portion of the data, as given by
`dirname`. Returns a list of `(tokens, cls)` pairs where `tokens`
is a list of str and `cls` is a string.
"""
data = []
for cls in ['neg', 'pos']:
for filename in glob.glob(os.path.join(dirname, cls, "*.txt")):
with open(filename) as f:
tokens = utils.basic_tokenizer(f.read())
data.append((tokens, cls))
return data
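`utils.basic_tokenizer` is also defined outside this notebook. If a stand-in is needed, a minimal sketch that lowercases the text and keeps runs of letters, digits, and apostrophes is given below; the actual tokenizer in `utils` may behave differently.
In [ ]:
import re

def basic_tokenizer_sketch(text):
    """Rough stand-in for `utils.basic_tokenizer`: lowercase the text and
    return runs of letters, digits, and apostrophes as tokens."""
    return re.findall(r"[a-z0-9']+", text.lower())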
In [ ]:
train_data = load_labeled_data(os.path.join('aclImdb', 'train'))
In [ ]:
vocab = {w for tokens, _ in train_data for w in tokens}
In [ ]:
test_data = load_labeled_data(os.path.join('aclImdb', 'test'))
In [ ]:
GLOVE_LOOKUP = utils.create_glove_lookup('glove.6B.50d.txt')
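`utils.create_glove_lookup` reads the GloVe text file into a word-to-vector mapping. A minimal sketch is below; the `defaultdict` fallback to a random vector mirrors the behavior `_get_rep` relies on later in this notebook (words missing from GloVe get random representations), though the particular distribution is an assumption.
In [ ]:
def create_glove_lookup_sketch(glove_filename, n=50):
    """Rough stand-in for `utils.create_glove_lookup`: map each word in the
    GloVe file to its vector, with a random fallback for unseen words."""
    lookup = defaultdict(lambda: np.random.uniform(low=-0.5, high=0.5, size=n))
    with open(glove_filename, encoding='utf8') as f:
        for line in f:
            parts = line.strip().split()
            lookup[parts[0]] = np.array(parts[1:], dtype=float)
    return lookup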
In [ ]:
def featurize(data, lookup):
"""Featurizing `data` according to `lookup`, a map from
strings to vectors. The return values are `np.arrays`,
with each examples in `X` represented by the sum of
the vectors for the words it contains.
"""
X = []
y = []
for tokens, label in data:
x = np.array([_get_rep(w, lookup) for w in tokens])
x = x.sum(axis=0)
X.append(x)
y.append(label)
return np.array(X), np.array(y)
def _get_rep(w, lookup):
"""Try to look up `w` in `lookup`, and fall back to GloVe
for out of vocabulary words. If a word is also not in
GloVe, then its representation is random.
"""
if w in lookup:
return lookup[w]
else:
return GLOVE_LOOKUP[w]
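A toy sanity check for `featurize`, using two made-up 2-dimensional vectors, shows that each example is represented by the sum of its word vectors:
In [ ]:
toy_lookup = {'good': np.array([1.0, 0.0]), 'bad': np.array([0.0, 1.0])}
X_toy, y_toy = featurize([(['good', 'good', 'bad'], 'pos')], toy_lookup)
X_toy  # array([[2., 1.]])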
In [ ]:
def experiment(train_data, test_data, lookup, label, trial_num):
"""Run a standard IMDB movie review experiment using `lookup` as
the basis for representing examples. The results are pickled to a
    file called "results/imdb_{label}_trial{trial_num}.pickle".
"""
output_filename = "results/imdb_{}_trial{}.pickle".format(label, trial_num)
results = {}
# Model:
cv = GridSearchCV(
RandomForestClassifier(),
param_grid={
'n_estimators': [100, 200, 300, 400, 500],
'max_features': ['sqrt', 'log2'],
'max_depth': [3, 5, None]},
refit=True,
n_jobs=-1)
# Split:
X_train, y_train = featurize(train_data, lookup)
X_test, y_test = featurize(test_data, lookup)
# Fit with best estimator and predict:
cv.fit(X_train, y_train)
predictions = cv.predict(X_test)
# CV info:
results['cv_results'] = cv.cv_results_
results['best_params'] = cv.best_params_
results['best_score'] = cv.best_score_
# Test-set scoring:
acc = accuracy_score(y_test, predictions)
results['accuracy'] = acc
results['confusion_matrix'] = confusion_matrix(y_test, predictions)
results['f1'] = f1_score(y_test, predictions, average=None)
results['f1_macro'] = f1_score(y_test, predictions, average='macro')
results['f1_micro'] = f1_score(y_test, predictions, average='micro')
# Summary report:
print("Accuracy: {0:0.04%}".format(acc))
print("Best params:", cv.best_params_)
# Storage:
with open(output_filename, 'wb') as f:
pickle.dump(results, f)
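`experiment` pickles its results into a `results/` subdirectory, so that directory needs to exist before any of the trials below are run:
In [ ]:
os.makedirs("results", exist_ok=True)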
In [ ]:
n_trials = 5
max_iter = 50000
embedding_dim = 50
eta = 0.05
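`create_random_lookup` is not defined or imported above. The stand-in below assigns every vocabulary item its own fixed random `embedding_dim`-dimensional vector; the distribution used in the original experiments is an assumption.
In [ ]:
def create_random_lookup(vocab, n=embedding_dim):
    """Map each word in `vocab` to a fixed random vector of dimension `n`."""
    return {w: np.random.uniform(low=-0.5, high=0.5, size=n) for w in vocab}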
In [ ]:
for trial_num in range(1, n_trials+1):
random_lookup = create_random_lookup(vocab)
experiment(train_data, test_data, random_lookup, 'random', trial_num)
In [ ]:
experiment(train_data, test_data, GLOVE_LOOKUP, 'external_glove', 1)
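`utils.create_lookup` presumably turns a row-indexed embedding `pd.DataFrame` into a word-to-vector dictionary like the ones used above; a one-line sketch of that assumption:
In [ ]:
def create_lookup_sketch(df):
    """Map each word (the DataFrame index) to its embedding row."""
    return dict(zip(df.index, df.values))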
In [ ]:
for trial_num in range(1, n_trials+1):
glove = GloVe(max_iter=max_iter, n=embedding_dim, eta=eta)
G = glove.fit(X.values)
G = pd.DataFrame(G, index=X.index)
G.to_csv("imdb_glove_embedding_{}.csv.gzip".format(trial_num), compression='gzip')
imdb_glove_lookup = utils.create_lookup(G)
experiment(train_data, test_data, imdb_glove_lookup, 'imdb_glove', trial_num)
In [ ]:
for trial_num in range(1, n_trials+1):
mittens = Mittens(max_iter=max_iter, n=embedding_dim, eta=eta, mittens=1.0)
G_mittens = mittens.fit(
X.values,
vocab=list(X.index),
initial_embedding_dict=GLOVE_LOOKUP)
G_mittens = pd.DataFrame(G_mittens, index=X.index)
    G_mittens.to_csv("imdb_mittens_embedding_{}.csv.gzip".format(trial_num), compression='gzip')
mittens_lookup = utils.create_lookup(G_mittens)
experiment(train_data, test_data, mittens_lookup, 'mittens', trial_num)
In [ ]:
def convert_all(dirname):
for filename in glob.glob(os.path.join(dirname, "*.pickle")):
data = convert(filename)
def convert(filename):
with open(filename, 'rb') as f:
data = pickle.load(f)
data = type_convert(data)
output_filename = filename.replace(".pickle", ".json")
with open(output_filename, 'wt') as f:
json.dump(data, f, indent=4, sort_keys=True)
return data
def type_convert(d):
    """Recursively convert numpy values in `d` into JSON-serializable types."""
    for k, v in d.items():
        if isinstance(v, dict):
            v = type_convert(v)
        if isinstance(v, np.ma.MaskedArray):
            v = {'data': v.data.tolist(), 'mask': v.mask.tolist()}
        elif isinstance(v, np.ndarray):
            v = v.tolist()
        d[k] = v
    return d
In [ ]:
convert_all("results")
In [ ]:
def get_ci(vals):
"""Bootstrapped 95% confidence intervals."""
return bootstrap.ci(vals, method='bca')
def analyze_model(model_name):
data = []
base = "imdb_{}_trial*.json".format(model_name)
filenames = glob.glob(os.path.join("results", base))
for filename in filenames:
with open(filename, 'rt') as f:
results = json.load(f)
data.append(results['accuracy'])
data = np.array(data)
mu = "${:0.02%}$".format(data.mean())
if len(data) > 1:
ci = "${:0.02%}-{:0.02%}$".format(*get_ci(data))
else:
ci = "$-$"
print("{:>20} & {} & {}".format(model_name, mu, ci))
In [ ]:
for model_name in ('random', 'external_glove', 'imdb_glove', 'mittens'):
analyze_model(model_name)