IMDB sentiment classification


In [ ]:
__author__ = 'Nick Dingwall and Christopher Potts'

The IMDB dataset is here:

http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

This should be unpacked and placed in this directory.

Stanford's publicly released GloVe vectors are also required and should be unpacked into this directory:

http://nlp.stanford.edu/data/glove.6B.zip
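
Neither archive is fetched automatically. As a convenience, here is a minimal sketch that downloads and unpacks both into the current directory, assuming network access and sufficient disk space; the URLs are the two given above.


In [ ]:
import tarfile
import urllib.request
import zipfile

# Fetch and unpack the IMDB reviews (a gzipped tarball that
# expands to the `aclImdb/` directory used below):
urllib.request.urlretrieve(
    "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    "aclImdb_v1.tar.gz")
with tarfile.open("aclImdb_v1.tar.gz") as tf:
    tf.extractall()

# Fetch and unpack the GloVe vectors (a zip archive containing
# `glove.6B.50d.txt` among other dimensionalities):
urllib.request.urlretrieve(
    "http://nlp.stanford.edu/data/glove.6B.zip",
    "glove.6B.zip")
with zipfile.ZipFile("glove.6B.zip") as zf:
    zf.extractall()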


In [ ]:
import bootstrap
from collections import defaultdict
import glob
import json
import numpy as np
import os
import pandas as pd
import pickle
from mittens.tf_mittens import Mittens, GloVe
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    classification_report, accuracy_score, 
    confusion_matrix, f1_score)
import utils

Count matrix from the unsupervised data


In [ ]:
def load_texts(dirname):
    """Loads the raw 'unsup' texts and puts them into a `pd.Series`."""
    texts = []
    for filename in glob.glob(os.path.join(dirname, "*.txt")):
        with open(filename) as f:
            texts.append(f.read())
    return pd.Series(texts)

In [ ]:
texts = load_texts(os.path.join('aclImdb', 'train', 'unsup'))

In [ ]:
X = utils.build_weighted_matrix(texts)

In [ ]:
print("Build a word x word matrix with dimensionality {:,} x {:,}".format(*X.shape))

Train/test split


In [ ]:
def load_labeled_data(dirname):
    """Tokenize the train or test portion of the data, as given by 
    `dirname`. Returns a list of `(tokens, cls)` pairs where `tokens` 
    is a list of str and `cls` is a string.    
    """
    data = []
    for cls in ['neg', 'pos']:
        for filename in glob.glob(os.path.join(dirname, cls, "*.txt")):
            with open(filename) as f:
                tokens = utils.basic_tokenizer(f.read())
                data.append((tokens, cls))
    return data

In [ ]:
train_data = load_labeled_data(os.path.join('aclImdb', 'train'))

In [ ]:
vocab = {w for tokens, _ in train_data for w in tokens}

In [ ]:
test_data = load_labeled_data(os.path.join('aclImdb', 'test'))

Featurization


In [ ]:
GLOVE_LOOKUP = utils.create_glove_lookup('glove.6B.50d.txt')
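
`utils.create_glove_lookup` is likewise defined outside this notebook. What it needs to do is parse the GloVe distribution file (one word per line, followed by its vector) into a word-to-vector map; a minimal sketch follows. The `defaultdict` fallback to a random vector for unseen words is an assumption, but it is the behavior that `_get_rep` below relies on.


In [ ]:
def create_glove_lookup_sketch(filename, n=50):
    """Illustrative sketch: parse a GloVe text file into a map from
    words to vectors. Unseen words get a random vector, which the
    `defaultdict` caches so repeated lookups are consistent.
    """
    lookup = defaultdict(lambda: np.random.randn(n))
    with open(filename, encoding='utf8') as f:
        for line in f:
            parts = line.split()
            lookup[parts[0]] = np.array(parts[1:], dtype=float)
    return lookup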

In [ ]:
def featurize(data, lookup):
    """Featurize `data` according to `lookup`, a map from
    strings to vectors. The return values are `np.array`s,
    with each example in `X` represented by the sum of
    the vectors for the words it contains.
    """
    X = []
    y = []
    for tokens, label in data:            
        x = np.array([_get_rep(w, lookup) for w in tokens])
        x = x.sum(axis=0)
        X.append(x)
        y.append(label)
    return np.array(X), np.array(y)

def _get_rep(w, lookup):
    """Try to look up `w` in `lookup`, and fall back to GloVe
    for out-of-vocabulary words. If a word is also not in
    GloVe, then its representation is random. Note: this relies
    on `GLOVE_LOOKUP` supplying (and caching) a random vector
    for unseen words, rather than raising a `KeyError`.
    """
    if w in lookup:
        return lookup[w]
    else:
        return GLOVE_LOOKUP[w]

Experiment framework


In [ ]:
def experiment(train_data, test_data, lookup, label, trial_num):
    """Run a standard IMDB movie review experiment using `lookup` as 
    the basis for representing examples. The results are pickled to a 
    file called "results/imdb_{label}_trial{trial_num}.pickle".
    """
    output_filename = "results/imdb_{}_trial{}.pickle".format(label, trial_num)
    # Make sure the output directory exists before the pickling
    # step at the end:
    os.makedirs("results", exist_ok=True)

    results = {}
    
    # Model:
    cv = GridSearchCV(
        RandomForestClassifier(), 
        param_grid={
            'n_estimators': [100, 200, 300, 400, 500],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [3, 5, None]}, 
        refit=True, 
        n_jobs=-1)  
    
    # Split:
    X_train, y_train = featurize(train_data, lookup)
    X_test, y_test = featurize(test_data, lookup)
    
    # Fit with best estimator and predict:
    cv.fit(X_train, y_train)
    predictions = cv.predict(X_test) 
    
    # CV info:
    results['cv_results'] = cv.cv_results_
    results['best_params'] = cv.best_params_
    results['best_score'] = cv.best_score_
        
    # Test-set scoring:
    acc = accuracy_score(y_test, predictions)               
    results['accuracy'] = acc
    results['confusion_matrix'] = confusion_matrix(y_test, predictions)
    results['f1'] = f1_score(y_test, predictions, average=None)
    results['f1_macro'] = f1_score(y_test, predictions, average='macro')
    results['f1_micro'] = f1_score(y_test, predictions, average='micro')
    
    # Summary report:
    print("Accuracy: {0:0.04%}".format(acc))
    print("Best params:", cv.best_params_)
          
    # Storage:
    with open(output_filename, 'wb') as f:
        pickle.dump(results, f)

Experiments


In [ ]:
n_trials = 5
max_iter = 50000
embedding_dim = 50
eta = 0.05

Random


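`create_random_lookup` is used below but not defined or imported above. Here is a minimal sketch, assuming each word in the training vocabulary simply gets its own random vector with the same dimensionality as the other embeddings; the standard-normal initialization is an assumption.


In [ ]:
def create_random_lookup(vocab, n=embedding_dim):
    """Map each word in `vocab` to a random `n`-dimensional vector.
    (Sketch: the normal initialization is an assumption; any scheme
    matching the embedding dimensionality would do.)
    """
    return {w: np.random.randn(n) for w in vocab}
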
In [ ]:
for trial_num in range(1, n_trials+1):
    random_lookup = create_random_lookup(vocab)
    experiment(train_data, test_data, random_lookup, 'random', trial_num)

External GloVe


In [ ]:
experiment(train_data, test_data, GLOVE_LOOKUP, 'external_glove', 1)

IMDB GloVe


In [ ]:
for trial_num in range(1, n_trials+1):    
    glove = GloVe(max_iter=max_iter, n=embedding_dim, eta=eta)
    G = glove.fit(X.values)
    G = pd.DataFrame(G, index=X.index)
    G.to_csv("imdb_glove_embedding_{}.csv.gzip".format(trial_num), compression='gzip')
    imdb_glove_lookup = utils.create_lookup(G)    
    experiment(train_data, test_data, imdb_glove_lookup, 'imdb_glove', trial_num)

Mittens


In [ ]:
for trial_num in range(1, n_trials+1):
    # `mittens` is the strength of the penalty keeping the learned
    # vectors close to the initial GloVe embeddings:
    mittens = Mittens(max_iter=max_iter, n=embedding_dim, eta=eta, mittens=1.0)
    G_mittens = mittens.fit(
        X.values, 
        vocab=list(X.index), 
        initial_embedding_dict=GLOVE_LOOKUP)
    G_mittens = pd.DataFrame(G_mittens, index=X.index)
    G_mittens.to_csv("imdb_mittens_embedding_{}.csv.gzip".format(trial_num), compression='gzip')
    mittens_lookup = utils.create_lookup(G_mittens)    
    experiment(train_data, test_data, mittens_lookup, 'mittens', trial_num)

Convert pickled results to JSON for portability


In [ ]:
def convert_all(dirname):
    for filename in glob.glob(os.path.join(dirname, "*.pickle")):
        convert(filename)

def convert(filename):
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    data = type_convert(data)
        
    output_filename = filename.replace(".pickle", ".json")
    with open(output_filename, 'wt') as f:
        json.dump(data, f, indent=4, sort_keys=True)
    return data

def type_convert(d):
    for k, v in d.items():
        if isinstance(v, dict):
            v = type_convert(v)
        # Check `MaskedArray` before `ndarray`, since the former
        # is a subclass of the latter:
        if isinstance(v, np.ma.core.MaskedArray):
            v = {'data': v.data.tolist(), 'mask': v.mask.tolist()}
        elif isinstance(v, np.ndarray):
            v = v.tolist()
        d[k] = v
    return d

In [ ]:
convert_all("results")

Analysis


In [ ]:
def get_ci(vals):
    """Bootstrapped 95% confidence intervals."""
    return bootstrap.ci(vals, method='bca')

def analyze_model(model_name):
    data = []
    base = "imdb_{}_trial*.json".format(model_name)
    filenames = glob.glob(os.path.join("results", base))
    for filename in filenames:
        with open(filename, 'rt') as f:
            results = json.load(f)
            data.append(results['accuracy'])
    data = np.array(data)
    mu = "${:0.02%}$".format(data.mean())
    if len(data) > 1:
        ci = "${:0.02%}-{:0.02%}$".format(*get_ci(data))
    else:
        ci = "$-$"
    print("{:>20} & {} & {}".format(model_name, mu, ci))

In [ ]:
for model_name in ('random', 'external_glove', 'imdb_glove', 'mittens'):
    analyze_model(model_name)