IMDB Movie Review Sentiment Analysis with gensim

Binary sentiment classification on IMDB movie reviews using Doc2Vec document embeddings trained with gensim


In [2]:
# Import packages
import os
import re
import tflearn  # NOTE(review): not used in any visible cell -- confirm before removing
import numpy as np
import tensorflow as tf  # NOTE(review): not used in any visible cell -- confirm before removing
from random import shuffle
from bs4 import BeautifulSoup  # HTML markup stripping during preprocessing
from nltk.corpus import stopwords  # English stopword list (requires nltk data download)
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Define directories (aclImdb dataset from Maas et al., expected relative to the notebook)
train_pos_dir = 'aclImdb/train/pos/'
train_neg_dir = 'aclImdb/train/neg/'
train_unsup_dir = 'aclImdb/train/unsup/'
test_pos_dir = 'aclImdb/test/pos/'
test_neg_dir = 'aclImdb/test/neg/'

# Define dataset sizes
labeled_set_size = 12500    # reviews per class in each labeled split (train/test, pos/neg)
unlabeled_set_size = 50000  # extra unlabeled train reviews, used only for doc2vec training

In [3]:
# Read and explore data
train_pos = []
for file_name in os.listdir(train_pos_dir):
    with open(train_pos_dir + file_name, 'r') as myfile:
        train_pos.append(myfile.read())
        
train_neg = []
for file_name in os.listdir(train_neg_dir):
    with open(train_neg_dir + file_name, 'r') as myfile:
        train_neg.append(myfile.read())
        
train_unsup = []
for file_name in os.listdir(train_unsup_dir):
    with open(train_unsup_dir + file_name, 'r') as myfile:
        train_unsup.append(myfile.read())
        
test_pos = []
for file_name in os.listdir(test_pos_dir):
    with open(test_pos_dir + file_name, 'r') as myfile:
        test_pos.append(myfile.read())
        
test_neg = []
for file_name in os.listdir(test_neg_dir):
    with open(test_neg_dir + file_name, 'r') as myfile:
        test_neg.append(myfile.read())     
        
print '%i positive train reviews:' % len(train_pos)
print train_pos[0][:50]
print '\n%i negative train reviews:' % len(train_neg)
print train_neg[0][:50]
print '\n%i unlabeled train reviews:' % len(train_unsup)
print train_unsup[0][:50]
print '\n%i positive test reviews:' % len(test_pos)
print test_pos[0][:50]
print '\n%i negative test reviews:' % len(test_neg)
print test_neg[0][:50]


12500 positive train reviews:
This anime was underrated and still is. Hardly the

12500 negative train reviews:
Whoever wrote the script for this movie does not d

50000 unlabeled train reviews:
The movie Contagion was a well thought out story t

12500 positive test reviews:
I'm not sure what version of the film I saw, but i

12500 negative test reviews:
I think there's a reason this film never came clos

In [3]:
# Preprocess data
stop_words = set(stopwords.words("english"))

def parse_html(data):
    data = BeautifulSoup(data, 'lxml').get_text() # Remove markup
    data = re.sub("[^a-zA-Z]"," ", data) # Remove all non-alphanumeric characters
    data = [x for x in data.lower().split() if not x in stop_words] # Remove stopwords
    return data
    
for i in xrange(labeled_set_size):
    train_pos[i] = TaggedDocument(parse_html(train_pos[i]), ['train_pos_' + str(i)])
    train_neg[i] = TaggedDocument(parse_html(train_neg[i]), ['train_neg_' + str(i)])
    test_pos[i] = TaggedDocument(parse_html(test_pos[i]), ['test_pos_' + str(i)])
    test_neg[i] = TaggedDocument(parse_html(test_neg[i]), ['test_neg_' + str(i)])
    
for i in xrange(unlabeled_set_size):
    train_unsup[i] = TaggedDocument(parse_html(train_unsup[i]), ['train_unsup_' + str(i)])
    
print '%i positive train reviews:' % len(train_pos)
print train_pos[0][0][:8]
print '\n%i negative train reviews:' % len(train_neg)
print train_neg[0][0][:8]
print '\n%i unlabeled train reviews:' % len(train_unsup)
print train_unsup[0][0][:8]
print '\n%i positive test reviews:' % len(test_pos)
print test_pos[0][0][:8]
print '\n%i negative test reviews:' % len(test_neg)
print test_neg[0][0][:8]


12500 positive train reviews:
[u'anime', u'underrated', u'still', u'hardly', u'dorky', u'kids', u'movie', u'noted']

12500 negative train reviews:
[u'whoever', u'wrote', u'script', u'movie', u'deserve', u'work', u'hollywood', u'even']

50000 unlabeled train reviews:
[u'movie', u'contagion', u'well', u'thought', u'story', u'average', u'acting', u'corny']

12500 positive test reviews:
[u'sure', u'version', u'film', u'saw', u'entertaining', u'know', u'twins', u'gillian']

12500 negative test reviews:
[u'think', u'reason', u'film', u'never', u'came', u'close', u'hitting', u'theaters']

In [4]:
# Embed documents using doc2vec
if os.path.isfile('d2v'):
    # Reuse a previously saved model to skip the expensive training run
    d2v = Doc2Vec.load('d2v')
else:
    workers = 8 # Number of virtual CPU cores on machine
    window = 16 # Skip-gram window
    min_count = 30 # 30 is the max number of reviews per movie in the dataset

    # Train on every review, labeled and unlabeled -- doc2vec training is unsupervised
    all_reviews = train_pos + train_neg + train_unsup + test_pos + test_neg
    # NOTE(review): vector size is left at the gensim default; the classification
    # cells below assume 300-dim vectors -- confirm against the installed gensim version
    d2v = Doc2Vec(window=window, workers=workers, min_count=min_count)
    d2v.build_vocab(all_reviews)

    # 10 manual passes over the corpus
    for i in range(10):
        shuffle(all_reviews) # Make sure to shuffle each epoch
        # NOTE(review): train() without total_examples/epochs only works in old
        # gensim (< 2.x); newer versions require those arguments -- confirm
        d2v.train(all_reviews)
        print 'epoch %i complete' % (i + 1)

    d2v.save('d2v')

In [5]:
# Examine embedding
print 'Most similar to man: '
print d2v.most_similar('man')[:3]
print '\nMost similar to movie: '
print d2v.most_similar('movie')[:3]


Most similar to man: 
[(u'lady', 0.7224129438400269), (u'woman', 0.6937535405158997), (u'guy', 0.6755703687667847)]

Most similar to movie: 
[(u'film', 0.9590884447097778), (u'flick', 0.7753545641899109), (u'show', 0.7212437987327576)]

In [12]:
# Get train and test embedded vectors for classification
x_vector_size = 300  # Must match the Doc2Vec vector size used above
y_vector_size = 2    # NOTE(review): unused below -- labels are a 1-D 0/1 vector, not one-hot

# Positive reviews fill the first half of each matrix, negative the second half;
# labels are 1.0 for positive and 0.0 for negative.
train_x = np.empty([2 * labeled_set_size, x_vector_size])
train_y = np.concatenate((np.ones(labeled_set_size), np.zeros(labeled_set_size)))
test_x = np.empty([2 * labeled_set_size, x_vector_size])
test_y = np.concatenate((np.ones(labeled_set_size), np.zeros(labeled_set_size)))

# Look up each review's learned document vector by the tag assigned during preprocessing
for i in xrange(labeled_set_size):
    train_x[i] = d2v.docvecs['train_pos_' + str(i)]
    test_x[i] = d2v.docvecs['test_pos_' + str(i)]
    train_x[i + labeled_set_size] = d2v.docvecs['train_neg_' + str(i)]
    test_x[i + labeled_set_size] = d2v.docvecs['test_neg_' + str(i)]

# Shuffle the training set (same permutation for x and y) so classes are interleaved.
# NOTE(review): no random seed is set, so this shuffle is not reproducible -- confirm intent.
shuffled_i = np.random.permutation(np.arange(len(train_x)))
train_x = train_x[shuffled_i]
train_y = train_y[shuffled_i]

In [23]:
# Train and test logistic regression classifier
from sklearn.linear_model import LogisticRegression

logistic_regression_model = LogisticRegression().fit(train_x, train_y)
print 'Test set accuracy: ' + '{:.2%}'.format(logistic_regression_model.score(test_x, test_y))


Test set accuracy: 86.84%