In [2]:
# Import packages
import os
import re
import tflearn
import numpy as np
import tensorflow as tf
from random import shuffle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# Define directories
# Layout of the Stanford Large Movie Review dataset (aclImdb), relative to
# the notebook's working directory. Each directory holds one text file per review.
train_pos_dir = 'aclImdb/train/pos/'    # 12,500 positive labeled training reviews
train_neg_dir = 'aclImdb/train/neg/'    # 12,500 negative labeled training reviews
train_unsup_dir = 'aclImdb/train/unsup/'  # 50,000 unlabeled reviews (used only for doc2vec)
test_pos_dir = 'aclImdb/test/pos/'      # 12,500 positive test reviews
test_neg_dir = 'aclImdb/test/neg/'      # 12,500 negative test reviews
# Define dataset sizes
labeled_set_size = 12500     # reviews per class, per split (train and test)
unlabeled_set_size = 50000   # size of the unsupervised training set
In [3]:
# Read and explore data.
# The same read loop was previously copy-pasted once per directory; a single
# helper removes the duplication.
def read_reviews(directory):
    """Return the contents of every file in `directory` as a list of strings.

    Each review in the aclImdb layout is a standalone text file; the order
    of the returned list follows os.listdir (platform-dependent).
    """
    reviews = []
    for file_name in os.listdir(directory):
        # os.path.join is robust whether or not the directory path ends in '/'
        with open(os.path.join(directory, file_name), 'r') as review_file:
            reviews.append(review_file.read())
    return reviews

train_pos = read_reviews(train_pos_dir)
train_neg = read_reviews(train_neg_dir)
train_unsup = read_reviews(train_unsup_dir)
test_pos = read_reviews(test_pos_dir)
test_neg = read_reviews(test_neg_dir)

# Sanity check: set sizes plus the first 50 characters of one review from each set.
print('%i positive train reviews:' % len(train_pos))
print(train_pos[0][:50])
print('\n%i negative train reviews:' % len(train_neg))
print(train_neg[0][:50])
print('\n%i unlabeled train reviews:' % len(train_unsup))
print(train_unsup[0][:50])
print('\n%i positive test reviews:' % len(test_pos))
print(test_pos[0][:50])
print('\n%i negative test reviews:' % len(test_neg))
print(test_neg[0][:50])
In [3]:
# Preprocess data
stop_words = set(stopwords.words("english"))

def parse_html(data):
    """Clean one raw review for doc2vec.

    Strips HTML markup, replaces every non-letter character with a space,
    lowercases, and drops English stopwords. Returns a list of word tokens.
    """
    text = BeautifulSoup(data, 'lxml').get_text()          # remove markup
    letters_only = re.sub("[^a-zA-Z]", " ", text)          # keep letters only (digits are dropped too)
    tokens = letters_only.lower().split()
    return [word for word in tokens if word not in stop_words]
# Convert every review in place into a gensim TaggedDocument so doc2vec can
# train on it. Each document gets a unique tag '<set>_<index>' used later to
# look up its learned vector. The five near-identical loops are collapsed
# into one data-driven loop.
tagged_sets = [
    (train_pos, 'train_pos', labeled_set_size),
    (train_neg, 'train_neg', labeled_set_size),
    (test_pos, 'test_pos', labeled_set_size),
    (test_neg, 'test_neg', labeled_set_size),
    (train_unsup, 'train_unsup', unlabeled_set_size),
]
for reviews, tag_prefix, set_size in tagged_sets:
    for i in xrange(set_size):
        reviews[i] = TaggedDocument(parse_html(reviews[i]), [tag_prefix + '_' + str(i)])

# Sanity check: set sizes plus the first 8 tokens of one document from each set.
print('%i positive train reviews:' % len(train_pos))
print(train_pos[0][0][:8])
print('\n%i negative train reviews:' % len(train_neg))
print(train_neg[0][0][:8])
print('\n%i unlabeled train reviews:' % len(train_unsup))
print(train_unsup[0][0][:8])
print('\n%i positive test reviews:' % len(test_pos))
print(test_pos[0][0][:8])
print('\n%i negative test reviews:' % len(test_neg))
print(test_neg[0][0][:8])
In [4]:
# Embed documents using doc2vec
# Load a previously trained model from disk if present; otherwise train from
# scratch on the full corpus (labeled train + unlabeled + test reviews —
# legitimate here because doc2vec training is unsupervised and never sees labels).
if os.path.isfile('d2v'):
    d2v = Doc2Vec.load('d2v')
else:
    workers = 8 # Number of virtual CPU cores on machine
    window = 16 # Skip-gram window
    min_count = 30 # 30 is the max number of reviews per movie in the dataset
    all_reviews = train_pos + train_neg + train_unsup + test_pos + test_neg
    # NOTE(review): vector size is left at the gensim default; the downstream
    # cell assumes 300-dimensional vectors — confirm the installed gensim's
    # Doc2Vec default size is 300, or pass it explicitly.
    d2v = Doc2Vec(window=window, workers=workers, min_count=min_count)
    d2v.build_vocab(all_reviews)
    # 10 manual passes over the corpus, reshuffling document order each pass.
    for i in range(10):
        shuffle(all_reviews) # Make sure to shuffle each epoch
        # NOTE(review): calling train() repeatedly with no arguments assumes an
        # older gensim API; newer gensim requires total_examples/epochs and the
        # learning rate (alpha) is normally decayed across epochs — verify
        # against the installed gensim version.
        d2v.train(all_reviews)
        print 'epoch %i complete' % (i + 1)
    d2v.save('d2v')
In [5]:
# Examine embedding: nearest neighbours in the learned word-vector space are
# a quick sanity check that the model picked up meaningful semantics.
man_neighbours = d2v.most_similar('man')[:3]
movie_neighbours = d2v.most_similar('movie')[:3]
print('Most similar to man: ')
print(man_neighbours)
print('\nMost similar to movie: ')
print(movie_neighbours)
In [12]:
# Get train and test embedded vectors for classification.
# assumes the trained doc2vec vectors are 300-dimensional — confirm against
# the model's configured size (a mismatch would raise on row assignment).
x_vector_size = 300
y_vector_size = 2  # number of sentiment classes (unused below; labels are 1-D 0/1 — kept for compatibility)
# np.empty is the documented way to allocate an uninitialised array
# (np.ndarray(shape) is the low-level constructor); every row is filled below.
train_x = np.empty([2 * labeled_set_size, x_vector_size])
test_x = np.empty([2 * labeled_set_size, x_vector_size])
# Labels: first half positive (1), second half negative (0).
train_y = np.concatenate((np.ones(labeled_set_size), np.zeros(labeled_set_size)))
test_y = np.concatenate((np.ones(labeled_set_size), np.zeros(labeled_set_size)))
# A single loop fills both halves (the two original loops had identical ranges):
# positives occupy rows [0, N), negatives rows [N, 2N).
for i in xrange(labeled_set_size):
    train_x[i] = d2v.docvecs['train_pos_' + str(i)]
    test_x[i] = d2v.docvecs['test_pos_' + str(i)]
    train_x[i + labeled_set_size] = d2v.docvecs['train_neg_' + str(i)]
    test_x[i + labeled_set_size] = d2v.docvecs['test_neg_' + str(i)]
# Shuffle data so positive/negative examples are interleaved for training.
# NOTE(review): this permutation is unseeded, so runs are not reproducible.
shuffled_i = np.random.permutation(np.arange(len(train_x)))
train_x = train_x[shuffled_i]
train_y = train_y[shuffled_i]
In [23]:
# Train and test logistic regression classifier on the doc2vec features.
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(train_x, train_y)  # fit() returns self, so the model object is the same
test_accuracy = logistic_regression_model.score(test_x, test_y)
print('Test set accuracy: ' + '{:.2%}'.format(test_accuracy))