Let's build a model to predict the sentiment of a movie review (Positive or Negative)
@sunilmallya
@jrhunt
Dataset and use case credit to David Ping
For the easiest deep learning environment, I recommend using the AWS Deep Learning AMI; please read this post on the AWS AI Blog for detailed instructions.
In [4]:
import mxnet as mx
import numpy as np
#https://keras.io/preprocessing/text/
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
The dataset can be downloaded from http://ai.stanford.edu/~amaas/data/sentiment/. We will process the unzipped raw reviews into training and testing datasets for training and validation purposes.
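If you don't have the dataset locally yet, here is a minimal sketch (assuming Python 3 and the standard aclImdb_v1.tar.gz archive name from the Stanford page) that downloads and extracts it into the aclImdb directory used below.
In [ ]:
import os
import tarfile
import urllib.request
# Download and unpack the IMDB archive into the current directory,
# producing the aclImdb/ folder the next cells expect.
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
archive = 'aclImdb_v1.tar.gz'
if not os.path.exists('aclImdb'):
    urllib.request.urlretrieve(url, archive)
    with tarfile.open(archive, 'r:gz') as tar:
        tar.extractall()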
In [1]:
# Load the data
import sys
import os
path = 'aclImdb'
files = [path + '/train/pos/' + f for f in os.listdir(path + '/train/pos/')] + \
        [path + '/train/neg/' + f for f in os.listdir(path + '/train/neg/')] + \
        [path + '/test/pos/' + f for f in os.listdir(path + '/test/pos/')] + \
        [path + '/test/neg/' + f for f in os.listdir(path + '/test/neg/')]
In [2]:
import re
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

input_text = []
for fname in files:
    with open(fname) as f:
        input_text += [remove_tags(" ".join(f.readlines()))]
In [5]:
num_words = 10000
tok = Tokenizer(num_words)
tok.fit_on_texts(input_text[:25000])
In [6]:
# Create Training & test data; input data (X) and Labels (Y)
# Labels
input_label = ([1] * 12500 + [0] * 12500) * 2
# Words will be replaced with their vocabulary index
X_train = tok.texts_to_sequences(input_text[:25000])
X_test = tok.texts_to_sequences(input_text[25000:])
y_train = input_label[:25000]
y_test = input_label[25000:]
In [7]:
#MAX Review Length
MAX_LENGTH = 500
X_train = pad_sequences(X_train, maxlen=MAX_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
#http://mxnet.io/api/python/io.html#mxnet.io.NDArrayIter
batch_size = 250
train_iter = mx.io.NDArrayIter(X_train, y_train, batch_size, shuffle=True)
test_iter = mx.io.NDArrayIter(X_test, y_test, batch_size, shuffle=True)
In [9]:
data = mx.sym.Variable('data')
target = mx.sym.Variable('softmax_label') # placeholder for label
emb = mx.sym.Embedding(data=data, input_dim=num_words, output_dim=32, name='embed')
# MLP only accepts 1D vector, hence flatten
f_data = mx.sym.Flatten(data=emb, name='flatten')
fc1 = mx.sym.FullyConnected(data=f_data, num_hidden=250)
act1 = mx.sym.Activation(data=fc1, act_type="relu")
fc2 = mx.sym.FullyConnected(data=act1, num_hidden=2)
mlp = mx.sym.SoftmaxOutput(data=fc2, label=target, name='softmax')
# Let's visualize the network
#mx.viz.plot_network(mlp)
In [11]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
num_epoch = 10
ctx = [mx.gpu(i) for i in range(1)]  # train on a single GPU; widen the range to use more GPUs
mlp_model = mx.mod.Module(symbol=mlp, context=ctx)
mlp_model.fit(train_iter,
              eval_data=test_iter,
              optimizer="adam",  # use the Adam optimizer to train
              optimizer_params={'learning_rate': 0.01},
              eval_metric='acc',
              batch_end_callback=mx.callback.Speedometer(batch_size, 100),
              num_epoch=num_epoch)
In [12]:
metric = mx.metric.Accuracy()
mlp_model.score(test_iter, metric)
Out[12]:
In [13]:
# Save the model
prefix = "twitch_imdb"
mlp_model.save_checkpoint(prefix, num_epoch)
In [14]:
prefix = "twitch_imdb"
pred_model = mx.mod.Module.load(prefix, num_epoch, False)
# We load the model for only forward pass, so for_training=False
# Set the data shape for 1 single batch example of size (1,500) => (batch_size, MAX_LENGTH)
pred_model.bind(for_training=False, data_shapes=[('data', (1, MAX_LENGTH))])
In [15]:
# Remember we need the input to test in the same format as we trained
def prepare_imdb_list(text, maxlen=500, vocabsize=10000):
    # the word index of the vocabulary we built earlier
    imdb_word_index = tok.word_index
    sentence = []
    sentence.append(str(text))
    # tokenize the input sentence
    tokens = Tokenizer()
    tokens.fit_on_texts(sentence)
    # get a list of words from the encoding
    words = []
    for i in range(len(tokens.word_index)):
        words += [key for key, value in tokens.word_index.items() if value == i + 1]
    imdb_seq = []
    for w in words:
        # skip words that are not in the IMDB vocabulary
        if w not in imdb_word_index:
            continue
        idx = imdb_word_index[w]
        if idx < vocabsize:
            imdb_seq.append(idx)
    new_list = []
    new_list.append(imdb_seq)
    new_list = pad_sequences(new_list, maxlen=maxlen)
    return new_list
def predict_sentiment(model, text_nd):
    sentence_Iter = mx.io.NDArrayIter(text_nd, batch_size=1)
    pred = model.predict(sentence_Iter)
    return pred
def handle_submit(sender):
    text_nd = prepare_imdb_list(inputtext.value)
    pred = predict_sentiment(pred_model, text_nd)
    outputlabel_0.value = 'Probability for negative sentiment (0): %0.4f ' % pred.asnumpy()[0:1, 0]
    outputlabel_1.value = 'Probability for positive sentiment (1): %0.4f ' % pred.asnumpy()[0:1, 1]
In [17]:
from IPython.display import display
import ipywidgets as widgets
inputtext = widgets.Textarea()
display(inputtext)
inputbutton = widgets.Button(description='Predict Sentiment')
display(inputbutton)
outputlabel_0 = widgets.HTML()
outputlabel_1 = widgets.HTML()
display(outputlabel_0)
display(outputlabel_1)
inputbutton.on_click(handle_submit)
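For a quick sanity check without the widget UI, here is a small usage sketch (the sample review text is made up) that calls the helper functions above directly.
In [ ]:
# Score a single hand-written review using the loaded prediction module
sample_review = "This movie was absolutely wonderful, I loved every minute of it"
sample_nd = prepare_imdb_list(sample_review)
sample_pred = predict_sentiment(pred_model, sample_nd).asnumpy()
print('Negative: %0.4f, Positive: %0.4f' % (sample_pred[0, 0], sample_pred[0, 1]))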
The following section uses the Gluon API, which requires the latest MXNet build. Make sure you install it by running
pip install mxnet --pre
Gluon Tutorial: https://github.com/zackchase/mxnet-the-straight-dope
In [ ]:
import mxnet as mx
from mxnet import nd, autograd
from mxnet import gluon
import numpy as np
ctx = mx.cpu()
num_outputs = 2
num_hidden = 256
net = gluon.nn.Sequential()
with net.name_scope():
    net.add(gluon.nn.Embedding(num_words, 32, weight_initializer=mx.init.Uniform(0.1)))  # same 10k-word vocabulary as above
    net.add(gluon.nn.Dense(num_hidden, activation="relu"))
    net.add(gluon.nn.Dropout(0.5))
    net.add(gluon.nn.Dense(num_hidden, activation="relu"))
    net.add(gluon.nn.Dense(num_outputs))

net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': .01})

def evaluate_accuracy(data_iterator, net, shape_n=500):
    acc = mx.metric.Accuracy()
    data_iterator.reset()
    for i, batch in enumerate(data_iterator):
        data = batch.data[0].as_in_context(ctx).reshape((-1, shape_n))
        label = batch.label[0].as_in_context(ctx)
        output = net(data)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]
In [ ]:
epochs = 10
moving_loss = 0.

for e in range(epochs):
    train_iter.reset()
    for i, batch in enumerate(train_iter):
        data = batch.data[0].as_in_context(ctx).reshape((-1, 500))
        label = batch.label[0].as_in_context(ctx)
        with autograd.record():
            output = net(data)
            cross_entropy = loss(output, label)
        cross_entropy.backward()
        trainer.step(data.shape[0], ignore_stale_grad=True)

        ##########################
        # Keep a moving average of the losses
        ##########################
        if i == 0:
            moving_loss = nd.mean(cross_entropy).asscalar()
        else:
            moving_loss = .99 * moving_loss + .01 * nd.mean(cross_entropy).asscalar()

    test_accuracy = evaluate_accuracy(test_iter, net)
    train_accuracy = evaluate_accuracy(train_iter, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))
In [18]:
# Plot the loss values from the training log above and see the curve
content = '''Epoch 0. Loss: 0.812929830771, Train_acc 0.78576, Test_acc 0.7498
Epoch 1. Loss: 0.404830086653, Train_acc 0.95528, Test_acc 0.86884
Epoch 2. Loss: 0.194560304798, Train_acc 0.9746, Test_acc 0.8424
Epoch 3. Loss: 0.0949568697476, Train_acc 0.98632, Test_acc 0.85032
Epoch 4. Loss: 0.0540652401239, Train_acc 0.99656, Test_acc 0.84576
Epoch 5. Loss: 0.0192620222058, Train_acc 0.99904, Test_acc 0.86436
Epoch 6. Loss: 0.0256203386385, Train_acc 0.99788, Test_acc 0.85636
Epoch 7. Loss: 0.0162229706022, Train_acc 0.99724, Test_acc 0.85216
Epoch 8. Loss: 0.0194205419608, Train_acc 0.9978, Test_acc 0.84712
Epoch 9. Loss: 0.0193563111006, Train_acc 0.99904, Test_acc 0.85124'''
loss_points = [float(c.split(" ")[3].rstrip(',')) for c in content.split('\n')]
print(loss_points)
import matplotlib.pyplot as plt
plt.plot(loss_points)
plt.show()