We build a binary sentiment classifier on the dataset provided by the University of Michigan for the Kaggle UMICH SI650 - Sentiment Classification competition. We use only the training data, since the test data is unlabeled and cannot be evaluated locally.
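Each line of the training file is tab-separated: a binary sentiment label (1 for positive, 0 for negative) followed by the sentence. The two lines below are illustrative, not verbatim from the dataset:

1	I loved the Da Vinci Code.
0	I hate Harry Potter.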
In [1]:
from __future__ import division, print_function
from keras.layers import Activation, Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import nltk
import numpy as np
In [2]:
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
ftrain = open("../data/umich-sentiment-train.txt", "rb")
for line in ftrain:
    label, sentence = line.strip().split(b"\t")
    words = nltk.word_tokenize(sentence.decode("ascii", "ignore").lower())
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        word_freqs[word] += 1
    num_recs += 1
ftrain.close()
# print some statistics about the data; these numbers drive the hyperparameters below
print("maxlen: %d, vocab size: %d" % (maxlen, len(word_freqs)))
In [3]:
MAX_FEATURES = 2000        # vocabulary size, including the PAD and UNK entries
MAX_SENTENCE_LENGTH = 40   # pad / truncate every sentence to this many tokens
In [4]:
# special indices: PAD = 0 (the value pad_sequences pads with) and UNK = 1;
# both must be valid non-negative indices into the embedding matrix
vocab = {"PAD": 0, "UNK": 1}
reverse_vocab = {v: k for k, v in vocab.items()}
for idx, word in enumerate([w[0] for w in word_freqs.most_common(MAX_FEATURES - 2)]):
    vocab[word] = idx + 2
    reverse_vocab[idx + 2] = word
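As a quick sanity check, anything outside the kept vocabulary should fall back to the UNK index (the made-up token below is just a stand-in for an out-of-vocabulary word):

print(vocab.get("supercalifragilistic", vocab["UNK"]))  # 1, i.e. UNK
print(reverse_vocab[0], reverse_vocab[1])                # PAD UNK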
In [5]:
X = np.empty((num_recs, ), dtype=list)
y = np.zeros((num_recs, ))
i = 0
ftrain = open("../data/umich-sentiment-train.txt", "rb")
for line in ftrain:
    label, sentence = line.strip().split(b"\t")
    words = nltk.word_tokenize(sentence.decode("ascii", "ignore").lower())
    seqs = []
    for word in words:
        seqs.append(vocab.get(word, vocab["UNK"]))
    X[i] = seqs
    y[i] = int(label)
    i += 1
ftrain.close()
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)
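For reference, pad_sequences left-pads (and left-truncates) with zeros by default, which is why index 0 is reserved for PAD; a toy example:

print(sequence.pad_sequences([[5, 8, 2]], maxlen=5))
# [[0 0 5 8 2]]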
In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
In [7]:
model = Sequential()
model.add(Embedding(MAX_FEATURES, 128, input_length=MAX_SENTENCE_LENGTH))
model.add(SpatialDropout1D(0.2))  # dropout on the embedding output (replaces the Keras 1 Embedding dropout argument)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
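Before training it can help to double-check the wiring (output shapes and parameter counts); Keras prints these with:

model.summary()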
In [8]:
model.fit(Xtrain, ytrain, batch_size=32, epochs=10, validation_data=(Xtest, ytest))
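As an aside, fit() returns a History object whose .history dict holds the per-epoch metrics, so the learning curves can be plotted. A minimal sketch, assuming matplotlib is installed and the call above is changed to history = model.fit(...):

import matplotlib.pyplot as plt

history = model.fit(Xtrain, ytrain, batch_size=32, epochs=10,
                    validation_data=(Xtest, ytest))
# depending on the Keras version, the keys may be "accuracy" / "val_accuracy"
plt.plot(history.history["acc"], label="train accuracy")
plt.plot(history.history["val_acc"], label="validation accuracy")
plt.xlabel("epoch")
plt.legend()
plt.show()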
In [9]:
loss, accuracy = model.evaluate(Xtest, ytest, batch_size=32)
print("loss on test set: %.3f, accuracy: %.3f" % (loss, accuracy))
In [10]:
random_idxs = np.random.randint(0, Xtest.shape[0], 10)
for i in range(random_idxs.shape[0]):
    xtest = Xtest[random_idxs[i]].reshape(1, MAX_SENTENCE_LENGTH)
    ylabel = ytest[random_idxs[i]]  # label must come from the same random index as the input
    ypred = model.predict(xtest)[0][0]
    sent_pred = " ".join([reverse_vocab[x] for x in xtest[0].tolist() if x != 0])
    print("%.3f\t%d\t%s" % (ypred, ylabel, sent_pred))