In [1]:
%matplotlib inline
import codecs
import logging
import numpy as np
from word2vec.word2vecReader import Word2Vec
from preprocessing import preprocess_tweet
from nltk import word_tokenize
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from lasagne import layers
from lasagne.updates import nesterov_momentum
from lasagne.nonlinearities import tanh, softmax
from nolearn.lasagne import NeuralNet
import matplotlib.pyplot as plt
import sys
In [2]:
f = codecs.open('../data/positive-all', 'r', 'utf-8')
positive = {l.strip() for l in f}
f.close()
f = codecs.open('../data/negative-all', 'r', 'utf-8')
negative = {l.strip() for l in f}
f.close()
f = codecs.open('../data/neutral-all', 'r', 'utf-8')
neutral = {l.strip() for l in f}
f.close()
print 'Number of positives = %d' % len(positive)
print 'Number of negatives = %d' % len(negative)
print 'Number of neutrals = %d' % len(neutral)
In [3]:
model_path = '../models/word2vec_twitter_model.bin'
w2v = Word2Vec.load_word2vec_format(model_path, binary=True)
print "Loaded the model with layer size: %d and %d vocabulary size." % (w2v.layer1_size, len(w2v.vocab))
In [4]:
import preprocessing
reload(preprocessing)
positive_counts = dict()
negative_counts = dict()
neutral_counts = dict()
for sent in positive:
    for w in preprocessing.preprocess_tweet(sent):
        if w not in positive_counts: positive_counts[w] = 0
        positive_counts[w] += 1
for sent in negative:
    for w in preprocessing.preprocess_tweet(sent):
        if w not in negative_counts: negative_counts[w] = 0
        negative_counts[w] += 1
for sent in neutral:
    for w in preprocessing.preprocess_tweet(sent):
        if w not in neutral_counts: neutral_counts[w] = 0
        neutral_counts[w] += 1
print 'Vocab size for positives: %d' % (len(positive_counts))
print 'Vocab size for negatives: %d' % (len(negative_counts))
print 'Vocab size for neutrals : %d' % (len(neutral_counts))
N = 50
print 'Top %d positive words:' % N
for w in sorted(positive_counts.keys(), key=positive_counts.get, reverse=True)[:N]:
    print '\t%s : %d' % (w, positive_counts[w])
print 'Top %d neutral words:' % N
for w in sorted(neutral_counts.keys(), key=neutral_counts.get, reverse=True)[:N]:
    print '\t%s : %d' % (w, neutral_counts[w])
print 'Top %d negative words:' % N
for w in sorted(negative_counts.keys(), key=negative_counts.get, reverse=True)[:N]:
    print '\t%s : %d' % (w, negative_counts[w])
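The counting loops above can also be written more compactly with collections.Counter; an equivalent sketch for the positive set only (most_common returns the top-N pairs directly):
from collections import Counter
positive_counts = Counter(w for sent in positive
                          for w in preprocessing.preprocess_tweet(sent))
for w, c in positive_counts.most_common(N):
    print '\t%s : %d' % (w, c)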
In [5]:
positive_features = np.zeros((len(positive), w2v.layer1_size), dtype=np.float32)
negative_features = np.zeros((len(negative), w2v.layer1_size), dtype=np.float32)
neutral_features = np.zeros((len(neutral) , w2v.layer1_size), dtype=np.float32)
# the word2vec model's vocabulary is large enough that we don't really need to
# normalise the tweets before passing them to the model; tokenising is enough...
for i, sentence in enumerate(positive):
    sent_vec = w2v.get_sentence_vec(word_tokenize(sentence))
    positive_features[i,] = sent_vec
for i, sentence in enumerate(negative):
    sent_vec = w2v.get_sentence_vec(word_tokenize(sentence))
    negative_features[i,] = sent_vec
for i, sentence in enumerate(neutral):
    sent_vec = w2v.get_sentence_vec(word_tokenize(sentence))
    neutral_features[i,] = sent_vec
del w2v # we're finished with it for now
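get_sentence_vec comes from the bundled word2vecReader and isn't shown here; assuming it averages the vectors of the in-vocabulary tokens, a rough stand-in (a hypothetical helper, not the actual implementation) would look like:
def average_sentence_vec(w2v, tokens):
    # hypothetical stand-in: mean of the known word vectors, zeros if none are in vocab
    vecs = [w2v[w] for w in tokens if w in w2v.vocab]
    if not vecs:
        return np.zeros(w2v.layer1_size, dtype=np.float32)
    return np.mean(vecs, axis=0)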
In [6]:
# we'll ignore the imbalance of the classes for now and see what happens
# choose 500 samples from each class to be included in the test set
num_test_samples_per_class = 500
pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)
X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))
# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test = shuffle(X_test , Y_test , random_state=111)
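The split / stack / label / shuffle recipe above is repeated for every feature set in this notebook; a small helper capturing the same steps (a sketch only, the cells below keep the explicit version):
def build_dataset(pos_feats, neg_feats, neu_feats, n_test=500, split_seed=22, shuffle_seed=111):
    # hold out n_test rows per class; label positives 1, negatives -1, neutrals 0
    pos_tr, pos_te = train_test_split(pos_feats, test_size=n_test, random_state=split_seed)
    neg_tr, neg_te = train_test_split(neg_feats, test_size=n_test, random_state=split_seed)
    neu_tr, neu_te = train_test_split(neu_feats, test_size=n_test, random_state=split_seed)
    X_tr = np.vstack((pos_tr, neg_tr, neu_tr))
    X_te = np.vstack((pos_te, neg_te, neu_te))
    y_tr = np.hstack((np.ones(len(pos_tr)), -np.ones(len(neg_tr)), np.zeros(len(neu_tr)))).astype(np.float32)
    y_te = np.hstack((np.ones(len(pos_te)), -np.ones(len(neg_te)), np.zeros(len(neu_te)))).astype(np.float32)
    X_tr, y_tr = shuffle(X_tr, y_tr, random_state=shuffle_seed)
    X_te, y_te = shuffle(X_te, y_te, random_state=shuffle_seed)
    return X_tr, y_tr, X_te, y_te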
In [7]:
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_train, Y_train, cv=3, verbose=1, n_jobs=8)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
model.fit(X_train, Y_train)
Out[7]:
In [6]:
Y_pred = model.predict(X_test)
print classification_report(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
print cm
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)
ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
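The matshow confusion-matrix plot is redrawn identically after every model below; a helper sketch bundling those calls (the later cells keep the inline version):
def plot_confusion_matrix(cm, labels=('-1', '0', '1'), title='Confusion Matrix'):
    # same matshow + colorbar recipe used throughout the notebook
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    fig.colorbar(cax)
    ax.set_xticklabels([''] + list(labels))
    ax.set_yticklabels([''] + list(labels))
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()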
In [7]:
pca = PCA(n_components=300, whiten=True)
pca.fit(X_train)
X_transformed = pca.transform(X_train)
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_transformed, Y_train, cv=3, verbose=1)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
model.fit(X_transformed, Y_train)
Out[7]:
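A quick way to check how much variance the 300 whitened components retain (explained_variance_ratio_ is available after fitting the PCA):
print 'Variance retained by %d components: %.3f' % (pca.n_components, pca.explained_variance_ratio_.sum())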
In [8]:
Y_pred = model.predict(pca.transform(X_test))
print classification_report(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
print cm
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)
ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [13]:
# first, a callback class that remembers the best weights seen so far and
# stops training early to prevent overfitting
class EarlyStopping(object):
    def __init__(self, patience):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        current_valid = train_history[-1]['valid_loss']
        current_epoch = train_history[-1]['epoch']
        if current_valid < self.best_valid:
            self.best_valid = current_valid
            self.best_valid_epoch = current_epoch
            self.best_weights = nn.get_all_params_values()
        elif self.best_valid_epoch + self.patience < current_epoch:
            print "Early stopping."
            print "Best valid loss was {:.6f} at epoch {}.".format(
                self.best_valid, self.best_valid_epoch)
            sys.stdout.flush()
            nn.load_params_from(self.best_weights)
            raise StopIteration()

# another helper class to adjust the learning rate and the momentum
class AdjustVariable(object):
    def __init__(self, name, stop, decrement=0.0001, increment=None):
        self.name = name
        self.stop = stop
        self.decrement = decrement
        self.increment = increment

    def __call__(self, nn, train_history):
        epoch = train_history[-1]['epoch']
        if self.increment:
            new_value = min(getattr(nn, self.name) + self.increment, self.stop)
        else:
            new_value = max(getattr(nn, self.name) - self.decrement, self.stop)
        nn.__dict__[self.name] = np.cast['float32'](new_value)
model = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('hidden1', layers.DenseLayer),
        ('dropout1', layers.DropoutLayer),
        ('hidden2', layers.DenseLayer),
        ('dropout2', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 400),
    hidden1_num_units=1000, dropout1_p=0.5,
    hidden2_num_units=1000, dropout2_p=0.5,
    output_nonlinearity=tanh,
    output_num_units=1,
    # optimization method:
    regression=True,
    update=nesterov_momentum,
    update_learning_rate=0.01,
    update_momentum=0.9,
    on_epoch_finished=[
        AdjustVariable('update_learning_rate', stop=0.0001, decrement=0.00001),
        AdjustVariable('update_momentum', stop=0.999, increment=0.0001),
        EarlyStopping(patience=100)
    ],
    max_epochs=1000,
    eval_size=0.1,
    verbose=1
)
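# note: with regression=True and a single tanh output unit the network
# regresses the -1/0/+1 labels directly; the continuous predictions are
# thresholded at +/-0.33 further down to recover the three classes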
model.fit(np.asarray(X_train, dtype=np.float32), np.asarray(Y_train, dtype=np.float32))
Out[13]:
In [31]:
Y_pred = model.predict(np.asarray(X_test, dtype=np.float32)).ravel()  # flatten the (N, 1) regression output
# let's threshold the continuous values to get the classes
pos = Y_pred >= 0.33
neg = Y_pred <= -0.33
neu = np.logical_and(Y_pred < 0.33, Y_pred > -0.33)
Y_pred[pos] = 1
Y_pred[neg] = -1
Y_pred[neu] = 0
print classification_report(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
print cm
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)
ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
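The same class recovery can be done in one vectorised step; an equivalent sketch of the thresholding block using np.select:
Y_pred = np.select([Y_pred >= 0.33, Y_pred <= -0.33], [1, -1], default=0)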
In [7]:
import features
reload(features)
positive_features = np.zeros((len(positive), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
negative_features = np.zeros((len(negative), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
neutral_features = np.zeros((len(neutral) , features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
for i, sentence in enumerate(positive):
    sent_vec = features.get_linguistic_features(sentence)
    positive_features[i,] = sent_vec
for i, sentence in enumerate(negative):
    sent_vec = features.get_linguistic_features(sentence)
    negative_features[i,] = sent_vec
for i, sentence in enumerate(neutral):
    sent_vec = features.get_linguistic_features(sentence)
    neutral_features[i,] = sent_vec
# we'll ignore the imbalance of the classes for now and see what happens
# choose 500 samples from each class to be included in the test set
num_test_samples_per_class = 500
pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)
X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))
# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test = shuffle(X_test , Y_test , random_state=111)
In [8]:
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_train, Y_train, cv=3, verbose=1, n_jobs=8)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
model.fit(X_train, Y_train)
Out[8]:
In [34]:
print model.coef_
Y_pred = model.predict(X_test)
print classification_report(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
print cm
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)
ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [38]:
model_path = '../models/word2vec_twitter_model.bin'
w2v = Word2Vec.load_word2vec_format(model_path, binary=True)
import features
reload(features)
num_features = w2v.layer1_size + features.NUM_LINGUISTIC_FEATURES
positive_features = np.zeros((len(positive), num_features), dtype=np.float32)
negative_features = np.zeros((len(negative), num_features), dtype=np.float32)
neutral_features = np.zeros((len(neutral) , num_features), dtype=np.float32)
for i, sentence in enumerate(positive):
    sent_vec = features.get_features(w2v, sentence)
    positive_features[i,] = sent_vec
for i, sentence in enumerate(negative):
    sent_vec = features.get_features(w2v, sentence)
    negative_features[i,] = sent_vec
for i, sentence in enumerate(neutral):
    sent_vec = features.get_features(w2v, sentence)
    neutral_features[i,] = sent_vec
del w2v
num_test_samples_per_class = 500
pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)
X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))
# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test = shuffle(X_test , Y_test , random_state=111)
In [39]:
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_train, Y_train, cv=3, verbose=1, n_jobs=8)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
model.fit(X_train, Y_train)
Out[39]:
In [40]:
Y_pred = model.predict(X_test)
print classification_report(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
print cm
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)
ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [18]:
w2v_outputs = model.predict(X_train)
# convert the outputs to 3 indicator (i.e. binary) features
mlb = MultiLabelBinarizer()
w2v_outputs = mlb.fit_transform([(x,) for x in w2v_outputs.tolist()])
print list(mlb.classes_)
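Since each prediction here is a single label, sklearn's LabelBinarizer produces the same indicator matrix without the singleton-tuple wrapping; an equivalent sketch starting from the raw predictions:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
w2v_outputs = lb.fit_transform(model.predict(X_train))  # columns follow lb.classes_, i.e. [-1, 0, 1]
print list(lb.classes_)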
In [23]:
# compute the linguistic features again
import features
reload(features)
positive_features = np.zeros((len(positive), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
negative_features = np.zeros((len(negative), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
neutral_features = np.zeros((len(neutral) , features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
for i, sentence in enumerate(positive):
    sent_vec = features.get_linguistic_features(sentence)
    positive_features[i,] = sent_vec
for i, sentence in enumerate(negative):
    sent_vec = features.get_linguistic_features(sentence)
    negative_features[i,] = sent_vec
for i, sentence in enumerate(neutral):
    sent_vec = features.get_linguistic_features(sentence)
    neutral_features[i,] = sent_vec
# we'll ignore the imbalance of the classes for now and see what happens
# choose 500 samples from each class to be included in the test set
num_test_samples_per_class = 500
pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)
X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))
# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test = shuffle(X_test , Y_test , random_state=111)
In [20]:
# now combine the features and train a new classifier
X_train = np.hstack((
w2v_outputs,
X_train
))
ling_model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(ling_model, X_train, Y_train, cv=3, verbose=1)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
ling_model.fit(X_train, Y_train)
print ling_model.coef_
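To score the stacked classifier on the held-out data, the word2vec-based predictions for the test split need the same encoding; a sketch, assuming the combined-feature test predictions were saved (e.g. w2v_test_outputs = model.predict(X_test)) before In [23] rebuilt X_test with the linguistic features only:
# hypothetical: w2v_test_outputs holds the combined-feature model's predictions
# on the original test matrix, captured before X_test was overwritten
w2v_test_outputs = mlb.transform([(x,) for x in w2v_test_outputs.tolist()])
X_test_stacked = np.hstack((w2v_test_outputs, X_test))
Y_pred = ling_model.predict(X_test_stacked)
print classification_report(Y_test, Y_pred)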