Imports


In [1]:
%matplotlib inline
import codecs
import logging

import numpy as np
from word2vec.word2vecReader import Word2Vec
from preprocessing import preprocess_tweet

from nltk import word_tokenize

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from lasagne import layers
from lasagne.updates import nesterov_momentum
from lasagne.nonlinearities import tanh, softmax
from nolearn.lasagne import NeuralNet

import matplotlib.pyplot as plt

import sys


Using gpu device 0: GeForce GT 650M (CNMeM is disabled)

Load the corpus and the model


In [2]:
f = codecs.open('../data/positive-all', 'r', 'utf-8')
positive = {l.strip() for l in f}
f.close()

f = codecs.open('../data/negative-all', 'r', 'utf-8')
negative = {l.strip() for l in f}
f.close()

f = codecs.open('../data/neutral-all', 'r', 'utf-8')
neutral = {l.strip() for l in f}
f.close()

print 'Number of positives = %d' % len(positive)
print 'Number of negatives = %d' % len(negative)
print 'Number of neutrals  = %d' % len(neutral)


Number of positives = 7331
Number of negatives = 9372
Number of neutrals  = 10724

In [3]:
model_path = '../models/word2vec_twitter_model.bin'
w2v = Word2Vec.load_word2vec_format(model_path, binary=True)

print "Loaded the model with layer size: %d and %d vocabulary size." % (w2v.layer1_size, len(w2v.vocab))


Loaded the model with layer size 400 and vocabulary size 3039345.

Let's check word frequencies in each class


In [4]:
import preprocessing
reload(preprocessing)

positive_counts = dict()
negative_counts = dict()
neutral_counts  = dict()

for sent in positive:
    for w in preprocessing.preprocess_tweet(sent):
        if w not in positive_counts: positive_counts[w] = 0
        positive_counts[w] += 1

for sent in negative:
    for w in preprocessing.preprocess_tweet(sent):
        if w not in negative_counts: negative_counts[w] = 0
        negative_counts[w] += 1

for sent in neutral:
    for w in preprocessing.preprocess_tweet(sent):
        if w not in neutral_counts: neutral_counts[w] = 0
        neutral_counts[w] += 1

print 'Vocab size for positives: %d' % (len(positive_counts))
print 'Vocab size for negatives: %d' % (len(negative_counts))
print 'Vocab size for neutrals : %d' % (len(neutral_counts))

N = 50
print 'Top %d positive words:' % N
for w in sorted(positive_counts.keys(), key=positive_counts.get, reverse=True)[:N]:
    print '\t%s : %d' % (w, positive_counts[w])

print 'Top %d neutral words:' % N
for w in sorted(neutral_counts.keys(), key=neutral_counts.get, reverse=True)[:N]:
    print '\t%s : %d' % (w, neutral_counts[w])

print 'Top %d negative words:' % N
for w in sorted(negative_counts.keys(), key=negative_counts.get, reverse=True)[:N]:
    print '\t%s : %d' % (w, negative_counts[w])


Vocab size for positives: 8776
Vocab size for negatives: 10529
Vocab size for neutrals : 12070
Top 50 positive words:
	|||HASHTAG||| : 8972
	|||MENTION||| : 5478
	! : 4627
	i : 2183
	|||DIGIT||| : 1570
	you : 1210
	ipad : 912
	it : 909
	my : 859
	google : 777
	apple : 768
	not : 709
	? : 678
	up : 575
	store : 557
	so : 543
	me : 515
	thanks : 497
	just : 487
	iphone : 456
	new : 454
	quot : 453
	go : 440
	app : 403
	out : 385
	thank : 382
	great : 376
	amp : 367
	your : 359
	lineup : 358
	see : 323
	all : 323
	coachella : 311
	get : 308
	now : 303
	will : 303
	we : 296
	line : 295
	flight : 269
	but : 264
	austin : 262
	time : 261
	one : 252
	love : 251
	like : 250
	going : 246
	got : 244
	good : 236
	they : 234
	can : 227
Top 50 neutral words:
	|||HASHTAG||| : 13264
	|||MENTION||| : 8631
	|||DIGIT||| : 3155
	? : 2625
	google : 2216
	i : 1766
	! : 1627
	you : 1327
	apple : 1240
	ipad : 1091
	car : 1069
	quot : 992
	cars : 929
	it : 926
	not : 898
	store : 897
	new : 834
	my : 819
	self-driving : 817
	amp : 759
	iphone : 716
	up : 636
	will : 621
	austin : 567
	driverless : 565
	just : 523
	today : 515
	can : 492
	out : 482
	we : 467
	social : 462
	launch : 459
	your : 457
	me : 447
	driving : 427
	via : 417
	get : 405
	circles : 403
	app : 385
	self : 377
	now : 366
	how : 344
	network : 341
	sxsw : 336
	$ : 326
	flight : 314
	so : 309
	line : 303
	but : 300
	here : 289
Top 50 negative words:
	|||MENTION||| : 9551
	i : 4265
	|||DIGIT||| : 4241
	|||HASHTAG||| : 3796
	not : 3153
	? : 2907
	! : 2579
	you : 2439
	flight : 2412
	my : 2345
	it : 1653
	me : 1276
	your : 1259
	no : 1220
	get : 884
	but : 851
	cancelled : 777
	now : 768
	we : 745
	service : 687
	up : 660
	just : 626
	hours : 609
	so : 592
	they : 574
	hold : 571
	customer : 557
	can : 547
	time : 546
	why : 542
	amp : 489
	help : 473
	still : 469
	out : 456
	when : 453
	plane : 451
	how : 445
	all : 440
	delayed : 432
	one : 424
	phone : 424
	will : 423
	hour : 420
	us : 415
	call : 414
	ca : 408
	flightled : 392
	there : 379
	would : 368
	$ : 366

Build a training set using only the word2vec features


In [5]:
positive_features = np.zeros((len(positive), w2v.layer1_size), dtype=np.float32)
negative_features = np.zeros((len(negative), w2v.layer1_size), dtype=np.float32)
neutral_features  = np.zeros((len(neutral) , w2v.layer1_size), dtype=np.float32)

# the word2vec model's vocabulary is large enough that we don't really need to
# normalise the tweets before passing them to the model; tokenizing is enough

for i, sentence in enumerate(positive):
    sent_vec = w2v.get_sentence_vec(word_tokenize(sentence))
    positive_features[i,] = sent_vec

for i, sentence in enumerate(negative):
    sent_vec = w2v.get_sentence_vec(word_tokenize(sentence))
    negative_features[i,] = sent_vec

for i, sentence in enumerate(neutral):
    sent_vec = w2v.get_sentence_vec(word_tokenize(sentence))
    neutral_features[i,] = sent_vec

del w2v # we're finished with it for now
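
`get_sentence_vec` comes from the word2vecReader wrapper and is not shown here. A minimal sketch of what such a helper typically does (averaging the vectors of the in-vocabulary tokens, with a zero vector as a fallback) might look like the following; treating `w2v.vocab`, `w2v[word]` and `w2v.layer1_size` as a gensim-style interface is an assumption based on how the model is used above:

def mean_sentence_vec(w2v, tokens):
    # average the vectors of the tokens that are in the vocabulary
    vecs = [w2v[t] for t in tokens if t in w2v.vocab]
    if not vecs:
        # nothing matched; fall back to a zero vector of the right size
        return np.zeros(w2v.layer1_size, dtype=np.float32)
    return np.mean(vecs, axis=0).astype(np.float32)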

Split the dataset into train/test


In [6]:
# we'll ignore the imbalance of the classes for now and see what happens
# choose 500 samples from each class to be included in the test set

num_test_samples_per_class = 500

pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)

X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test  = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))

# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test  = shuffle(X_test , Y_test , random_state=111)

Train a classifier on this dataset


In [7]:
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_train, Y_train, cv=3, verbose=1, n_jobs=8)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

model.fit(X_train, Y_train)


[Parallel(n_jobs=8)]: Done   1 jobs       | elapsed:    9.6s
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed:   10.1s finished
Accuracy: 0.76152 [0.75851 - 0.76454]
Out[7]:
LogisticRegression(C=100000.0, class_weight='auto', dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=33, tol=0.0001)

Evaluate the classifier


In [6]:
Y_pred = model.predict(X_test)
print classification_report(Y_test, Y_pred)

cm = confusion_matrix(Y_test, Y_pred)

print cm

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


             precision    recall  f1-score   support

       -1.0       0.83      0.89      0.86       500
        0.0       0.71      0.79      0.75       500
        1.0       0.77      0.64      0.70       500

avg / total       0.77      0.77      0.77      1500

[[444  31  25]
 [ 39 393  68]
 [ 52 129 319]]

Let's see the effect of PCA!


In [7]:
pca = PCA(n_components=300, whiten=True)
pca.fit(X_train)

X_transformed = pca.transform(X_train)

model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_transformed, Y_train, cv=3, verbose=1)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

model.fit(X_transformed, Y_train)


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.2s finished
Accuracy: 0.75905 [0.75811 - 0.76000]
Out[7]:
LogisticRegression(C=100000.0, class_weight='auto', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=33,
          solver='liblinear', tol=0.0001, verbose=0)

In [8]:
Y_pred = model.predict(pca.transform(X_test))
print classification_report(Y_test, Y_pred)

cm = confusion_matrix(Y_test, Y_pred)

print cm

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


             precision    recall  f1-score   support

       -1.0       0.84      0.90      0.86       500
        0.0       0.70      0.77      0.73       500
        1.0       0.77      0.63      0.70       500

avg / total       0.77      0.77      0.76      1500

[[448  30  22]
 [ 42 387  71]
 [ 46 138 316]]

PCA isn't helping.
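
To quantify how much signal the 300 retained components keep, scikit-learn's fitted PCA exposes the per-component explained variance ratio; a quick check (not run here) would be:

# fraction of the total variance retained by the 300 components
print 'Explained variance retained: %0.3f' % pca.explained_variance_ratio_.sum()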

Let's see how a plain feed-forward neural net does on the word2vec features


In [13]:
# first, implement a callback that remembers the best weights seen so far
# and stops training early to prevent overfitting
class EarlyStopping(object):
    def __init__(self, patience):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        current_valid = train_history[-1]['valid_loss']
        current_epoch = train_history[-1]['epoch']
        if current_valid < self.best_valid:
            self.best_valid = current_valid
            self.best_valid_epoch = current_epoch
            self.best_weights = nn.get_all_params_values()
        elif self.best_valid_epoch + self.patience < current_epoch:
            print "Early stopping."
            print "Best valid loss was {:.6f} at epoch {}.".format(
                self.best_valid, self.best_valid_epoch)
            sys.stdout.flush()
            nn.load_params_from(self.best_weights)
            raise StopIteration()

# another helper callback that anneals the learning rate and ramps up the momentum each epoch
class AdjustVariable(object):
    def __init__(self, name, stop, decrement=0.0001, increment=None):
        self.name = name
        self.stop = stop
        self.decrement = decrement
        self.increment = increment

    def __call__(self, nn, train_history):
        epoch = train_history[-1]['epoch']
        if self.increment:
            new_value = min(getattr(nn, self.name) + self.increment, self.stop)
        else:
            new_value = max(getattr(nn, self.name) - self.decrement, self.stop)
        nn.__dict__[self.name] = np.cast['float32'](new_value)
            
model = NeuralNet(
        layers=[
        ('input', layers.InputLayer),
        ('hidden1', layers.DenseLayer),
        ('dropout1', layers.DropoutLayer),
        ('hidden2', layers.DenseLayer),
        ('dropout2', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 400),
    hidden1_num_units=1000, dropout1_p=0.5,
    hidden2_num_units=1000, dropout2_p=0.5,
    
    output_nonlinearity=tanh,
    output_num_units=1,

    # optimization method:
    regression=True,
    update=nesterov_momentum,
    update_learning_rate=0.01,
    update_momentum=0.9,
    
    on_epoch_finished=[
        AdjustVariable('update_learning_rate', stop=0.0001, decrement=0.00001),
        AdjustVariable('update_momentum',      stop=0.999,  increment=0.0001),
        EarlyStopping(patience=100)
    ],
    max_epochs=1000,
    eval_size=0.1,
    verbose=1
)

model.fit(np.asarray(X_train, dtype=np.float32), np.asarray(Y_train, dtype=np.float32))


# Neural Network with 1403001 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input        400
  1  hidden1     1000
  2  dropout1    1000
  3  hidden2     1000
  4  dropout2    1000
  5  output         1

  epoch    train loss    valid loss    train/val  dur
-------  ------------  ------------  -----------  -----
      1       0.49380       0.40578      1.21692  1.96s
      2       0.38043       0.34664      1.09748  1.55s
      3       0.34009       0.31964      1.06398  1.89s
      4       0.31969       0.30434      1.05043  1.54s
      5       0.30566       0.30284      1.00931  1.53s
      6       0.29810       0.29219      1.02025  1.53s
      7       0.28983       0.28547      1.01526  1.52s
      8       0.28221       0.29149      0.96817  1.53s
      9       0.27866       0.28924      0.96344  1.53s
     10       0.27459       0.27711      0.99090  1.53s
     11       0.26981       0.27150      0.99379  1.85s
     12       0.26798       0.27340      0.98015  1.53s
     13       0.26503       0.27075      0.97886  1.52s
     14       0.26122       0.26391      0.98982  1.52s
     15       0.25948       0.26364      0.98419  1.53s
     16       0.25736       0.26489      0.97159  1.52s
     17       0.25323       0.26078      0.97106  1.53s
     18       0.25318       0.26494      0.95562  1.54s
     19       0.25111       0.25669      0.97825  1.53s
     20       0.24783       0.25702      0.96424  1.53s
     21       0.24684       0.25592      0.96453  1.52s
     22       0.24477       0.25569      0.95728  1.53s
     23       0.24308       0.25970      0.93602  1.53s
     24       0.24193       0.25943      0.93255  1.53s
     25       0.23968       0.25722      0.93182  1.52s
     26       0.24027       0.25813      0.93082  1.52s
     27       0.23813       0.25141      0.94715  1.52s
     28       0.23505       0.25482      0.92239  1.52s
     29       0.23405       0.25932      0.90254  1.53s
     30       0.23432       0.25707      0.91150  1.53s
     31       0.23385       0.25513      0.91660  1.57s
     32       0.23177       0.25465      0.91015  1.53s
     33       0.23080       0.25029      0.92212  1.55s
     34       0.22738       0.25171      0.90333  1.54s
     35       0.22920       0.25773      0.88928  1.52s
     36       0.22596       0.25524      0.88528  1.52s
     37       0.22534       0.26729      0.84308  1.52s
     38       0.22516       0.25014      0.90012  1.52s
     39       0.22290       0.25240      0.88309  1.52s
     40       0.22166       0.24887      0.89068  1.52s
     41       0.22097       0.25502      0.86648  1.52s
     42       0.22138       0.25655      0.86290  1.52s
     43       0.21951       0.24699      0.88874  1.52s
     44       0.21784       0.25240      0.86305  1.53s
     45       0.21615       0.24548      0.88050  1.52s
     46       0.21564       0.24915      0.86548  1.52s
     47       0.21472       0.24922      0.86158  1.52s
     48       0.21337       0.24756      0.86189  1.53s
     49       0.21163       0.25159      0.84120  1.52s
     50       0.21233       0.24294      0.87402  1.52s
     51       0.21137       0.25087      0.84254  1.52s
     52       0.21036       0.24598      0.85520  1.52s
     53       0.20986       0.24437      0.85878  1.91s
     54       0.20720       0.24614      0.84182  1.60s
     55       0.20758       0.25518      0.81343  1.53s
     56       0.20722       0.24176      0.85713  1.53s
     57       0.20388       0.25212      0.80867  1.79s
     58       0.20364       0.24804      0.82099  2.22s
     59       0.20399       0.24664      0.82705  1.92s
     60       0.20379       0.24548      0.83019  1.53s
     61       0.20194       0.24898      0.81104  1.53s
     62       0.19944       0.25512      0.78175  1.53s
     63       0.20102       0.24666      0.81495  1.74s
     64       0.19822       0.24576      0.80655  1.83s
     65       0.19598       0.25106      0.78062  1.67s
     66       0.19730       0.25224      0.78219  1.65s
     67       0.19698       0.25125      0.78401  1.77s
     68       0.19574       0.26871      0.72842  1.61s
     69       0.19547       0.24096      0.81122  1.60s
     70       0.19333       0.24477      0.78984  1.56s
     71       0.19274       0.25960      0.74247  1.56s
     72       0.19148       0.25094      0.76306  1.61s
     73       0.19110       0.26300      0.72664  1.58s
     74       0.19048       0.24406      0.78046  1.54s
     75       0.19109       0.24951      0.76585  1.54s
     76       0.18765       0.24123      0.77787  1.61s
     77       0.18442       0.25236      0.73080  1.85s
     78       0.18827       0.24619      0.76474  1.53s
     79       0.18818       0.24238      0.77638  1.52s
     80       0.18304       0.24611      0.74375  1.54s
     81       0.18205       0.24385      0.74657  1.54s
     82       0.18390       0.25332      0.72595  1.52s
     83       0.18352       0.25060      0.73231  1.78s
     84       0.18129       0.25659      0.70656  1.72s
     85       0.18153       0.24494      0.74110  1.54s
     86       0.17746       0.25210      0.70394  1.87s
     87       0.17892       0.24238      0.73815  1.91s
     88       0.17938       0.24051      0.74585  1.91s
     89       0.17853       0.24715      0.72235  1.80s
     90       0.17770       0.24617      0.72188  2.18s
     91       0.17555       0.25220      0.69609  1.72s
     92       0.17615       0.27416      0.64251  1.93s
     93       0.17625       0.25262      0.69768  1.59s
     94       0.17564       0.24860      0.70651  1.54s
     95       0.17135       0.24365      0.70326  1.53s
     96       0.17271       0.24672      0.70002  1.53s
     97       0.17185       0.26230      0.65518  1.96s
     98       0.17107       0.25973      0.65863  2.26s
     99       0.17204       0.24370      0.70598  2.23s
    100       0.17362       0.24109      0.72013  2.08s
    101       0.16672       0.24013      0.69427  1.88s
    102       0.16770       0.24623      0.68109  1.59s
    103       0.16806       0.24516      0.68550  1.57s
    104       0.16914       0.26763      0.63201  1.55s
    105       0.16823       0.23782      0.70739  1.56s
    106       0.16919       0.24767      0.68310  1.53s
    107       0.16577       0.25410      0.65238  1.54s
    108       0.16567       0.23960      0.69144  1.55s
    109       0.16624       0.24283      0.68460  1.63s
    110       0.16231       0.24534      0.66159  2.03s
    111       0.16096       0.25578      0.62930  1.92s
    112       0.16379       0.24517      0.66804  2.24s
    113       0.16018       0.24821      0.64533  1.74s
    114       0.15832       0.23868      0.66330  1.52s
    115       0.15938       0.25311      0.62970  1.53s
    116       0.15867       0.23825      0.66596  1.53s
    117       0.15680       0.24709      0.63460  1.53s
    118       0.15817       0.24839      0.63677  1.52s
    119       0.15820       0.24760      0.63893  1.53s
    120       0.15716       0.24079      0.65268  1.52s
    121       0.15601       0.25636      0.60858  1.52s
    122       0.15577       0.26120      0.59635  1.52s
    123       0.15559       0.25280      0.61548  1.52s
    124       0.15348       0.24401      0.62899  1.52s
    125       0.15231       0.24534      0.62081  1.52s
    126       0.15278       0.24272      0.62944  1.52s
    127       0.14871       0.24658      0.60308  1.52s
    128       0.15073       0.27232      0.55352  1.52s
    129       0.15016       0.25201      0.59584  1.52s
    130       0.15098       0.25221      0.59863  1.52s
    131       0.15103       0.26365      0.57282  1.53s
    132       0.15059       0.25201      0.59755  1.52s
    133       0.14732       0.24336      0.60536  1.52s
    134       0.14448       0.24604      0.58723  1.53s
    135       0.14503       0.24503      0.59187  1.53s
    136       0.14386       0.25163      0.57172  1.53s
    137       0.14736       0.25425      0.57957  1.52s
    138       0.14574       0.24476      0.59543  1.53s
    139       0.14689       0.24643      0.59606  1.53s
    140       0.14582       0.24452      0.59634  1.53s
    141       0.14221       0.26096      0.54495  1.52s
    142       0.14368       0.25144      0.57143  1.52s
    143       0.14273       0.24540      0.58160  1.52s
    144       0.14046       0.24597      0.57105  1.52s
    145       0.14015       0.26069      0.53760  1.52s
    146       0.14000       0.25379      0.55164  1.52s
    147       0.14029       0.28585      0.49078  1.52s
    148       0.14008       0.24065      0.58208  1.52s
    149       0.14034       0.24326      0.57688  1.53s
    150       0.13810       0.25340      0.54499  1.52s
    151       0.13554       0.24535      0.55246  1.53s
    152       0.13756       0.24048      0.57203  1.52s
    153       0.13811       0.25532      0.54092  1.52s
    154       0.13427       0.24543      0.54709  1.52s
    155       0.13300       0.25152      0.52877  1.52s
    156       0.13354       0.26515      0.50364  1.52s
    157       0.13386       0.24156      0.55414  1.52s
    158       0.13267       0.24883      0.53316  1.52s
    159       0.13448       0.24206      0.55556  1.52s
    160       0.13131       0.25076      0.52363  1.52s
    161       0.13146       0.24339      0.54013  1.53s
    162       0.13007       0.24259      0.53617  1.53s
    163       0.12647       0.23994      0.52708  1.52s
    164       0.13006       0.26122      0.49790  1.52s
    165       0.12844       0.24896      0.51593  1.52s
    166       0.13020       0.23963      0.54334  1.52s
    167       0.13044       0.24788      0.52623  1.53s
    168       0.12785       0.24178      0.52881  1.52s
    169       0.12646       0.25043      0.50496  1.53s
    170       0.12633       0.25265      0.50001  1.53s
    171       0.12855       0.25333      0.50744  1.52s
    172       0.12822       0.26184      0.48971  1.52s
    173       0.12958       0.24673      0.52518  1.52s
    174       0.12715       0.24777      0.51318  1.52s
    175       0.12247       0.24374      0.50248  1.52s
    176       0.12504       0.24004      0.52091  1.52s
    177       0.12289       0.26296      0.46735  1.52s
    178       0.12457       0.24044      0.51809  1.52s
    179       0.12148       0.26085      0.46573  1.52s
    180       0.11992       0.24291      0.49368  1.53s
    181       0.12117       0.25291      0.47910  1.52s
    182       0.12018       0.24447      0.49161  1.53s
    183       0.12230       0.24805      0.49304  1.52s
    184       0.12077       0.24285      0.49730  1.52s
    185       0.11937       0.24316      0.49089  1.52s
    186       0.11859       0.24957      0.47518  1.52s
    187       0.11603       0.24452      0.47451  1.52s
    188       0.11417       0.24359      0.46872  1.52s
    189       0.11568       0.25750      0.44925  1.52s
    190       0.11533       0.25784      0.44729  1.52s
    191       0.11588       0.24412      0.47470  1.77s
    192       0.11519       0.25137      0.45827  1.53s
    193       0.11357       0.24865      0.45673  1.52s
    194       0.11247       0.25222      0.44595  1.51s
    195       0.11573       0.25001      0.46290  1.51s
    196       0.11574       0.24026      0.48171  1.51s
    197       0.11235       0.26956      0.41678  1.51s
    198       0.11052       0.24924      0.44342  1.52s
    199       0.11175       0.24248      0.46086  1.51s
    200       0.10851       0.25232      0.43005  1.52s
    201       0.11126       0.24386      0.45625  1.51s
    202       0.11065       0.24542      0.45087  1.52s
    203       0.10820       0.24583      0.44013  1.51s
    204       0.10839       0.25487      0.42526  1.51s
    205       0.11048       0.25142      0.43942  1.52s
Early stopping.
Best valid loss was 0.237818 at epoch 105.
Loaded parameters to layer 'hidden1' (shape 400x1000).
Loaded parameters to layer 'hidden1' (shape 1000).
Loaded parameters to layer 'hidden2' (shape 1000x1000).
Loaded parameters to layer 'hidden2' (shape 1000).
Loaded parameters to layer 'output' (shape 1000x1).
Loaded parameters to layer 'output' (shape 1).
Out[13]:
NeuralNet(X_tensor_type=None,
     batch_iterator_test=<nolearn.lasagne.base.BatchIterator object at 0x116999090>,
     batch_iterator_train=<nolearn.lasagne.base.BatchIterator object at 0x116983fd0>,
     custom_score=None, dropout1_p=0.5, dropout2_p=0.5,
     hidden1_num_units=1000, hidden2_num_units=1000,
     input_shape=(None, 400),
     layers=[('input', <class 'lasagne.layers.input.InputLayer'>), ('hidden1', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout1', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden2', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout2', <class 'lasagne.layers.noise.DropoutLayer'>), ('output', <class 'lasagne.layers.dense.DenseLayer'>)],
     loss=None, max_epochs=1000, more_params={},
     objective=<function objective at 0x1169849b0>,
     objective_loss_function=<function squared_error at 0x11696a140>,
     on_epoch_finished=[<__main__.AdjustVariable object at 0x175ce2490>, <__main__.AdjustVariable object at 0x175cdf7d0>, <__main__.EarlyStopping object at 0x175cdf810>, <nolearn.lasagne.handlers.PrintLog instance at 0x1791dd7a0>],
     on_training_finished=[],
     on_training_started=[<nolearn.lasagne.handlers.PrintLayerInfo instance at 0x1791dd758>],
     output_nonlinearity=<function tanh at 0x1169566e0>,
     output_num_units=1, regression=True,
     train_split=<nolearn.lasagne.base.TrainSplit object at 0x1169990d0>,
     update=<function nesterov_momentum at 0x11696a7d0>,
     update_learning_rate=array(0.007940080016851425, dtype=float32),
     update_momentum=array(0.9206033945083618, dtype=float32),
     use_label_encoder=False, verbose=1,
     y_tensor_type=TensorType(float32, matrix))

Evaluate the NN


In [31]:
Y_pred = model.predict(np.asarray(X_test, dtype=np.float32))

# threshold the continuous outputs into the three classes; +/-0.33 splits [-1, 1] roughly into thirds
pos = Y_pred >= .33
neg = Y_pred <= -0.33
neu = np.logical_and(Y_pred < 0.33, Y_pred > -0.33)
Y_pred[pos] = 1
Y_pred[neg] = -1
Y_pred[neu] = 0

print classification_report(Y_test, Y_pred)

cm = confusion_matrix(Y_test, Y_pred)

print cm

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


             precision    recall  f1-score   support

       -1.0       0.92      0.84      0.88       500
        0.0       0.63      0.84      0.72       500
        1.0       0.81      0.61      0.70       500

avg / total       0.79      0.76      0.77      1500

[[421  68  11]
 [ 22 419  59]
 [ 16 178 306]]

In [14]:
Y_pred = model.predict(np.asarray(X_test, dtype=np.float32))

# threshold the continuous outputs into the three classes; +/-0.33 splits [-1, 1] roughly into thirds
pos = Y_pred >= .33
neg = Y_pred <= -0.33
neu = np.logical_and(Y_pred < 0.33, Y_pred > -0.33)
Y_pred[pos] = 1
Y_pred[neg] = -1
Y_pred[neu] = 0

print classification_report(Y_test, Y_pred)

cm = confusion_matrix(Y_test, Y_pred)

print cm

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


             precision    recall  f1-score   support

       -1.0       0.91      0.86      0.88       500
        0.0       0.68      0.68      0.68       500
        1.0       0.72      0.75      0.73       500

avg / total       0.77      0.76      0.77      1500

[[430  52  18]
 [ 29 340 131]
 [ 16 107 377]]

The biggest source of confusion is between positive and neutral. Let's try to find features that distinguish these two classes.
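
To make that easier to see, the confusion matrix from the cell above can be normalised by row so each entry is the fraction of a true class assigned to each predicted class; a small snippet (not run here) reusing `cm`:

# per-class fractions: rows are true labels (-1, 0, 1), columns are predictions
cm_norm = cm.astype(np.float32) / cm.sum(axis=1)[:, np.newaxis]
print np.round(cm_norm, 2)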

Let's see how good (or bad) the linguistic features we have are on their own


In [7]:
import features
reload(features)

positive_features = np.zeros((len(positive), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
negative_features = np.zeros((len(negative), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
neutral_features  = np.zeros((len(neutral) , features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)

for i, sentence in enumerate(positive):
    sent_vec = features.get_linguistic_features(sentence)
    positive_features[i,] = sent_vec

for i, sentence in enumerate(negative):
    sent_vec = features.get_linguistic_features(sentence)
    negative_features[i,] = sent_vec

for i, sentence in enumerate(neutral):
    sent_vec = features.get_linguistic_features(sentence)
    neutral_features[i,] = sent_vec

# we'll ignore the imbalance of the classes for now and see what happens
# choose 500 samples from each class to be included in the test set

num_test_samples_per_class = 500

pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)

X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test  = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))

# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test  = shuffle(X_test , Y_test , random_state=111)

Train a logistic regression on the linguistic features


In [8]:
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_train, Y_train, cv=3, verbose=1, n_jobs=8)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

model.fit(X_train, Y_train)


Accuracy: 0.62159 [0.60780 - 0.63538]
[Parallel(n_jobs=8)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed:    0.1s finished
Out[8]:
LogisticRegression(C=100000.0, class_weight='auto', dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=33, tol=0.0001)

In [34]:
print model.coef_

Y_pred = model.predict(X_test)
print classification_report(Y_test, Y_pred)

cm = confusion_matrix(Y_test, Y_pred)

print cm

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


[[ 0.35346987  4.92481448  1.80200111  2.88740596  1.34519168 -0.06794115
   1.20213894 -4.38018904]
 [-2.13637061 -3.40506267 -2.38370639 -1.82043459  0.25270369 -2.41349426
  -3.46977753  0.79135405]
 [ 2.92176862 -1.57157761  1.50862341  1.11264888 -2.41054806  3.26814909
   1.841855    3.13502001]]
             precision    recall  f1-score   support

       -1.0       0.74      0.62      0.68       500
        0.0       0.53      0.66      0.59       500
        1.0       0.61      0.55      0.58       500

avg / total       0.63      0.61      0.61      1500

[[311 123  66]
 [ 57 331 112]
 [ 53 171 276]]

Okay, so these are much worse than the word2vec features. Let's combine the two feature sets.


In [38]:
model_path = '../models/word2vec_twitter_model.bin'
w2v = Word2Vec.load_word2vec_format(model_path, binary=True)

import features
reload(features)

num_features = w2v.layer1_size + features.NUM_LINGUISTIC_FEATURES
positive_features = np.zeros((len(positive), num_features), dtype=np.float32)
negative_features = np.zeros((len(negative), num_features), dtype=np.float32)
neutral_features  = np.zeros((len(neutral) , num_features), dtype=np.float32)

for i, sentence in enumerate(positive):
    sent_vec = features.get_features(w2v, sentence)
    positive_features[i,] = sent_vec

for i, sentence in enumerate(negative):
    sent_vec = features.get_features(w2v, sentence)
    negative_features[i,] = sent_vec

for i, sentence in enumerate(neutral):
    sent_vec = features.get_features(w2v, sentence)
    neutral_features[i,] = sent_vec

del w2v
    
num_test_samples_per_class = 500

pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)

X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test  = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))

# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test  = shuffle(X_test , Y_test , random_state=111)

In [39]:
model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(model, X_train, Y_train, cv=3, verbose=1, n_jobs=8)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

model.fit(X_train, Y_train)


[Parallel(n_jobs=8)]: Done   1 out of   3 | elapsed:   16.0s remaining:   32.0s
[Parallel(n_jobs=8)]: Done   3 out of   3 | elapsed:   16.3s finished
Accuracy: 0.76723 [0.76123 - 0.77323]
Out[39]:
LogisticRegression(C=100000.0, class_weight='auto', dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=33, tol=0.0001)

In [40]:
Y_pred = model.predict(X_test)
print classification_report(Y_test, Y_pred)

cm = confusion_matrix(Y_test, Y_pred)

print cm

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

ax.set_xticklabels(['']+['-1', '0', '1'])
ax.set_yticklabels(['']+['-1', '0', '1'])

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


             precision    recall  f1-score   support

       -1.0       0.84      0.88      0.86       500
        0.0       0.71      0.79      0.75       500
        1.0       0.78      0.66      0.71       500

avg / total       0.78      0.77      0.77      1500

[[441  35  24]
 [ 37 393  70]
 [ 46 126 328]]

Very little improvement. Let's train a model on the word2vec features alone, then feed its predicted class in as an extra feature alongside the linguistic ones.


In [18]:
w2v_outputs = model.predict(X_train)

# convert the outputs to 3 indicator (i.e. binary) features
mlb = MultiLabelBinarizer()
w2v_outputs = mlb.fit_transform([(x,) for x in w2v_outputs.tolist()])
print list(mlb.classes_)


[-1.0, 0.0, 1.0]

In [23]:
# load the linguistic features again

import features
reload(features)

positive_features = np.zeros((len(positive), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
negative_features = np.zeros((len(negative), features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)
neutral_features  = np.zeros((len(neutral) , features.NUM_LINGUISTIC_FEATURES), dtype=np.float32)

for i, sentence in enumerate(positive):
    sent_vec = features.get_linguistic_features(sentence)
    positive_features[i,] = sent_vec

for i, sentence in enumerate(negative):
    sent_vec = features.get_linguistic_features(sentence)
    negative_features[i,] = sent_vec

for i, sentence in enumerate(neutral):
    sent_vec = features.get_linguistic_features(sentence)
    neutral_features[i,] = sent_vec

# we'll ignore the imbalance of the classes for now and see what happens
# choose 500 samples from each class to be included in the test set

num_test_samples_per_class = 500

pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)

X_train = np.vstack((
    pos_train,
    neg_train,
    neu_train
))
X_test  = np.vstack((
    pos_test,
    neg_test,
    neu_test
))
Y_train = np.hstack((
    np.ones((pos_train.shape[0]), dtype=np.float32),
    np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_train.shape[0]), dtype=np.float32)
))
Y_test = np.hstack((
    np.ones((pos_test.shape[0]), dtype=np.float32),
    np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
    np.zeros((neu_test.shape[0]), dtype=np.float32)
))

# shuffle 'em
X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
X_test , Y_test  = shuffle(X_test , Y_test , random_state=111)

In [20]:
# now combine the features and train a new classifier
X_train = np.hstack((
    w2v_outputs,
    X_train
))

ling_model = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
scores = cross_val_score(ling_model, X_train, Y_train, cv=3, verbose=1)
print "Accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

ling_model.fit(X_train, Y_train)
print ling_model.coef_


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished
Accuracy: 0.77722 [0.77457 - 0.77987]
[[ 1.78308727 -2.00017066 -1.51003745 -0.12711376  2.66382888  0.83223089
   1.41265682  0.54149745 -0.37233413  1.36643276 -2.02640973]
 [-1.28674621  1.80233702 -0.26957343 -1.33392601 -1.62424827 -1.19637774
  -0.86351365  0.62487347 -1.45411957 -1.84409244 -0.2899234 ]
 [-1.78057058 -0.89058757  1.02719014  1.81030889 -0.7555225   0.90475212
   0.57236135 -1.62974088  1.93892389  0.64970067  1.8247693 ]]