In [1]:
import codecs, sys

from word2vec.word2vecReader import Word2Vec
from preprocessing import preprocess_tweet
from features import get_word2vec_features, NUM_LINGUISTIC_FEATURES, get_linguistic_features
from nn import NN

import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

def print_evaluations(Y_true, Y_pred):
    report = classification_report(Y_true, Y_pred)
    print 'Classification report:\n%s' % str(report)

    cm = confusion_matrix(Y_true, Y_pred)
    print 'Confusion Matrix:\n%s' % str(cm)
    
    sys.stdout.flush()

def load_word2vec(path='../models/word2vec_twitter_model.bin'):
    return Word2Vec.load_word2vec_format(path, binary=True)

def load_data(vec_function, num_features, num_test_samples_per_class=500):
    # first load the raw data
    f = codecs.open('../data/positive-all', 'r', 'utf-8')
    positive = {l.strip() for l in f}
    f.close()

    f = codecs.open('../data/negative-all', 'r', 'utf-8')
    negative = {l.strip() for l in f}
    f.close()

    f = codecs.open('../data/neutral-all', 'r', 'utf-8')
    neutral = {l.strip() for l in f}
    f.close()
    
    # convert the sentences to vectors
    positive_features = np.zeros((len(positive), num_features), dtype=np.float32)
    negative_features = np.zeros((len(negative), num_features), dtype=np.float32)
    neutral_features  = np.zeros((len(neutral) , num_features), dtype=np.float32)

    for i, sentence in enumerate(positive):
        sent_vec = vec_function(sentence)
        positive_features[i,] = sent_vec

    for i, sentence in enumerate(negative):
        sent_vec = vec_function(sentence)
        negative_features[i,] = sent_vec

    for i, sentence in enumerate(neutral):
        sent_vec = vec_function(sentence)
        neutral_features[i,] = sent_vec
    
    # finally split into train/test and combine them into one big matrix
    pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
    neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
    neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)

    X_train = np.vstack((
        pos_train,
        neg_train,
        neu_train
    ))
    X_test  = np.vstack((
        pos_test,
        neg_test,
        neu_test
    ))
    Y_train = np.hstack((
        np.ones((pos_train.shape[0]), dtype=np.float32),
        np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
        np.zeros((neu_train.shape[0]), dtype=np.float32)
    ))
    Y_test = np.hstack((
        np.ones((pos_test.shape[0]), dtype=np.float32),
        np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
        np.zeros((neu_test.shape[0]), dtype=np.float32)
    ))

    # shuffle 'em
    X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
    X_test , Y_test  = shuffle(X_test , Y_test , random_state=111)
    
    return X_train, Y_train, X_test , Y_test


Using gpu device 0: GeForce GT 650M (CNMeM is disabled)

Load the word2vec features and the linguistic ones


In [2]:
np.random.seed(142)

print 'Loading the word2vec model...'
sys.stdout.flush()
w2v = load_word2vec()

print 'Building the word2vec sentence features...'
sys.stdout.flush()

vec_function = lambda sentence: get_word2vec_features(w2v, sentence)
num_features = w2v.layer1_size
w2v_X_train, w2v_Y_train, w2v_X_test , w2v_Y_test = load_data(vec_function, num_features)

print 'Building the linguistic features...'
sys.stdout.flush()

vec_function = lambda sentence: get_linguistic_features(sentence)
num_features = NUM_LINGUISTIC_FEATURES
ling_X_train, ling_Y_train, ling_X_test , ling_Y_test = load_data(vec_function, num_features)


Loading the word2vec model...
Building the word2vec sentence features...
Building the linguistic features...

Let's train a Logistic Regression model on each and compare the results.


In [3]:
del w2v # don't need it anymore

print 'Training a logistic regression model on the word2vec features...'
sys.stdout.flush()

w2v_lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
w2v_lr.fit(w2v_X_train, w2v_Y_train)

predictions = w2v_lr.predict(w2v_X_test)
print_evaluations(w2v_Y_test, predictions)

# In addition, let's do CV and print out the results
scores = cross_val_score(w2v_lr, w2v_X_train, w2v_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
sys.stdout.flush()

print 'Training a logistic regression model on the linguistic features...'
sys.stdout.flush()

ling_lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
ling_lr.fit(ling_X_train, ling_Y_train)

predictions = ling_lr.predict(ling_X_test)
print_evaluations(ling_Y_test, predictions)

# In addition, let's do CV and print out the results
scores = cross_val_score(ling_lr, ling_X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
sys.stdout.flush()


Training a logistic regression model on the word2vec features...
Classification report:
             precision    recall  f1-score   support

       -1.0       0.83      0.89      0.86       500
        0.0       0.71      0.79      0.75       500
        1.0       0.77      0.64      0.70       500

avg / total       0.77      0.77      0.77      1500

Confusion Matrix:
[[444  31  25]
 [ 39 393  68]
 [ 52 129 319]]
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    7.2s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.7s finished
CV-5 accuracy: 0.76345 [0.76044 - 0.76646]
Training a logistic regression model on the linguistic features...
Classification report:
             precision    recall  f1-score   support

       -1.0       0.65      0.75      0.70       500
        0.0       0.54      0.52      0.53       500
        1.0       0.63      0.54      0.58       500

avg / total       0.60      0.61      0.60      1500

Confusion Matrix:
[[377  72  51]
 [130 262 108]
 [ 77 154 269]]
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished
CV-5 accuracy: 0.62213 [0.60591 - 0.63835]

Now, let's combine the outputs of the word2vec model with the linguistic features


In [4]:
# predict on both train and test
w2v_train_predictions = w2v_lr.predict(w2v_X_train)
w2v_test_predictions  = w2v_lr.predict(w2v_X_test)

mlb = MultiLabelBinarizer()
w2v_train_predictions_binarized = mlb.fit_transform(w2v_train_predictions.reshape(-1, 1))
w2v_test_predictions_binarized  = mlb.fit_transform(w2v_test_predictions.reshape(-1, 1))

# now stack these with the ling features
# now combine the features and train a new classifier
X_train = np.hstack((
    w2v_train_predictions_binarized,
    ling_X_train
))
X_test = np.hstack((
    w2v_test_predictions_binarized,
    ling_X_test
))

# normalise to unit length
lengths = np.linalg.norm(X_train, axis=1)
X_train = X_train / lengths[:, None] # divides each row by the corresponding element
lengths = np.linalg.norm(X_test, axis=1)
X_test  = X_test / lengths[:, None]

Now let's train a logistic regression model on the combined data


In [5]:
print 'Training a logistic regression model on the combined features...'
sys.stdout.flush()
lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
lr.fit(X_train, ling_Y_train)

predictions = lr.predict(X_test)
print_evaluations(ling_Y_test, predictions)

# In addition, let's do CV and print out the results
scores = cross_val_score(lr, X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
sys.stdout.flush()


Training a logistic regression model on the combined features...
Classification report:
             precision    recall  f1-score   support

       -1.0       0.83      0.89      0.86       500
        0.0       0.71      0.78      0.74       500
        1.0       0.77      0.64      0.70       500

avg / total       0.77      0.77      0.77      1500

Confusion Matrix:
[[444  31  25]
 [ 40 391  69]
 [ 50 130 320]]
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
CV-5 accuracy: 0.77595 [0.76981 - 0.78209]

The precision/recall/F1 scores are almost exactly the same as the word2vec features by themselves. However, cross validation slightly favours the combined features.

Let's checkout the coefficients


In [6]:
print '\tW2V_Neg\tW2V_Neu\tW2VPos\tc(Pos)\tc(Neg)\tc(Int)\tc(Elo)\t?\t!\t!!..\t#'
row_titles = ['Neg', 'Neu', 'Pos']
for title, row in zip(row_titles, lr.coef_):
    print '%s:\t' % title,
    for v in row:
        print '%-.2f\t' % v,
    print ''


	W2V_Neg	W2V_Neu	W2VPos	c(Pos)	c(Neg)	c(Int)	c(Elo)	?	!	!!..	#
Neg:	6.04	-2.22	-1.52	0.23	3.93	1.29	2.29	0.96	-0.20	1.70	-2.38	
Neu:	-4.39	2.33	-2.83	-1.36	-2.08	-1.32	-0.48	0.86	-1.56	-2.31	0.09	
Pos:	-4.48	-2.74	2.19	1.61	-1.59	0.65	-0.55	-2.22	1.97	0.61	2.08	

Okay, so the word2vec features have more magnitude, but also some of the linguistic features seem to be kind of good, especially in finding the negative ones. The word2vec feature seems to be the only one that's useful in finding Neutral sentiments.

Let's set all the linguistic features to random numbers and see what happens


In [7]:
np.random.seed(142)

random_ling_X_train = np.random.rand(ling_X_train.shape[0], ling_X_train.shape[1])
random_ling_X_test  = np.random.rand(ling_X_test.shape[0], ling_X_test.shape[1])

X_train_rand = np.hstack((
    w2v_train_predictions_binarized,
    random_ling_X_train
))
X_test_rand = np.hstack((
    w2v_test_predictions_binarized,
    random_ling_X_test
))

train again and checkout the results


In [19]:
lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
lr.fit(X_train_rand, ling_Y_train)

predictions = lr.predict(X_test_rand)
print_evaluations(ling_Y_test, predictions)

# In addition, let's do CV and print out the results
scores = cross_val_score(lr, X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

print '\nCoefficients:'
print '\tW2V_Neg\tW2V_Neu\tW2VPos\tRand1\tRand2\tRand3\tRand4\tRand5\tRand6\tRand7\tRand8'
row_titles = ['Neg', 'Neu', 'Pos']
for title, row in zip(row_titles, lr.coef_):
    print '%s:\t' % title,
    for v in row:
        print '%-.2f\t' % v,
    print ''


Classification report:
             precision    recall  f1-score   support

       -1.0       0.83      0.89      0.86       500
        0.0       0.71      0.79      0.75       500
        1.0       0.77      0.64      0.70       500

avg / total       0.77      0.77      0.77      1500

Confusion Matrix:
[[444  31  25]
 [ 39 393  68]
 [ 52 129 319]]
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
CV-5 accuracy: 0.77595 [0.76981 - 0.78209]

Coefficients:
	W2V_Neg	W2V_Neu	W2VPos	Rand1	Rand2	Rand3	Rand4	Rand5	Rand6	Rand7	Rand8
Neg:	2.49	-1.90	-1.58	0.12	-0.04	0.05	0.01	0.08	-0.03	-0.02	0.03	
Neu:	-1.67	1.72	-0.79	0.03	0.04	-0.06	-0.08	-0.02	0.11	-0.03	0.08	
Pos:	-1.82	-0.63	1.82	-0.11	-0.01	0.02	0.08	-0.04	-0.10	0.06	-0.11	

Same numbers as well, which suggests that the linguistic features aren't really affecting the performance of the model.

Final experiment: Let's train on the top 8 features


In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

feature_selector = SelectKBest(chi2, k=8)
feature_selector.fit(X_train, ling_Y_train)

new_X_train = feature_selector.transform(X_train)
new_X_test  = feature_selector.transform(X_test)

lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
lr.fit(new_X_train, ling_Y_train)

predictions = lr.predict(new_X_test)
print_evaluations(ling_Y_test, predictions)

# In addition, we'll do CV validation and print out the results
scores = cross_val_score(lr, new_X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)

print '\nCoefficients:'
print '\tF1\tF2\tF3\tF4\tF5\tF6\tF7\tF8'
row_titles = ['Neg', 'Neu', 'Pos']
for title, row in zip(row_titles, lr.coef_):
    print '%s:\t' % title,
    for v in row:
        print '%-.2f\t' % v,
    print ''


Classification report:
             precision    recall  f1-score   support

       -1.0       0.83      0.89      0.86       500
        0.0       0.71      0.78      0.74       500
        1.0       0.77      0.64      0.70       500

avg / total       0.77      0.77      0.77      1500

Confusion Matrix:
[[444  31  25]
 [ 40 388  72]
 [ 49 129 322]]
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished
CV-5 accuracy: 0.77595 [0.76866 - 0.78324]

Coefficients:
	F1	F2	F3	F4	F5	F6	F7\F8
Neg:	4.22	-4.31	-3.37	-0.74	2.96	0.38	-0.54	-2.72	
Neu:	-4.00	2.85	-2.47	-1.15	-1.90	0.96	-1.71	0.19	
Pos:	-3.96	-2.21	2.77	1.85	-1.35	-2.07	2.21	2.09	

Nothing seems to be able to beat the word2vec features!


In [ ]: