In [1]:
import codecs, sys
from word2vec.word2vecReader import Word2Vec
from preprocessing import preprocess_tweet
from features import get_word2vec_features, NUM_LINGUISTIC_FEATURES, get_linguistic_features
from nn import NN
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
def print_evaluations(Y_true, Y_pred):
report = classification_report(Y_true, Y_pred)
print 'Classification report:\n%s' % str(report)
cm = confusion_matrix(Y_true, Y_pred)
print 'Confusion Matrix:\n%s' % str(cm)
sys.stdout.flush()
def load_word2vec(path='../models/word2vec_twitter_model.bin'):
    """Load the pre-trained Twitter word2vec model stored in binary format
    at ``path`` and return it.

    NOTE(review): assumes the word2vec-twitter binary format understood by
    the bundled ``word2vecReader`` -- confirm the model file exists.
    """
    model = Word2Vec.load_word2vec_format(path, binary=True)
    return model
def _read_tweets(path):
    # Read a UTF-8 file with one tweet per line; return the unique,
    # stripped lines.  Sorted so that iteration order (and therefore the
    # train/test split below) is reproducible across interpreter runs --
    # a raw set's iteration order depends on PYTHONHASHSEED.
    f = codecs.open(path, 'r', 'utf-8')
    try:
        return sorted({l.strip() for l in f})
    finally:
        f.close()


def _vectorise(sentences, vec_function, num_features):
    # Build a (num_sentences, num_features) float32 matrix, one row per
    # sentence, by applying vec_function to each sentence.
    features = np.zeros((len(sentences), num_features), dtype=np.float32)
    for i, sentence in enumerate(sentences):
        features[i,] = vec_function(sentence)
    return features


def load_data(vec_function, num_features, num_test_samples_per_class=500):
    """Load the three sentiment datasets, vectorise them, and split them
    into a training and a (balanced) test set.

    vec_function               -- maps one sentence (unicode string) to a
                                  feature vector of length num_features.
    num_features               -- dimensionality of the feature vectors.
    num_test_samples_per_class -- absolute number of held-out test samples
                                  taken from each of the three classes.

    Returns (X_train, Y_train, X_test, Y_test) where the labels are
    +1 = positive, -1 = negative, 0 = neutral.
    """
    # load the raw data and convert the sentences to feature vectors
    positive_features = _vectorise(_read_tweets('../data/positive-all'),
                                   vec_function, num_features)
    negative_features = _vectorise(_read_tweets('../data/negative-all'),
                                   vec_function, num_features)
    neutral_features = _vectorise(_read_tweets('../data/neutral-all'),
                                  vec_function, num_features)
    # split each class separately so the test set is perfectly balanced
    pos_train, pos_test = train_test_split(positive_features, test_size=num_test_samples_per_class, random_state=22)
    neg_train, neg_test = train_test_split(negative_features, test_size=num_test_samples_per_class, random_state=22)
    neu_train, neu_test = train_test_split(neutral_features , test_size=num_test_samples_per_class, random_state=22)
    # combine the per-class splits into one big matrix each
    X_train = np.vstack((pos_train, neg_train, neu_train))
    X_test = np.vstack((pos_test, neg_test, neu_test))
    # labels: +1 positive, -1 negative, 0 neutral (same order as the stacks)
    Y_train = np.hstack((
        np.ones((pos_train.shape[0]), dtype=np.float32),
        np.ones((neg_train.shape[0]), dtype=np.float32) * -1,
        np.zeros((neu_train.shape[0]), dtype=np.float32)
    ))
    Y_test = np.hstack((
        np.ones((pos_test.shape[0]), dtype=np.float32),
        np.ones((neg_test.shape[0]), dtype=np.float32) * -1,
        np.zeros((neu_test.shape[0]), dtype=np.float32)
    ))
    # shuffle so the classes are interleaved rather than block-ordered
    X_train, Y_train = shuffle(X_train, Y_train, random_state=111)
    X_test , Y_test = shuffle(X_test , Y_test , random_state=111)
    return X_train, Y_train, X_test , Y_test
In [2]:
# Seed numpy so any stochastic step below is reproducible.
np.random.seed(142)
print 'Loading the word2vec model...'
sys.stdout.flush()
w2v = load_word2vec()
print 'Building the word2vec sentence features...'
sys.stdout.flush()
# Sentence vectors derived from the pre-trained word2vec model.
vec_function = lambda sentence: get_word2vec_features(w2v, sentence)
num_features = w2v.layer1_size
w2v_X_train, w2v_Y_train, w2v_X_test , w2v_Y_test = load_data(vec_function, num_features)
print 'Building the linguistic features...'
sys.stdout.flush()
# Hand-crafted linguistic features (counts of punctuation, elongations, ...
# -- see features.py; NUM_LINGUISTIC_FEATURES gives the dimensionality).
vec_function = lambda sentence: get_linguistic_features(sentence)
num_features = NUM_LINGUISTIC_FEATURES
ling_X_train, ling_Y_train, ling_X_test , ling_Y_test = load_data(vec_function, num_features)
In [3]:
del w2v # don't need it anymore
# Baseline 1: logistic regression on the word2vec sentence features alone.
print 'Training a logistic regression model on the word2vec features...'
sys.stdout.flush()
# NOTE(review): class_weight='auto' is the pre-0.17 sklearn spelling; modern
# versions require 'balanced'.  Likewise sklearn.cross_validation (imported
# at the top) was removed in 0.20 in favour of sklearn.model_selection.
w2v_lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
w2v_lr.fit(w2v_X_train, w2v_Y_train)
predictions = w2v_lr.predict(w2v_X_test)
print_evaluations(w2v_Y_test, predictions)
# In addition, let's do CV and print out the results
# (cross_val_score clones and re-fits the estimator per fold; the fit above
# is unaffected)
scores = cross_val_score(w2v_lr, w2v_X_train, w2v_Y_train, cv=5, verbose=1)
# report mean accuracy with a ~95% interval (mean +/- 2 std)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
sys.stdout.flush()
# Baseline 2: logistic regression on the linguistic features alone.
print 'Training a logistic regression model on the linguistic features...'
sys.stdout.flush()
ling_lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
ling_lr.fit(ling_X_train, ling_Y_train)
predictions = ling_lr.predict(ling_X_test)
print_evaluations(ling_Y_test, predictions)
# In addition, let's do CV and print out the results
scores = cross_val_score(ling_lr, ling_X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
sys.stdout.flush()
In [4]:
# predict on both train and test
w2v_train_predictions = w2v_lr.predict(w2v_X_train)
w2v_test_predictions = w2v_lr.predict(w2v_X_test)
# One-hot encode the predicted labels (-1/0/+1 -> three indicator columns).
# BUGFIX: fit the binarizer on the train predictions only and re-use it for
# the test predictions -- the original called fit_transform on both, which
# re-fits on the test set and only works by luck when both sets happen to
# contain the same label values.
mlb = MultiLabelBinarizer()
w2v_train_predictions_binarized = mlb.fit_transform(w2v_train_predictions.reshape(-1, 1))
w2v_test_predictions_binarized = mlb.transform(w2v_test_predictions.reshape(-1, 1))
# now stack the binarized word2vec predictions with the linguistic features
# and train a new classifier on the combination
X_train = np.hstack((
    w2v_train_predictions_binarized,
    ling_X_train
))
X_test = np.hstack((
    w2v_test_predictions_binarized,
    ling_X_test
))
# normalise each row to unit length (safe: every row carries the single 1
# from its binarized prediction, so no norm can be zero)
lengths = np.linalg.norm(X_train, axis=1)
X_train = X_train / lengths[:, None] # divides each row by the corresponding element
lengths = np.linalg.norm(X_test, axis=1)
X_test = X_test / lengths[:, None]
In [5]:
# Train the second-stage classifier on the combined (binarized word2vec
# prediction + linguistic) features built in the previous cell.
print 'Training a logistic regression model on the combined features...'
sys.stdout.flush()
# NOTE(review): class_weight='auto' was renamed 'balanced' in newer sklearn.
lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
# the labels are shared across feature sets, hence ling_Y_train here
lr.fit(X_train, ling_Y_train)
predictions = lr.predict(X_test)
print_evaluations(ling_Y_test, predictions)
# In addition, let's do CV and print out the results
scores = cross_val_score(lr, X_train, ling_Y_train, cv=5, verbose=1)
# mean accuracy with a ~95% interval (mean +/- 2 std)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
sys.stdout.flush()
In [6]:
print '\tW2V_Neg\tW2V_Neu\tW2VPos\tc(Pos)\tc(Neg)\tc(Int)\tc(Elo)\t?\t!\t!!..\t#'
row_titles = ['Neg', 'Neu', 'Pos']
for title, row in zip(row_titles, lr.coef_):
print '%s:\t' % title,
for v in row:
print '%-.2f\t' % v,
print ''
In [7]:
# Sanity-check setup: swap the linguistic features for uniform random noise
# of the same shape.  Re-seed so the noise is reproducible.
np.random.seed(142)
random_ling_X_train = np.random.rand(ling_X_train.shape[0], ling_X_train.shape[1])
random_ling_X_test = np.random.rand(ling_X_test.shape[0], ling_X_test.shape[1])
# Combine the (real) binarized word2vec predictions with the random features.
X_train_rand = np.hstack((
w2v_train_predictions_binarized,
random_ling_X_train
))
X_test_rand = np.hstack((
w2v_test_predictions_binarized,
random_ling_X_test
))
In [19]:
lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
lr.fit(X_train_rand, ling_Y_train)
predictions = lr.predict(X_test_rand)
print_evaluations(ling_Y_test, predictions)
# In addition, let's do CV and print out the results
scores = cross_val_score(lr, X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
print '\nCoefficients:'
print '\tW2V_Neg\tW2V_Neu\tW2VPos\tRand1\tRand2\tRand3\tRand4\tRand5\tRand6\tRand7\tRand8'
row_titles = ['Neg', 'Neu', 'Pos']
for title, row in zip(row_titles, lr.coef_):
print '%s:\t' % title,
for v in row:
print '%-.2f\t' % v,
print ''
In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
feature_selector = SelectKBest(chi2, k=8)
feature_selector.fit(X_train, ling_Y_train)
new_X_train = feature_selector.transform(X_train)
new_X_test = feature_selector.transform(X_test)
lr = LogisticRegression(C=1e5, class_weight='auto', random_state=33)
lr.fit(new_X_train, ling_Y_train)
predictions = lr.predict(new_X_test)
print_evaluations(ling_Y_test, predictions)
# In addition, we'll do CV validation and print out the results
scores = cross_val_score(lr, new_X_train, ling_Y_train, cv=5, verbose=1)
print "CV-5 accuracy: %0.5f [%0.5f - %0.5f]" % (scores.mean(), scores.mean()-scores.std() * 2, scores.mean()+scores.std() * 2)
print '\nCoefficients:'
print '\tF1\tF2\tF3\tF4\tF5\tF6\tF7\tF8'
row_titles = ['Neg', 'Neu', 'Pos']
for title, row in zip(row_titles, lr.coef_):
print '%s:\t' % title,
for v in row:
print '%-.2f\t' % v,
print ''
In [ ]: