In [1]:
# Keras 1.x-style API; LSTM is imported but not used in this notebook.
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [37]:
train_file = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
test_file = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
# Tab-separated gold files: column 1 = sentiment label, column 2 = tweet text.
df_train = pd.read_csv(train_file, header=None, sep="\t")
df_test = pd.read_csv(test_file, header=None, sep="\t")
# Drop tweets whose text could not be downloaded.
NULL_TEXT = "Not Available"
df_text_train = df_train[df_train[2] != NULL_TEXT][[1, 2]]
df_text_test = df_test[df_test[2] != NULL_TEXT][[1, 2]]
X_text_train, y_text_train = df_text_train[2], df_text_train[1]
X_text_test, y_text_test = df_text_test[2], df_text_test[1]


# Fit one binary 1-3 gram vocabulary on train + dev together so both splits share the same feature space.
X_text = X_text_train.tolist() + X_text_test.tolist()
vectorizer = CountVectorizer(min_df=3, ngram_range=(1, 3), binary=True)
X = vectorizer.fit_transform(X_text).todense()

# One-hot encode the three sentiment labels.
Y_text = y_text_train.tolist() + y_text_test.tolist()
lb = LabelBinarizer()
Y = lb.fit_transform(Y_text)

print X.shape, X[:2]
# Split the stacked matrices back into train and dev by the original train size.
X_train, Y_train = X[0:X_text_train.shape[0]], Y[0:X_text_train.shape[0]]
X_test, Y_test = X[X_text_train.shape[0]:], Y[X_text_train.shape[0]:]
print X_train.shape, Y_train.shape, X_test.shape, Y_test.shape


(7164, 12579) [[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(5366, 12579) (5366, 3) (1798, 12579) (1798, 3)
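
A quick sanity check of the class balance in the two splits helps put the accuracies below in context (sketch, using pandas' value_counts; not part of the original run):

In [ ]:
# Fraction of tweets per sentiment class in the train and dev splits.
print y_text_train.value_counts(normalize=True)
print y_text_test.value_counts(normalize=True)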

In [5]:
# Single softmax layer over the bag-of-ngrams features, i.e. a multinomial logistic regression baseline.
model = Sequential()
model.add(Dense(3, input_dim=X.shape[1], activation="softmax"))

In [6]:
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [11]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=128, show_accuracy=True)


Train on 5366 samples, validate on 1798 samples
Epoch 1/10
5366/5366 [==============================] - 0s - loss: 0.6761 - acc: 0.7152 - val_loss: 0.9885 - val_acc: 0.5122
Epoch 2/10
5366/5366 [==============================] - 0s - loss: 0.6704 - acc: 0.7164 - val_loss: 0.9920 - val_acc: 0.5100
Epoch 3/10
5366/5366 [==============================] - 0s - loss: 0.6654 - acc: 0.7214 - val_loss: 0.9944 - val_acc: 0.5117
Epoch 4/10
5366/5366 [==============================] - 0s - loss: 0.6604 - acc: 0.7227 - val_loss: 0.9967 - val_acc: 0.5106
Epoch 5/10
5366/5366 [==============================] - 0s - loss: 0.6559 - acc: 0.7259 - val_loss: 1.0000 - val_acc: 0.5095
Epoch 6/10
5366/5366 [==============================] - 0s - loss: 0.6515 - acc: 0.7287 - val_loss: 1.0021 - val_acc: 0.5100
Epoch 7/10
5366/5366 [==============================] - 0s - loss: 0.6473 - acc: 0.7296 - val_loss: 1.0050 - val_acc: 0.5106
Epoch 8/10
5366/5366 [==============================] - 0s - loss: 0.6433 - acc: 0.7324 - val_loss: 1.0087 - val_acc: 0.5106
Epoch 9/10
5366/5366 [==============================] - 0s - loss: 0.6395 - acc: 0.7316 - val_loss: 1.0112 - val_acc: 0.5106
Epoch 10/10
5366/5366 [==============================] - 0s - loss: 0.6360 - acc: 0.7374 - val_loss: 1.0154 - val_acc: 0.5083
Out[11]:
<keras.callbacks.History at 0x7f9c6fe7ac90>
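
To compare the softmax baseline directly with the scikit-learn models below, its predictions can be scored with the same classification report (sketch; probs and pred_labels are names introduced here, and lb.classes_ holds the label order used by the binarizer):

In [ ]:
# Convert the softmax outputs back to label strings and score them.
probs = model.predict(X_test, batch_size=128)
pred_labels = lb.classes_[probs.argmax(axis=1)]
print classification_report(y_text_test, pred_labels)
print confusion_matrix(y_text_test, pred_labels)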

In [12]:
# index_char.txt is tab-separated: column 0 = character, column 1 = its integer index.
char_df = pd.read_csv("index_char.txt", sep="\t", header=None, quoting=3)
char_dict = dict(k for k in char_df.values.tolist())  # char -> index
index_char = list(k[0] for k in char_df.values.tolist())  # index -> char
print len(char_dict), len(index_char)
char_vocab_size = len(char_dict) + 2  # 0 for padding and 1 for OOV
maxlen = 140  # maximum tweet length in characters
label_dict = dict((k[1], k[0]) for k in enumerate(["positive", "negative", "neutral"]))  # label -> class index
embedding_size = 10  # dimensionality of each character embedding


95 95
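
The same offset convention is used when tweets are encoded later on: index 0 is reserved for padding, 1 for characters outside the 95-character vocabulary, and every known character is shifted up by 2. A small illustration (sketch; the example string is arbitrary):

In [ ]:
# Encode a short string with the padding/OOV offset used throughout this notebook.
example = "Nice day :)"
print [char_dict.get(c, -1) + 2 for c in example]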

In [34]:
# Multinomial logistic regression baseline on the same binary n-gram features.
logit = LogisticRegression(multi_class="multinomial", solver="lbfgs")
logit.fit(X_train, y_text_train)


Out[34]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0)

In [35]:
y_pred = logit.predict(X_test)
# Note: sklearn's convention is (y_true, y_pred); passing predictions first swaps the per-class
# precision/recall columns, makes support count predictions, and makes confusion-matrix rows the
# predictions (micro averages are unaffected).
print classification_report(y_pred, y_text_test)
print precision_recall_fscore_support(y_pred, y_text_test, average="micro")
print confusion_matrix(y_pred, y_text_test)


             precision    recall  f1-score   support

   negative       0.18      0.37      0.25       175
    neutral       0.50      0.45      0.47       752
   positive       0.65      0.57      0.61       871

avg / total       0.54      0.50      0.52      1798

(0.50166852057842048, 0.50166852057842048, 0.50166852057842048, None)
[[ 65  71  39]
 [187 339 226]
 [101 272 498]]
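
The micro average above weights every tweet equally, so it is dominated by the larger classes. A macro average over the three classes, computed with the conventional (y_true, y_pred) order, gives a complementary view (sketch):

In [ ]:
# Macro-averaged precision / recall / F1 for the logistic regression baseline.
print precision_recall_fscore_support(y_text_test, y_pred, average="macro")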

In [38]:
# Linear SVM baseline (Crammer-Singer multi-class formulation); report arguments in the same order as above.
svc = LinearSVC(multi_class="crammer_singer")
svc.fit(X_train, y_text_train)
y_pred = svc.predict(X_test)
print classification_report(y_pred, y_text_test)
print precision_recall_fscore_support(y_pred, y_text_test, average="micro")
print confusion_matrix(y_pred, y_text_test)


             precision    recall  f1-score   support

   negative       0.22      0.31      0.26       250
    neutral       0.46      0.45      0.46       699
   positive       0.63      0.57      0.60       849

avg / total       0.51      0.48      0.49      1798

(0.4849833147942158, 0.4849833147942158, 0.4849833147942158, None)
[[ 77  99  74]
 [175 315 209]
 [101 268 480]]

In [32]:
# Random forest baseline; class_weight="auto" is the older scikit-learn name for balanced class weighting.
randF = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=1337, warm_start=True, class_weight="auto")
randF.fit(X_train, y_text_train)

# The shapes below reflect an earlier run with a smaller vectorizer vocabulary (1394 features).
print X_train.shape, X_test.shape
y_pred = randF.predict(X_test)
print classification_report(y_pred, y_text_test)
print precision_recall_fscore_support(y_pred, y_text_test, average="micro")
print confusion_matrix(y_pred, y_text_test)


(5366, 1394) (1798, 1394)
             precision    recall  f1-score   support

   negative       0.12      0.37      0.18       110
    neutral       0.39      0.43      0.41       624
   positive       0.67      0.48      0.56      1064

avg / total       0.54      0.46      0.49      1798

(0.45717463848720802, 0.45717463848720802, 0.45717463848720802, None)
[[ 41  43  26]
 [134 267 223]
 [178 372 514]]

In [ ]:
# Scratch cell (not executed): predict requires the feature matrix.
y_pred = randF.predict(X_test)

In [ ]:
# Scratch cell (not executed): re-create the classifier before re-fitting.
randF = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=1337)

In [16]:
def get_data(filename, char_dict=char_dict, idx=[1, 2], label_dict=None, padding=140):
    """
    Read a tab-separated gold file and encode each tweet as a padded sequence of character indices.

    params:
    idx = [label_idx, text_idx]  # column positions of the label and the tweet text
    label_dict = label -> class index; if given, labels are returned one-hot, otherwise cast to int
    """
    X = []
    Y = []
    with open(filename) as fp:
        for line in fp:
            line = line.rstrip("\n").split("\t")
            # Offset every index by 2: 0 is reserved for padding, 1 for out-of-vocabulary characters.
            X.append([char_dict.get(k, -1) + 2 for k in line[idx[1]]])
            if label_dict is not None:
                y = [0] * len(label_dict)
                y[label_dict.get(line[idx[0]])] = 1
            else:
                y = int(line[idx[0]])
            Y.append(y)
    # Pad / truncate every sequence to a fixed length for the Embedding layer.
    X = pad_sequences(X, maxlen=padding)
    return X, Y

In [17]:
X, Y = get_data("data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt", label_dict=label_dict)
X_test, Y_test = get_data("data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt", label_dict=label_dict)

In [18]:
# Character-level CNN: embed each character, convolve over the sequence, pool, flatten, classify.
model = Sequential()
model.add(Embedding(char_vocab_size, embedding_size, input_length=maxlen))
model.add(Dropout(0.5))
# 10 convolutional filters, each spanning 50 consecutive characters.
model.add(Convolution1D(nb_filter=10, filter_length=50, border_mode='valid', activation='relu', subsample_length=1))
model.add(MaxPooling1D(pool_length=2))
# Disabled experiment: extra fully connected layers between pooling and the softmax.
"""
model.add(Dropout(0.25))

model.add(Dense(100, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(50, activation="relu"))
"""
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(len(label_dict), activation="softmax"))
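
A quick arithmetic check of the shapes this stack produces, assuming Keras' default pooling stride equal to pool_length (sketch):

In [ ]:
# 'valid' convolution keeps maxlen - filter_length + 1 positions; pooling halves that.
conv_steps = maxlen - 50 + 1      # 140 - 50 + 1 = 91
pooled_steps = conv_steps // 2    # 45
flattened = pooled_steps * 10     # 45 positions x 10 filters = 450 inputs to the softmax
print conv_steps, pooled_steps, flattened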

In [19]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
model.fit(X, Y, validation_data=(X_test, Y_test), batch_size=128, nb_epoch=10, show_accuracy=True)


Train on 6000 samples, validate on 2000 samples
Epoch 1/10
6000/6000 [==============================] - 3s - loss: 1.0097 - acc: 0.5030 - val_loss: 1.0628 - val_acc: 0.4220
Epoch 2/10
6000/6000 [==============================] - 3s - loss: 0.9870 - acc: 0.5155 - val_loss: 1.0588 - val_acc: 0.4220
Epoch 3/10
6000/6000 [==============================] - 3s - loss: 0.9804 - acc: 0.5160 - val_loss: 1.0627 - val_acc: 0.4220
Epoch 4/10
6000/6000 [==============================] - 3s - loss: 0.9739 - acc: 0.5175 - val_loss: 1.0503 - val_acc: 0.4220
Epoch 5/10
6000/6000 [==============================] - 3s - loss: 0.9653 - acc: 0.5170 - val_loss: 1.0688 - val_acc: 0.4220
Epoch 6/10
6000/6000 [==============================] - 3s - loss: 0.9569 - acc: 0.5185 - val_loss: 1.0476 - val_acc: 0.4270
Epoch 7/10
6000/6000 [==============================] - 3s - loss: 0.9507 - acc: 0.5277 - val_loss: 1.0546 - val_acc: 0.4310
Epoch 8/10
6000/6000 [==============================] - 3s - loss: 0.9457 - acc: 0.5275 - val_loss: 1.0461 - val_acc: 0.4380
Epoch 9/10
6000/6000 [==============================] - 3s - loss: 0.9403 - acc: 0.5372 - val_loss: 1.0480 - val_acc: 0.4325
Epoch 10/10
6000/6000 [==============================] - 3s - loss: 0.9380 - acc: 0.5345 - val_loss: 1.0487 - val_acc: 0.4345
Out[20]:
<keras.callbacks.History at 0x7f9c34267890>

In [ ]:
# The first weight matrix of the model is the Embedding layer's table, shape (char_vocab_size, embedding_size).
embeddings = model.get_weights()[0]

In [ ]:
embeddings

In [ ]:


In [ ]:
# Heatmap of the learned 10-dimensional character embeddings, one row per vocabulary entry.
plt.clf()
plt.figure(figsize=(10, 20))
plt.imshow(embeddings)
plt.yticks(range(char_vocab_size), ["pad", "oov"] + index_char)
plt.ylabel("Chars")

In [ ]:
# Check which upper-case ASCII letters are present in the character vocabulary.
a_id = ord('A')
z_id = ord('Z')
for i in range(a_id, z_id + 1):
    print i, chr(i), chr(i) in index_char

In [ ]:
# Map each vocabulary entry (including pad and oov) to its embedding vector.
char_embed = dict((k, e) for k, e in zip(["pad", "oov"] + index_char, embeddings.tolist()))

In [ ]:
# Re-order the rows alphabetically so related characters sit next to each other in the heatmap.
sorted_keys = sorted(char_embed.keys())
sort_embeddings = np.array([char_embed.get(k) for k in sorted_keys])

In [ ]:
# Same heatmap as above, with rows sorted by character.
plt.clf()
plt.figure(figsize=(10, 20))
plt.imshow(sort_embeddings)
plt.yticks(range(len(sorted_keys)), sorted_keys)
plt.ylabel("Chars")

In [ ]: