In [1]:
# Keras 0.x/1.x-era imports (the Convolution1D / nb_epoch / show_accuracy API)
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.preprocessing.sequence import pad_sequences
In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
In [37]:
train_file = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
test_file = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
df_train = pd.read_csv(train_file, header=None, sep="\t")
df_test = pd.read_csv(test_file, header=None, sep="\t")
# Drop rows whose tweet text could not be downloaded
NULL_TEXT = "Not Available"
df_text_train = df_train[df_train[2] != NULL_TEXT][[1, 2]]
df_text_test = df_test[df_test[2] != NULL_TEXT][[1, 2]]
# Column 2 is the tweet text, column 1 the sentiment label
X_text_train, y_text_train = df_text_train[2], df_text_train[1]
X_text_test, y_text_test = df_text_test[2], df_text_test[1]
# Vectorize train + test together so both splits share one feature space
X_text = X_text_train.tolist() + X_text_test.tolist()
vectorizer = CountVectorizer(min_df=3, ngram_range=(1, 3), binary=True)
X = vectorizer.fit_transform(X_text).todense()
Y_text = y_text_train.tolist() + y_text_test.tolist()
lb = LabelBinarizer()
Y = lb.fit_transform(Y_text)
print X.shape, X[:2]
# Split the stacked matrix back into the original train/test partitions
X_train, Y_train = X[:X_text_train.shape[0]], Y[:X_text_train.shape[0]]
X_test, Y_test = X[X_text_train.shape[0]:], Y[X_text_train.shape[0]:]
print X_train.shape, Y_train.shape, X_test.shape, Y_test.shape
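A quick sanity check, added here as a sketch (not part of the original run): `lb.classes_` gives the column order of the one-hot label matrix, and `vectorizer.vocabulary_` / `get_feature_names()` are the standard scikit-learn attributes for inspecting the fitted n-gram features.
In [ ]:
# Sanity check: label column order from LabelBinarizer,
# plus a small sample of the learned n-gram vocabulary.
print lb.classes_
print len(vectorizer.vocabulary_)
print vectorizer.get_feature_names()[:10]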
In [5]:
model = Sequential()
# A single softmax layer over the bag-of-n-grams features,
# i.e. multinomial logistic regression with 3 output classes
model.add(Dense(3, input_dim=X.shape[1], activation="softmax"))
In [6]:
model.compile(loss="categorical_crossentropy", optimizer="adam")
In [11]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=128, show_accuracy=True)
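As a follow-up sketch (assuming the Keras 0.x `predict_classes` helper on `Sequential`), the softmax outputs can be mapped back to label strings via the fitted `LabelBinarizer`:
In [ ]:
# predict_classes returns argmax class indices; lb.classes_ gives
# the column order used by LabelBinarizer above.
pred_idx = model.predict_classes(X_test, batch_size=128)
pred_labels = [lb.classes_[i] for i in pred_idx]
print pred_labels[:10]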
In [12]:
char_df = pd.read_csv("index_char.txt", sep="\t", header=None, quoting=3)
# Column 0 is the character, column 1 its integer index
char_dict = dict(k for k in char_df.values.tolist())
index_char = list(k[0] for k in char_df.values.tolist())
print len(char_dict), len(index_char)
char_vocab_size = len(char_dict) + 2  # 0 for padding and 1 for OOV
maxlen = 140  # Tweet length
label_dict = dict((k[1], k[0]) for k in enumerate(["positive", "negative", "neutral"]))
embedding_size = 10
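A small illustrative cell (not in the original; the sample string is made up) showing how one string maps to padded character indices under this scheme, with the same +2 offset used later in `get_data`:
In [ ]:
# Hypothetical example: encode one string with the char_dict offsets
# (index + 2; 0 is reserved for padding, 1 for OOV characters).
sample = "Great match today!"
encoded = [char_dict.get(c, -1) + 2 for c in sample]
print pad_sequences([encoded], maxlen=maxlen)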
In [34]:
logit = LogisticRegression(multi_class="multinomial", solver="lbfgs")
logit.fit(X_train, y_text_train)
In [35]:
y_pred = logit.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred) in that order
print classification_report(y_text_test, y_pred)
print precision_recall_fscore_support(y_text_test, y_pred, average="micro")
print confusion_matrix(y_text_test, y_pred)
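A hedged extra (assuming the standard `coef_` layout of scikit-learn's multinomial LogisticRegression, one row per class in `logit.classes_` order) to inspect the highest-weighted n-grams per class:
In [ ]:
# Top-weighted n-grams per class; coef_ rows follow logit.classes_.
feature_names = np.array(vectorizer.get_feature_names())
for cls, coefs in zip(logit.classes_, logit.coef_):
    top = np.argsort(coefs)[-5:][::-1]
    print cls, feature_names[top].tolist()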
In [38]:
svc = LinearSVC(multi_class="crammer_singer")
svc.fit(X_train, y_text_train)
y_pred = svc.predict(X_test)
print classification_report(y_text_test, y_pred)
print precision_recall_fscore_support(y_text_test, y_pred, average="micro")
print confusion_matrix(y_text_test, y_pred)
In [32]:
# class_weight="auto" was renamed to "balanced" in scikit-learn 0.17
randF = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=1337, warm_start=True, class_weight="balanced")
randF.fit(X_train, y_text_train)
print X_train.shape, X_test.shape
y_pred = randF.predict(X_test)
print classification_report(y_text_test, y_pred)
print precision_recall_fscore_support(y_text_test, y_pred, average="micro")
print confusion_matrix(y_text_test, y_pred)
In [16]:
def get_data(filename, char_dict=char_dict, idx=[1, 2], label_dict=None, padding=140):
    """
    params:
        idx = [label_idx, text_idx, other_idx]
    """
    X = []
    Y = []
    with open(filename) as fp:
        for line in fp:
            line = line.rstrip("\n").split("\t")
            # Add offset of 2: 0 is reserved for padding, 1 for OOV characters
            X.append([char_dict.get(k, -1) + 2 for k in line[idx[1]]])
            if label_dict is not None:
                # One-hot encode the label
                y = [0] * len(label_dict)
                y[label_dict.get(line[idx[0]])] = 1
            else:
                y = int(line[idx[0]])
            Y.append(y)
    X = pad_sequences(X, maxlen=padding)
    return X, np.array(Y)
In [17]:
X, Y = get_data("data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt", label_dict=label_dict)
X_test, Y_test = get_data("data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt", label_dict=label_dict)
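A quick shape check, added as a sketch, to confirm the padded character matrices and one-hot labels line up before training:
In [ ]:
# Both X matrices should be (n_samples, 140); Y one-hot with 3 columns.
print X.shape, Y.shape, X_test.shape, Y_test.shape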
In [18]:
model = Sequential()
model.add(Embedding(char_vocab_size, embedding_size, input_length=maxlen))
model.add(Dropout(0.5))
model.add(Convolution1D(nb_filter=10, filter_length=50, border_mode='valid', activation='relu', subsample_length=1))
model.add(MaxPooling1D(pool_length=2))
# Alternative deeper stack (disabled):
# model.add(Dropout(0.25))
# model.add(Dense(100, activation="relu"))
# model.add(Dropout(0.5))
# model.add(Dense(50, activation="relu"))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(len(label_dict), activation="softmax"))
In [19]:
model.compile(loss='categorical_crossentropy',optimizer='adam')
In [20]:
model.fit(X,Y, validation_data=(X_test, Y_test), batch_size=128, nb_epoch=10, show_accuracy=True)
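As with the bag-of-n-grams model, a small hedged sketch (again assuming the Keras 0.x `predict_classes` helper) to map the CNN's predicted class indices back to label strings via the `label_dict` built above:
In [ ]:
# Invert label_dict (label -> index) to recover label strings.
index_label = dict((v, k) for k, v in label_dict.items())
pred_idx = model.predict_classes(X_test, batch_size=128)
print [index_label[i] for i in pred_idx[:10]]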
In [ ]:
# The first weight matrix is the Embedding layer's character lookup table
embeddings = model.get_weights()[0]
In [ ]:
embeddings
In [ ]:
plt.clf()
plt.figure(figsize=(10, 20))
plt.imshow(embeddings)
# One row per embedding: padding, OOV, then the vocabulary characters
plt.yticks(range(char_vocab_size), ["pad", "oov"] + index_char)
plt.ylabel("Chars")
In [ ]:
a_id = ord('A')
z_id = ord('Z')
# Check which uppercase ASCII letters appear in the character vocabulary
for i in range(a_id, z_id + 1):
    print i, chr(i), chr(i) in index_char
In [ ]:
char_embed = dict((k, e) for k, e in zip(["pad", "oov"] + index_char, embeddings.tolist()))
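An illustrative extra (a sketch, not from the original notebook, assuming the query character is in the vocabulary) that ranks characters by cosine similarity of their learned embeddings, e.g. to see whether 'a' and 'A' land near each other:
In [ ]:
# Cosine similarity between one character's embedding and all others.
def nearest_chars(c, k=5):
    v = np.array(char_embed[c])
    sims = {}
    for other, e in char_embed.items():
        e = np.array(e)
        sims[other] = np.dot(v, e) / (np.linalg.norm(v) * np.linalg.norm(e) + 1e-8)
    return sorted(sims.items(), key=lambda kv: -kv[1])[:k]

print nearest_chars('a')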
In [ ]:
sorted_keys = sorted(char_embed.keys())
sort_embeddings = np.array([char_embed.get(k) for k in sorted_keys])
In [ ]:
plt.clf()
plt.figure(figsize=(10,20))
plt.imshow(sort_embeddings)
plt.yticks(range(len(sorted_keys)), sorted_keys)
plt.ylabel("Chars")
In [ ]: