In [1]:
import numpy as np
np.random.seed(113) #set seed before any keras import
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from collections import defaultdict
from keras.preprocessing import sequence
from collections import Counter
import pydot
In [2]:
seed=0
corpus = pd.read_csv('twistytest.csv',
index_col=0,
header=1,
names=['user_id', 'lang', 'text', 'mbti'])
corpus.sample(5)
Out[2]:
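The next cell keeps all E authors but downsamples the I authors to 226, so it is worth checking how skewed the raw label distribution is first (a quick sketch using the columns defined above):
In [ ]:
# sanity check: how many authors per MBTI first letter (I vs E) before balancing
print(corpus.mbti.apply(lambda x: x[0]).value_counts())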
In [13]:
# limit the corpus size: keep only the first 1000 characters of text per author
# (the SVM can already learn something even with all the text)
corpus.text = corpus.text.apply(lambda x: x[:1000])
# keep only the first MBTI letter, turning the task into binary classification: I vs E
corpus.mbti = corpus.mbti.apply(lambda x: x[0])
#corpus = tmp.sample(frac=1, random_state=seed)
# balance the classes: keep all E authors and downsample the (more frequent) I authors to 226
e = corpus[corpus.mbti.apply(lambda x: x == 'E')]
i = corpus[corpus.mbti.apply(lambda x: x == 'I')].sample(226, random_state=seed)
corpus = pd.concat([e, i]).sample(frac=0.3, random_state=seed)
print(corpus.shape)
## documents: one text per author (a per-document length cap is left commented out)
sentences = corpus.text  #.apply(lambda x: x[:100000])
## labels: the first MBTI letter, i.e. I vs E
labels = corpus.mbti
## make sure we have a label for every data instance
assert(len(sentences)==len(labels))
data={}
np.random.seed(113) #seed
data['target']= np.random.permutation(labels)
np.random.seed(113) # use same seed!
data['data'] = np.random.permutation(sentences)
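Re-seeding with the same value before each permutation makes both shuffles identical, so texts and labels stay aligned. An equivalent, more explicit variant (a sketch, not what the notebook uses) draws a single permutation index and applies it to both arrays:
In [ ]:
# one shared permutation index keeps texts and labels aligned by construction
perm = np.random.RandomState(113).permutation(len(labels))
data['data'] = sentences.values[perm]
data['target'] = labels.values[perm]
assert len(data['data']) == len(data['target'])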
In [14]:
# preview the dataset
print(corpus.shape)
corpus.head()
Out[14]:
In [15]:
# plot the distribution of labels
import matplotlib.pyplot as plt
l, v = zip(*Counter(data['target']).items())  # use the shuffled labels; the train/dev/test split only happens in the next cell
indexes = np.arange(len(l))
width = 1
plt.bar(indexes, v, width, color=['r', 'b'])
plt.xticks(indexes + width * 0.5, l)
plt.show()
In [16]:
#split the data into train, dev, test
X_rest, X_test, y_rest, y_test = train_test_split(data['data'], data['target'], test_size=0.2)
X_train, X_dev, y_train, y_dev = train_test_split(X_rest, y_rest, test_size=0.2)
del X_rest, y_rest
print("#train instances: {} #dev: {} #test: {}".format(len(X_train),len(X_dev),len(X_test)))
In [ ]:
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
pipeline = Pipeline([('features', FeatureUnion([('wngram', TfidfVectorizer(ngram_range=(1, 2))),
                                                ('cngram', TfidfVectorizer(analyzer='char'))])),
                     ('cls', LinearSVC())])
pipeline.fit(X_train, y_train)
In [12]:
testpred = pipeline.predict(X_test)
print(accuracy_score(y_test, testpred))
print(classification_report(y_test, testpred))
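DummyClassifier is imported above but never used; a majority-class baseline (a sketch) gives a floor against which to read the SVM accuracy:
In [ ]:
# majority-class baseline for comparison
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
print("majority-class baseline accuracy:", accuracy_score(y_test, dummy.predict(X_test)))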
In [18]:
from keras.utils import np_utils
y2i = defaultdict(lambda: len(y2i))
y_train_num = [y2i[mbti] for mbti in y_train]
y_dev_num = [y2i[mbti] for mbti in y_dev]
y_test_num = [y2i[mbti] for mbti in y_test]
num_classes = len(np.unique(y_train_num))
print(num_classes)
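Because the network below ends in a single sigmoid unit, the labels only need to be 0/1 integers. A quick, optional check confirms the mapping the defaultdict produced and the class balance in the training labels:
In [ ]:
# inspect the label-to-index mapping (e.g. {'I': 0, 'E': 1}, depending on which label is seen first)
print(dict(y2i))
print(Counter(y_train_num))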
In [19]:
from collections import defaultdict
# convert words to indices, taking care of UNKs
def get_characters(sentence, c2i):
    out = []
    for word in sentence.split(" "):
        chars = []
        for c in word:
            chars.append(c2i[c])
        out.append(chars)
    return out
c2i = defaultdict(lambda: len(c2i))
PAD = c2i["<pad>"] # index 0 is padding
UNK = c2i["<unk>"] # index 1 is for UNK
X_train_num = [get_characters(sentence, c2i) for sentence in X_train]
c2i = defaultdict(lambda: UNK, c2i)  # "freeze" the vocabulary: unseen characters now map to UNK instead of new indices
X_dev_num = [get_characters(sentence, c2i) for sentence in X_dev]
X_test_num = [get_characters(sentence, c2i) for sentence in X_test]
max_sentence_length = max([len(s.split(" ")) for s in X_train]
                          + [len(s.split(" ")) for s in X_dev]
                          + [len(s.split(" ")) for s in X_test])
max_word_length = max([len(word) for sentence in X_train_num for word in sentence])
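Because c2i has been "frozen", characters that never occurred in the training data map to UNK instead of receiving new index values. A tiny check (a sketch with a made-up character) illustrates this:
In [ ]:
# unseen characters fall back to UNK (index 1); no new index values are created
vocab_size = len(c2i)
example = get_characters("☃", c2i)   # a character assumed not to occur in the corpus
print(example)                       # -> [[1]], i.e. the UNK index
assert all(c < vocab_size for word in example for c in word)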
In [26]:
### we need both max sent and word length
print(max_sentence_length)
print(max_word_length)
print(X_train[0:2])          # the first two texts as raw strings
print(X_train_num[0][:100])  # how the first text is encoded (first 100 words, as character indices)
In [27]:
def pad_words(tensor_words, max_word_len, pad_symbol_id, max_sent_len=None):
    """
    Pad the character lists so that all words have the same length;
    if max_sent_len is given, also pad every sentence to the same number of words.
    """
    padded = []
    for words in tensor_words:
        if max_sent_len:  # pad all to the same sentence length by prepending empty words
            words = [[0]] * (max_sent_len - len(words)) + words
        padded.append(sequence.pad_sequences(words, maxlen=max_word_len, value=pad_symbol_id))
    return np.array(padded)
In [28]:
X_train_pad_char = pad_words(X_train_num, max_word_length, 0, max_sent_len=max_sentence_length)
X_dev_pad_char = pad_words(X_dev_num, max_word_length, 0, max_sent_len=max_sentence_length)
X_test_pad_char = pad_words(X_test_num, max_word_length, 0, max_sent_len=max_sentence_length)
In [29]:
X_train_pad_char.shape
Out[29]:
In [30]:
from keras.models import Model, Sequential
from keras.layers import Dense, Input, GRU, TimeDistributed, Embedding, Bidirectional
import keras
Instead of using a separate word embedding matrix, we compose word representations from their characters (see https://aclweb.org/anthology/W/W16/W16-4303.pdf).
In [31]:
batch_size=8
max_chars = len(c2i)
c_dim=50
c_h_dim=32
w_h_dim=32
char_vocab_size = len(c2i)
## lower-level character LSTM
input_chars = Input(shape=(max_sentence_length, max_word_length), name='main_input')
embedded_chars = TimeDistributed(Embedding(char_vocab_size, c_dim,
                                           input_length=max_word_length),
                                 name='char_embedding')(input_chars)
char_lstm = TimeDistributed(Bidirectional(GRU(c_h_dim)), name='GRU_on_char')(embedded_chars)
word_lstm_from_char = Bidirectional(GRU(w_h_dim), name='GRU_on_words')(char_lstm)
# add a prediction node on top: a single sigmoid unit for binary (I vs E) classification
predictions = Dense(1, activation='sigmoid', name='output_layer')(word_lstm_from_char)
In [32]:
model = Model(inputs=input_chars, outputs=predictions)
model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=['accuracy'])
model.summary()
In [33]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model
SVG(model_to_dot(model).create(prog='dot', format='svg'))
Out[33]:
In [34]:
model.fit(X_train_pad_char, y_train_num, epochs=10, batch_size=8)
Out[34]:
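The call above only monitors training loss and accuracy. A variant (a sketch, not part of the original run) validates on the dev split and stops early once the dev loss stops improving:
In [ ]:
# sketch: monitor the dev split during training and stop early when dev loss plateaus
from keras.callbacks import EarlyStopping
model.fit(X_train_pad_char, np.array(y_train_num),
          validation_data=(X_dev_pad_char, np.array(y_dev_num)),
          epochs=10, batch_size=8,
          callbacks=[EarlyStopping(monitor='val_loss', patience=2)])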
In [35]:
loss, accuracy = model.evaluate(X_test_pad_char, y_test_num)
In [36]:
print(accuracy)
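Accuracy alone hides the per-class behaviour. To compare with the SVM's classification report above, one can threshold the sigmoid outputs at 0.5 (a sketch, reusing the y2i mapping and sklearn's classification_report):
In [ ]:
# per-class evaluation of the character model
probs = model.predict(X_test_pad_char)
pred_num = (probs.flatten() > 0.5).astype(int)
i2y = {idx: label for label, idx in y2i.items()}
print(classification_report(y_test_num, pred_num, target_names=[i2y[0], i2y[1]]))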