import numpy as np
np.random.seed(113) #set seed before any keras import
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from collections import defaultdict
from keras.preprocessing import sequence
from collections import Counter

First we load the dataset. I built a single csv file that is easy to load and handle using pandas. The original label tagset of TwiSty consists of 16 different labels, however I am going to transform it into a binary classification problem: I will only predict the I(introvert) and E(extrovert) personality traits.

The dataset is huge and it require a lot of RAM: for this reason I decided to trim the maximum number of characters of text per author to 1000. This means that the toy will be a toy model.

corpus = pd.read_csv('twistytest.csv', 
                     names=['user_id', 'lang', 'text', 'mbti'])

#here we limit the corpus size. The SVM with all the text can learn somethign
corpus.text = corpus.text.apply(lambda x: x) 
corpus.mbti = corpus.mbti.apply(lambda x: x[0])

#corpus = tmp.sample(frac=1, random_state=seed)
e = corpus[corpus.mbti.apply(lambda x: x == 'E')]
i = corpus[corpus.mbti.apply(lambda x: x == 'I')].sample(226)
corpus = pd.concat([e,i]).sample(frac=1, random_state=seed)

## set max length of doc per author
sentences = corpus.text#.apply(lambda x: x[:100000])
## trim labels: convert problem to binary classification I vs E
labels = corpus.mbti

## make sure we have a label for every data instance
np.random.seed(113) #seed
data['target']= np.random.permutation(labels)
np.random.seed(113) # use same seed!
data['data'] = np.random.permutation(sentences)

(452, 4)

# preview the dataset

(452, 4)
user_id lang text mbti
544 113022898 de ['@Stoffnuss Ich glaube Emil wird nicht lange ... E
280 108731189 it ['', "AGE SEMPRE PIU' PR... E
848 313458775 de ['@Jaqi_Metalbraut wh00t? Ich dacht Endivie wä... I
324 531664179 it ["#Ironman3 in one of the most glorious movies... I
483 1166332766 it ['La tecnologia mi odia.', "Io stamattina: ogg... I

# plot the distribution of labels

import matplotlib.pyplot as plt

l, v = zip(*Counter(y_train).items())
indexes = np.arange(len(l))
width = 1, v, width, color=['r', 'b'])
plt.xticks(indexes + width * 0.5, l)

#split the data into train, dev, test

X_rest, X_test, y_rest, y_test = train_test_split(data['data'], data['target'], test_size=0.2)
X_train, X_dev, y_train, y_dev = train_test_split(X_rest, y_rest, test_size=0.2)
del X_rest, y_rest
print("#train instances: {} #dev: {} #test: {}".format(len(X_train),len(X_dev),len(X_test)))

#train instances: 288 #dev: 73 #test: 91

# compute random baseline per class


Test and dev set label distribution

# test
l, v = zip(*Counter(y_test).items())
indexes = np.arange(len(l))
width = 1, v, width, color=['r', 'b'])
plt.xticks(indexes + width * 0.5, l)

# dev
l, v = zip(*Counter(y_dev).items())
indexes = np.arange(len(l))
width = 1, v, width, color=['r', 'b'])
plt.xticks(indexes + width * 0.5, l)


For the baseline we use an SVM with a sparse feature representation.

We use both character- and word-ngrams.

from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

pipeline = Pipeline([('features', FeatureUnion([('wngram', TfidfVectorizer(ngram_range=(1,2))),
                                                ('cngram', TfidfVectorizer(analyzer='char'))])),
                     ('cls', DummyClassifier())]), y_train)
predictions = pipeline.predict(X_dev)
accuracy_score(predictions, y_dev)

print(classification_report(predictions, y_dev))

testpredictions = pipeline.predict(X_test)
print(accuracy_score(testpredictions, y_test))

print(classification_report(testpredictions, y_test))

             precision    recall  f1-score   support

          E       0.45      0.65      0.53        31
          I       0.62      0.43      0.51        42

avg / total       0.55      0.52      0.52        73

             precision    recall  f1-score   support

          E       0.47      0.45      0.46        44
          I       0.50      0.51      0.51        47

avg / total       0.48      0.48      0.48        91

from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

pipeline = Pipeline([('features', FeatureUnion([('wngram', TfidfVectorizer(ngram_range=(1,2))),
                                                ('cngram', TfidfVectorizer(analyzer='char'))])),
                     ('cls', LinearSVC())]), y_train)
predictions = pipeline.predict(X_dev)
accuracy_score(predictions, y_dev)

print(classification_report(predictions, y_dev))

             precision    recall  f1-score   support

          E       0.64      0.68      0.66        41
          I       0.55      0.50      0.52        32

avg / total       0.60      0.60      0.60        73


The SVM works quite well already: we outperform the random baseline by a signficant margin.

testpred = pipeline.predict(X_test)
print(accuracy_score(testpred, y_test))
print(classification_report(testpred, y_test))

             precision    recall  f1-score   support

          E       0.67      0.67      0.67        43
          I       0.71      0.71      0.71        48

avg / total       0.69      0.69      0.69        91

Neural network

First we have to encode the labels in the one-hot format. Since this is a binary classification format, we don't convert them to a categorical format.

from keras.utils import np_utils
y2i = defaultdict(lambda: len(y2i))
y_train_num = [y2i[mbti] for mbti in y_train]
y_dev_num = [y2i[mbti] for mbti in y_dev]
y_test_num = [y2i[mbti] for mbti in y_test]
num_classes = len(np.unique(y_train_num))


Text representation

For the baseline we used a one-hot encoding. For our neural model we are going to represent the text using a dense representation. We will be building it from characters.

from collections import defaultdict

# convert words to indices, taking care of UNKs
def get_characters(sentence, c2i):
    out = []
    for word in sentence.split(" "):
        chars = []
        for c in word:
    return out

c2i = defaultdict(lambda: len(c2i))

PAD = c2i["<pad>"] # index 0 is padding
UNK = c2i["<unk>"] # index 1 is for UNK
X_train_num = [get_characters(sentence, c2i) for sentence in X_train]
c2i = defaultdict(lambda: UNK, c2i) # freeze - cute trick!
X_dev_num = [get_characters(sentence, c2i) for sentence in X_dev]
X_test_num = [get_characters(sentence, c2i) for sentence in X_test]

max_sentence_length=max([len(s.split(" ")) for s in X_train] 
                        + [len(s.split(" ")) for s in X_dev] 
                        + [len(s.split(" ")) for s in X_test] )
max_word_length = max([len(word)  for sentence in X_train_num for word in sentence])

### we need both max sent and word length
print(X_train_num[0:2]) # example how the first two sentences are encoded

def pad_words(tensor_words, max_word_len, pad_symbol_id, max_sent_len=None):
    pad character list all to same word length
    padded = []
    for words in tensor_words:
        if max_sent_len: #pad all to same sentence length (insert empty word list)
            words = [[[0]]*(max_sent_len-len(words))+ words][0] #prepending empty words
        padded.append(sequence.pad_sequences(words, maxlen=max_word_len, value=pad_symbol_id))
    return np.array(padded)

X_train_pad_char = pad_words(X_train_num, max_word_length, 0, max_sent_len=max_sentence_length)
X_dev_pad_char = pad_words(X_dev_num, max_word_length, 0, max_sent_len=max_sentence_length)
X_test_pad_char = pad_words(X_test_num, max_word_length, 0, max_sent_len=max_sentence_length)

(57, 893, 139)

from keras.models import Model, Sequential
from keras.layers import Dense, Input, GRU, TimeDistributed, Embedding, Bidirectional
import keras

Composing words only out of characters

Instead of using a separate word embedding matrix, compose words through characters (see

max_chars = len(c2i)
char_vocab_size = len(c2i)

## lower-level character LSTM
input_chars = Input(shape=(max_sentence_length, max_word_length), name='main_input')

embedded_chars = TimeDistributed(Embedding(char_vocab_size, c_dim,
                                         input_length=max_word_length), name='char_embedding')(input_chars)
char_lstm = TimeDistributed(Bidirectional(GRU(c_h_dim)), name='GRU_on_char')(embedded_chars)

word_lstm_from_char = Bidirectional(GRU(w_h_dim), name='GRU_on_words')(char_lstm)

# And add a prediction node on top
predictions = Dense(1, activation='relu', name='output_layer')(word_lstm_from_char)

model = Model(inputs=input_chars, outputs=predictions)

model.compile(loss='binary_crossentropy', optimizer='adam',


Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 893, 139)          0         
char_embedding (TimeDistribu (None, 893, 139, 50)      32900     
GRU_on_char (TimeDistributed (None, 893, 64)           15936     
GRU_on_words (Bidirectional) (None, 64)                18624     
output_layer (Dense)         (None, 1)                 65        
Total params: 67,525
Trainable params: 67,525
Non-trainable params: 0

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

plot_model(model, to_file='model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))

G 140581090833856 main_input: InputLayer 140581090396256 char_embedding(embedding_5): TimeDistributed(Embedding) 140581090833856->140581090396256 140581090488272 GRU_on_char(bidirectional_5): TimeDistributed(Bidirectional) 140581090396256->140581090488272 140581087596496 GRU_on_words(gru_7): Bidirectional(GRU) 140581090488272->140581087596496 140581090558024 output_layer: Dense 140581087596496->140581090558024

In [48]:, y_train_num, epochs=10, batch_size=8)

Epoch 1/10
57/57 [==============================] - 98s - loss: 8.7660 - acc: 0.4561    
Epoch 2/10
57/57 [==============================] - 98s - loss: 8.7660 - acc: 0.4561    
Epoch 3/10
57/57 [==============================] - 98s - loss: 8.7660 - acc: 0.4561    
Epoch 4/10
57/57 [==============================] - 98s - loss: 8.7660 - acc: 0.4561    
Epoch 5/10
57/57 [==============================] - 98s - loss: 8.7660 - acc: 0.4561    
Epoch 6/10
57/57 [==============================] - 99s - loss: 8.7660 - acc: 0.4561    
Epoch 7/10
57/57 [==============================] - 99s - loss: 8.7660 - acc: 0.4561    
Epoch 8/10
57/57 [==============================] - 99s - loss: 8.7660 - acc: 0.4561    
Epoch 9/10
57/57 [==============================] - 99s - loss: 8.7660 - acc: 0.4561    
Epoch 10/10
57/57 [==============================] - 99s - loss: 8.7660 - acc: 0.4561    
<keras.callbacks.History at 0x7fdd504c9b00>

loss, accuracy = model.evaluate(X_dev_pad_char, y_dev_num)

15/15 [==============================] - 9s

loss, accuracy = model.evaluate(X_test_pad_char, y_test_num)

18/18 [==============================] - 11s

The model did not learn anything at all: the performance reflects the label distribution in the test set. When trained with enough data, the SVM can learn and solve to some extent the problem. The neural model needs a lot more data.