Understanding Embeddings on Texts


In [1]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.2-understanding-recurrent-neural-networks.ipynb

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [4]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)


1.8.0

In [5]:
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
max_features = 1000  # number of words to consider as features
maxlen = 20  # keep only the last maxlen words of each review (see pad_sequences below)

# each review is encoded as a sequence of word indexes
# indexed by overall frequency in the dataset
# output is 0 (negative) or 1 (positive) 
imdb = tf.keras.datasets.imdb.load_data(num_words=max_features)
(raw_input_train, y_train), (raw_input_test, y_test) = imdb

In [6]:
# tf.keras.datasets.imdb.load_data?

In [7]:
y_train.min()


Out[7]:
0

In [8]:
y_train.max()


Out[8]:
1

In [9]:
# 25000 texts
len(raw_input_train)


Out[9]:
25000

In [10]:
# first text has 218 words
len(raw_input_train[0])


Out[10]:
218

In [11]:
raw_input_train[0]


Out[11]:
[1,
 14,
 22,
 16,
 43,
 530,
 973,
 2,
 2,
 65,
 458,
 2,
 66,
 2,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 2,
 2,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2,
 19,
 14,
 22,
 4,
 2,
 2,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 2,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2,
 2,
 16,
 480,
 66,
 2,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 2,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 2,
 15,
 256,
 4,
 2,
 7,
 2,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 2,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2,
 56,
 26,
 141,
 6,
 194,
 2,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 2,
 18,
 51,
 36,
 28,
 224,
 92,
 25,
 104,
 4,
 226,
 65,
 16,
 38,
 2,
 88,
 12,
 16,
 283,
 5,
 16,
 2,
 113,
 103,
 32,
 15,
 16,
 2,
 19,
 178,
 32]
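
The word indices can be mapped back to text. A minimal sketch using the word index shipped with the dataset (the offsets below are the load_data defaults: start_char=1, oov_char=2, index_from=3):

word_index = tf.keras.datasets.imdb.get_word_index()
# shift by 3 because indices 0-2 are reserved (padding, start of sequence, out-of-vocabulary)
index_to_word = {index + 3: word for word, index in word_index.items()}
decoded_review = ' '.join(index_to_word.get(i, '?') for i in raw_input_train[0])
print(decoded_review)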

In [12]:
# tf.keras.preprocessing.sequence.pad_sequences?

In [13]:
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

input_train = tf.keras.preprocessing.sequence.pad_sequences(raw_input_train, maxlen=maxlen)
input_test = tf.keras.preprocessing.sequence.pad_sequences(raw_input_test, maxlen=maxlen)
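
To see what the defaults do (padding='pre' and truncating='pre', i.e. both happen at the front), a tiny sketch on made-up sequences:

tf.keras.preprocessing.sequence.pad_sequences([[1, 2, 3], [4]], maxlen=2)
# -> array([[2, 3],
#           [0, 4]], dtype=int32)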

In [14]:
input_train.shape, input_test.shape, y_train.shape, y_test.shape


Out[14]:
((25000, 20), (25000, 20), (25000,), (25000,))

In [15]:
# left padded with zeros
# "0" is reserved for padding and does not stand for a specific word;
# words outside the top max_features were already replaced by the oov index (2 by default)
input_train[0]


Out[15]:
array([ 65,  16,  38,   2,  88,  12,  16, 283,   5,  16,   2, 113, 103,
        32,  15,  16,   2,  19, 178,  32])

We can use a randomly initialized embedding without any training


In [16]:
# tf.keras.layers.Embedding?

In [17]:
embedding_dim = 3

random_model = tf.keras.Sequential()
# Parameters: max_features * embedding_dim 
random_model.add(tf.keras.layers.Embedding(name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))

random_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 20, 3)             3000      
=================================================================
Total params: 3,000
Trainable params: 3,000
Non-trainable params: 0
_________________________________________________________________

In [18]:
random_model.predict(input_train[:1])


Out[18]:
array([[[0.07384551, 0.29555106, 0.3009336 ],
        [0.45920503, 0.16309488, 0.46034312],
        [0.39517987, 0.81220245, 0.6899681 ],
        [0.13280201, 0.5241946 , 0.01194322],
        [0.8166592 , 0.38502324, 0.5417099 ],
        [0.7756258 , 0.04511654, 0.13022017],
        [0.45920503, 0.16309488, 0.46034312],
        [0.64187396, 0.67822623, 0.95672846],
        [0.10987711, 0.07286727, 0.07477903],
        [0.45920503, 0.16309488, 0.46034312],
        [0.13280201, 0.5241946 , 0.01194322],
        [0.32317853, 0.99291134, 0.70348394],
        [0.71620905, 0.57316947, 0.35613513],
        [0.14302993, 0.59799194, 0.8499563 ],
        [0.00432396, 0.6946956 , 0.00477517],
        [0.45920503, 0.16309488, 0.46034312],
        [0.13280201, 0.5241946 , 0.01194322],
        [0.5677427 , 0.4795617 , 0.4080261 ],
        [0.16597784, 0.17524338, 0.966107  ],
        [0.14302993, 0.59799194, 0.8499563 ]]], dtype=float32)
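
An Embedding layer is just a lookup table: indexing its weight matrix directly should give the same vectors as the prediction above. A quick sketch:

# weight matrix of shape (max_features, embedding_dim) = (1000, 3), one row per word index
random_weights = random_model.get_layer('embedding').get_weights()[0]
# row lookup for the 20 word indices of the first review -> the same (20, 3) block as above
random_weights[input_train[0]]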

Training the embedding together with the rest of the model is usually more reasonable

Alternative: use pre-trained embeddings, e.g. trained with skip-gram (word2vec) or GloVe
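
A minimal sketch of the pre-trained route, assuming a hypothetical embedding_matrix of shape (max_features, embedding_dim) filled from pre-trained vectors (embedding_dim would have to match the size of those vectors):

pretrained_model = tf.keras.Sequential()
pretrained_model.add(tf.keras.layers.Embedding(
    name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen,
    weights=[embedding_matrix],  # hypothetical matrix built from pre-trained vectors
    trainable=False))            # freeze it, or set True to fine-tune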


In [19]:
embedding_dim = 3

model = tf.keras.Sequential()
# Parameters: max_features * embedding_dim 
model.add(tf.keras.layers.Embedding(name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))

# Output: maxlen * embedding_dim (20 * 3 = 60)
model.add(tf.keras.layers.Flatten(name='flatten'))

# binary classifier
model.add(tf.keras.layers.Dense(name='fc', units=32, activation='relu'))
model.add(tf.keras.layers.Dense(name='classifier', units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 20, 3)             3000      
_________________________________________________________________
flatten (Flatten)            (None, 60)                0         
_________________________________________________________________
fc (Dense)                   (None, 32)                1952      
_________________________________________________________________
classifier (Dense)           (None, 1)                 33        
=================================================================
Total params: 4,985
Trainable params: 4,985
Non-trainable params: 0
_________________________________________________________________
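
The parameter counts in the summary can be checked by hand:

embedding:  max_features * embedding_dim = 1000 * 3          = 3000
flatten:    no parameters, just reshapes (20, 3) into (60,)  =    0
fc:         60 inputs * 32 units + 32 biases                 = 1952
classifier: 32 inputs * 1 unit   + 1 bias                    =   33
total                                                        = 4985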

In [20]:
batch_size = 128

%time history = model.fit(input_train, y_train, epochs=10, batch_size=batch_size, validation_split=0.2)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 2s 98us/step - loss: 0.6993 - acc: 0.5020 - val_loss: 0.6973 - val_acc: 0.4950
Epoch 2/10
20000/20000 [==============================] - 1s 34us/step - loss: 0.6925 - acc: 0.5262 - val_loss: 0.6930 - val_acc: 0.5206
Epoch 3/10
20000/20000 [==============================] - 1s 33us/step - loss: 0.6866 - acc: 0.5475 - val_loss: 0.6878 - val_acc: 0.5436
Epoch 4/10
20000/20000 [==============================] - 1s 33us/step - loss: 0.6748 - acc: 0.5829 - val_loss: 0.6739 - val_acc: 0.5782
Epoch 5/10
20000/20000 [==============================] - 1s 31us/step - loss: 0.6462 - acc: 0.6243 - val_loss: 0.6393 - val_acc: 0.6334
Epoch 6/10
20000/20000 [==============================] - 1s 32us/step - loss: 0.5993 - acc: 0.6727 - val_loss: 0.6052 - val_acc: 0.6646
Epoch 7/10
20000/20000 [==============================] - 1s 31us/step - loss: 0.5596 - acc: 0.7081 - val_loss: 0.5940 - val_acc: 0.6816
Epoch 8/10
20000/20000 [==============================] - 1s 30us/step - loss: 0.5357 - acc: 0.7265 - val_loss: 0.5652 - val_acc: 0.7074
Epoch 9/10
20000/20000 [==============================] - 1s 32us/step - loss: 0.5202 - acc: 0.7380 - val_loss: 0.5561 - val_acc: 0.7122
Epoch 10/10
20000/20000 [==============================] - 1s 32us/step - loss: 0.5099 - acc: 0.7437 - val_loss: 0.5533 - val_acc: 0.7142
Wall time: 7.95 s
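
A quick sketch of the learning curves, using the history object returned by model.fit above (with %matplotlib inline already active):

import matplotlib.pyplot as plt

epochs = range(1, len(history.history['acc']) + 1)
plt.plot(epochs, history.history['acc'], label='training accuracy')
plt.plot(epochs, history.history['val_acc'], label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()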

In [21]:
train_loss, train_accuracy = model.evaluate(input_train, y_train, batch_size=batch_size)
train_accuracy


25000/25000 [==============================] - 0s 10us/step
Out[21]:
0.745319999961853

In [22]:
test_loss, test_accuracy = model.evaluate(input_test, y_test, batch_size=batch_size)
test_accuracy


25000/25000 [==============================] - 0s 9us/step
Out[22]:
0.722879999961853

In [23]:
# prediction
model.predict(input_test[0:5])


Out[23]:
array([[0.7059556 ],
       [0.8062973 ],
       [0.45886868],
       [0.64379567],
       [0.96133244]], dtype=float32)

In [24]:
# ground truth
y_test[0:5]


Out[24]:
array([0, 1, 1, 0, 1], dtype=int64)
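
The sigmoid outputs are probabilities for the positive class; thresholding at 0.5 turns them into labels that can be compared with the ground truth:

predicted_labels = (model.predict(input_test[0:5]) > 0.5).astype('int32').ravel()
predicted_labels                           # array([1, 1, 0, 1, 1]) for the probabilities above
(predicted_labels == y_test[0:5]).mean()   # only 2 of these 5 samples happen to be correct (0.4)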

What does the output of the trained embedding look like?


In [25]:
embedding_layer = model.get_layer('embedding')

In [26]:
model_stub = tf.keras.Model(inputs=model.input, outputs=embedding_layer.output)

In [27]:
embedding_prediction = model_stub.predict(input_test[0:5])

In [28]:
# 5 sample reviews, 20 words per review (maxlen), 3 dimensions per word (embedding_dim)
embedding_prediction.shape


Out[28]:
(5, 20, 3)

In [29]:
# 3 embedding dimensions of the first word of the first sample review
embedding_prediction[0][0]


Out[29]:
array([0.46506825, 0.18255262, 0.5230578 ], dtype=float32)
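
Equivalently, the trained vectors can be read straight from the layer's weight matrix; the stub model's prediction is just a row lookup:

trained_weights = embedding_layer.get_weights()[0]
trained_weights.shape                 # (1000, 3): max_features x embedding_dim
trained_weights[input_test[0][0]]     # same 3 values as embedding_prediction[0][0] above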

Comparing the trained to the untrained model


In [30]:
input_train[0]


Out[30]:
array([ 65,  16,  38,   2,  88,  12,  16, 283,   5,  16,   2, 113, 103,
        32,  15,  16,   2,  19, 178,  32])

In [31]:
model_stub.predict(input_train[:1])


Out[31]:
array([[[ 0.43953612,  0.7414288 ,  0.37631312],
        [ 0.54084265,  0.2680084 ,  0.7580117 ],
        [ 0.4474395 ,  0.50141734,  0.5125725 ],
        [ 0.76818293,  0.63989884,  0.7294859 ],
        [-0.01756167,  0.33895952,  0.47610214],
        [ 0.6234705 ,  0.6071456 ,  0.41210645],
        [ 0.54084265,  0.2680084 ,  0.7580117 ],
        [ 1.1229801 ,  0.3734608 ,  0.3536867 ],
        [ 0.7051124 ,  0.707146  ,  0.3594228 ],
        [ 0.54084265,  0.2680084 ,  0.7580117 ],
        [ 0.76818293,  0.63989884,  0.7294859 ],
        [ 0.94658554,  0.69197726,  0.5638408 ],
        [ 0.7194364 ,  0.68996537,  0.4780143 ],
        [ 0.72959805,  0.66114867,  0.5501039 ],
        [ 0.17081198,  0.1380256 ,  0.1959505 ],
        [ 0.54084265,  0.2680084 ,  0.7580117 ],
        [ 0.76818293,  0.63989884,  0.7294859 ],
        [ 0.795604  ,  0.80453324,  0.628472  ],
        [ 0.3977472 ,  0.95927393,  0.50550807],
        [ 0.72959805,  0.66114867,  0.5501039 ]]], dtype=float32)

In [32]:
random_model.predict(input_train[:1])


Out[32]:
array([[[0.07384551, 0.29555106, 0.3009336 ],
        [0.45920503, 0.16309488, 0.46034312],
        [0.39517987, 0.81220245, 0.6899681 ],
        [0.13280201, 0.5241946 , 0.01194322],
        [0.8166592 , 0.38502324, 0.5417099 ],
        [0.7756258 , 0.04511654, 0.13022017],
        [0.45920503, 0.16309488, 0.46034312],
        [0.64187396, 0.67822623, 0.95672846],
        [0.10987711, 0.07286727, 0.07477903],
        [0.45920503, 0.16309488, 0.46034312],
        [0.13280201, 0.5241946 , 0.01194322],
        [0.32317853, 0.99291134, 0.70348394],
        [0.71620905, 0.57316947, 0.35613513],
        [0.14302993, 0.59799194, 0.8499563 ],
        [0.00432396, 0.6946956 , 0.00477517],
        [0.45920503, 0.16309488, 0.46034312],
        [0.13280201, 0.5241946 , 0.01194322],
        [0.5677427 , 0.4795617 , 0.4080261 ],
        [0.16597784, 0.17524338, 0.966107  ],
        [0.14302993, 0.59799194, 0.8499563 ]]], dtype=float32)
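
One way to explore what training has done to the embedding space: look up the nearest neighbours of a word index by cosine similarity in the trained weight matrix. A sketch (word index 530 is an arbitrary pick from the review above):

import numpy as np

trained_weights = model.get_layer('embedding').get_weights()[0]
normalized = trained_weights / np.linalg.norm(trained_weights, axis=1, keepdims=True)

word_id = 530
similarities = normalized @ normalized[word_id]      # cosine similarity to every other index
similarities.argsort()[-6:][::-1]                    # the word itself plus its 5 closest indices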

In [ ]: