Chapter 6.1.2 - Using word embeddings

Embedding layer with Keras


In [1]:
import keras


C:\ProgramData\Anaconda3\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

In [2]:
keras.__version__


Out[2]:
'2.1.3'

In [3]:
from keras.layers import Embedding

# The maximum number of tokens is the maximum word index + 1
max_number_of_tokens = 1000
embedding_dimensionality = 64
embedding_layer = Embedding(max_number_of_tokens, embedding_dimensionality)

The layer transforms a 2D integer input tensor of shape (number_of_samples, sequence_length) into a 3D floating-point tensor of shape (number_of_samples, sequence_length, embedding_dimensionality). Such a tensor can then be processed by an RNN layer or a 1D convolutional layer.
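As a quick check (a sketch, not part of the original notebook), we can push a dummy batch of integer indices through the embedding_layer defined above and confirm the output shape; the batch size of 32 and sequence length of 10 are arbitrary choices:

import numpy as np
from keras.models import Sequential

# Dummy batch of integer word indices: 32 samples, each a sequence of 10 indices below max_number_of_tokens
dummy_input = np.random.randint(0, max_number_of_tokens, size = (32, 10))

# Wrapping the layer defined above in a Sequential model so predict can be called on a NumPy array
shape_check_model = Sequential([embedding_layer])
shape_check_model.predict(dummy_input).shape   # (32, 10, 64)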

IMDB example


In [4]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences

In [5]:
# Number of words considered as features
max_features = 10000

In [6]:
# Keeping only 20 words per review (pad_sequences keeps the last 20 by default)
sequence_max_length = 20

In [7]:
# Loading data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features)

In [8]:
x_train.shape


Out[8]:
(25000,)

In [9]:
x_train_sequence = pad_sequences(x_train, maxlen = sequence_max_length)

In [10]:
x_train_sequence.shape


Out[10]:
(25000, 20)

In [11]:
x_train[0:2]


Out[11]:
array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 8255, 2, 349, 2637, 148, 605, 2, 8003, 15, 123, 125, 68, 2, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 2, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 2, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95])],
      dtype=object)

In [12]:
# pad_sequences truncates from the front, so the padded sequence keeps the last 20 tokens of the review
x_train[0][-20]


Out[12]:
65

In [13]:
x_train_sequence[0, :]


Out[13]:
array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
        113,  103,   32,   15,   16, 5345,   19,  178,   32])

In [14]:
x_train_sequence[0]


Out[14]:
array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
        113,  103,   32,   15,   16, 5345,   19,  178,   32])

In [15]:
x_train_sequence[1]


Out[15]:
array([  23,    4, 1690,   15,   16,    4, 1355,    5,   28,    6,   52,
        154,  462,   33,   89,   78,  285,   16,  145,   95])

Model


In [16]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

In [17]:
model = Sequential()
model.add(Embedding(input_dim = max_features, output_dim = 8, input_length = sequence_max_length))
model.add(Flatten())
model.add(Dense(units = 1, activation = 'sigmoid'))

In [18]:
# Compiling the model
model.compile(optimizer = 'rmsprop', 
              loss = 'binary_crossentropy', 
              metrics = ['acc'])

In [19]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
=================================================================
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________

In [20]:
# Training
history = model.fit(x = x_train_sequence, 
                    y = y_train, 
                    epochs = 10, 
                    batch_size = 32, 
                    validation_split = 0.2)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 5s 266us/step - loss: 0.6759 - acc: 0.6050 - val_loss: 0.6398 - val_acc: 0.6814
Epoch 2/10
20000/20000 [==============================] - 4s 198us/step - loss: 0.5657 - acc: 0.7427 - val_loss: 0.5467 - val_acc: 0.7206
Epoch 3/10
20000/20000 [==============================] - 4s 197us/step - loss: 0.4752 - acc: 0.7808 - val_loss: 0.5113 - val_acc: 0.7384
Epoch 4/10
20000/20000 [==============================] - 4s 195us/step - loss: 0.4263 - acc: 0.8077 - val_loss: 0.5008 - val_acc: 0.7452
Epoch 5/10
20000/20000 [==============================] - 4s 206us/step - loss: 0.3930 - acc: 0.8258 - val_loss: 0.4981 - val_acc: 0.7538
Epoch 6/10
20000/20000 [==============================] - 4s 198us/step - loss: 0.3668 - acc: 0.8395 - val_loss: 0.5014 - val_acc: 0.7530
Epoch 7/10
20000/20000 [==============================] - 4s 197us/step - loss: 0.3435 - acc: 0.8533 - val_loss: 0.5052 - val_acc: 0.7520
Epoch 8/10
20000/20000 [==============================] - 4s 215us/step - loss: 0.3223 - acc: 0.8657 - val_loss: 0.5132 - val_acc: 0.7486
Epoch 9/10
20000/20000 [==============================] - 5s 253us/step - loss: 0.3022 - acc: 0.8766 - val_loss: 0.5213 - val_acc: 0.7490
Epoch 10/10
20000/20000 [==============================] - 5s 243us/step - loss: 0.2839 - acc: 0.8860 - val_loss: 0.5303 - val_acc: 0.7466

Using pre-trained word embeddings

The raw IMDB dataset can be downloaded from: http://mng.bz/0tIo


In [21]:
import os

In [22]:
imdb_dir = './data/Chapter 6.1.2 - Using word embeddings/aclImdb/'

In [23]:
train_dir = os.path.join(imdb_dir, 'train')

In [24]:
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        # Only considering .txt files
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding="utf8")
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

In [25]:
len(labels)


Out[25]:
25000

In [26]:
len(texts)


Out[26]:
25000

In [27]:
texts[0]


Out[27]:
"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [28]:
labels[0]


Out[28]:
0

Tokenizing the data


In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [30]:
# Keeping only 100 words per review (pad_sequences keeps the last 100 by default)
maxlen = 100

In [31]:
# Number of training samples
training_samples = 200

In [32]:
# Number of validation samples
validation_samples = 10000

In [33]:
# Considering only the top 10,000 words in the dataset
max_words = 10000

In [34]:
# Initializing Tokenizer
tokenizer = Tokenizer(num_words = max_words)

In [35]:
# Fitting the Tokenizer on the text
tokenizer.fit_on_texts(texts)

In [36]:
# Converting the texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(texts)

In [37]:
sequences[0:2]


Out[37]:
[[62,
  4,
  3,
  129,
  34,
  44,
  7576,
  1414,
  15,
  3,
  4252,
  514,
  43,
  16,
  3,
  633,
  133,
  12,
  6,
  3,
  1301,
  459,
  4,
  1751,
  209,
  3,
  7693,
  308,
  6,
  676,
  80,
  32,
  2137,
  1110,
  3008,
  31,
  1,
  929,
  4,
  42,
  5120,
  469,
  9,
  2665,
  1751,
  1,
  223,
  55,
  16,
  54,
  828,
  1318,
  847,
  228,
  9,
  40,
  96,
  122,
  1484,
  57,
  145,
  36,
  1,
  996,
  141,
  27,
  676,
  122,
  1,
  411,
  59,
  94,
  2278,
  303,
  772,
  5,
  3,
  837,
  20,
  3,
  1755,
  646,
  42,
  125,
  71,
  22,
  235,
  101,
  16,
  46,
  49,
  624,
  31,
  702,
  84,
  702,
  378,
  3493,
  2,
  8422,
  67,
  27,
  107,
  3348],
 [4517,
  514,
  14,
  3,
  3417,
  159,
  8595,
  1702,
  6,
  4892,
  53,
  16,
  4518,
  5674,
  138,
  5,
  1023,
  4988,
  3050,
  4519,
  588,
  1339,
  34,
  6,
  1544,
  95,
  3,
  758,
  4,
  5,
  24,
  3513,
  8,
  4,
  9,
  109,
  3051,
  5,
  1,
  1067,
  14,
  3,
  4520,
  79,
  20,
  2086,
  6,
  4519,
  574,
  2798,
  7262,
  38,
  489,
  1,
  8595,
  301,
  122,
  14,
  4253,
  18,
  1693,
  942,
  1,
  1702,
  6,
  6538,
  31,
  1,
  998,
  1807,
  667,
  24,
  104,
  2602,
  485,
  34,
  3285,
  1,
  6539,
  1048,
  43,
  16,
  2753,
  2547,
  33,
  1340,
  5,
  2103,
  1,
  4518,
  1537,
  20,
  3,
  1702,
  3249,
  20,
  32,
  4348,
  1105,
  18,
  134,
  228,
  24,
  4760,
  217,
  1927,
  32,
  3230,
  8,
  1,
  4676,
  1975,
  1135,
  4,
  1,
  1702,
  5675,
  9,
  6627,
  80,
  1,
  2016,
  118,
  9,
  8169,
  5,
  1,
  1321,
  205,
  4010,
  8,
  1,
  652,
  4,
  1,
  5924,
  16,
  942,
  8,
  343,
  6259,
  1090,
  8,
  257,
  117,
  6260,
  2058,
  122,
  261,
  1,
  709,
  15,
  1,
  14,
  33,
  335,
  16,
  55,
  699,
  617,
  43,
  7,
  7,
  79,
  570,
  463,
  1,
  1072,
  272,
  4517,
  6041,
  11,
  330,
  751,
  5,
  1,
  6792,
  566,
  1685,
  705,
  4517,
  5456,
  13,
  523,
  31,
  1513,
  9878,
  134,
  277,
  171,
  37,
  42,
  8288,
  10,
  188,
  132,
  4517,
  6,
  98,
  429,
  4,
  1547,
  353,
  9,
  6,
  438,
  258,
  21,
  2696,
  15,
  1,
  205,
  1003,
  43,
  4,
  1,
  286,
  4517,
  105,
  10,
  25,
  107,
  35,
  227,
  10,
  162,
  420,
  11,
  28,
  1,
  115,
  40,
  9,
  44,
  58,
  1636,
  111,
  4,
  1,
  286,
  16,
  3,
  324,
  1693,
  942,
  6538,
  92,
  1,
  6627,
  158,
  26,
  64,
  1,
  3230,
  6261,
  4,
  1,
  276,
  1,
  1184,
  68,
  266,
  5,
  1656,
  1,
  201,
  4517,
  16,
  157,
  1059,
  1685,
  506,
  4,
  1,
  807,
  1,
  1150,
  4483,
  6,
  118,
  9,
  2665,
  363,
  1,
  127,
  16,
  3,
  5283,
  6540,
  4416,
  145,
  2603,
  1001,
  342,
  51,
  1,
  942,
  1126,
  43,
  39,
  14,
  1,
  39,
  45,
  98,
  4,
  1,
  3584,
  23,
  3051,
  42,
  3,
  539,
  323,
  12,
  97,
  25,
  90,
  15,
  3,
  84,
  114,
  1685,
  506,
  18,
  75,
  6887,
  1727,
  750,
  411,
  267,
  1322,
  3,
  144,
  580,
  4,
  2373,
  39,
  833,
  39,
  1071,
  814,
  11,
  6,
  3,
  1045,
  1429,
  134,
  1,
  244,
  111,
  938,
  28,
  2161,
  15,
  1028,
  231,
  21,
  12,
  73,
  567,
  100,
  1,
  1702,
  8169,
  222,
  21,
  14,
  73,
  8926,
  14,
  10,
  194,
  47,
  141,
  25,
  74,
  57,
  51,
  1,
  3022,
  410,
  571,
  180,
  89,
  1257,
  53,
  12,
  73,
  16,
  3,
  168,
  659,
  4,
  663,
  5121,
  1544,
  41,
  18,
  222,
  40,
  139,
  1883,
  130,
  739,
  4201,
  14,
  1,
  3494,
  911,
  6,
  142,
  18,
  61,
  211,
  3,
  375,
  4,
  136,
  1196,
  57,
  555,
  229,
  5,
  40,
  165,
  3765,
  8,
  1,
  972,
  7,
  7,
  1,
  341,
  371,
  2246,
  307,
  4,
  4517,
  518,
  231,
  134,
  1,
  175,
  245,
  2052,
  759,
  32,
  1724,
  531,
  4,
  926,
  583,
  3,
  159,
  633,
  894,
  717,
  108,
  50,
  136,
  16,
  739,
  4201,
  14,
  2178,
  5,
  2104,
  43,
  1727,
  1203,
  2225,
  136,
  1,
  3789,
  39,
  157,
  375,
  4,
  348,
  2345,
  583,
  1,
  134,
  10,
  59,
  37,
  5,
  64,
  11,
  1724,
  926,
  10,
  241,
  21,
  249,
  10,
  97,
  866,
  140,
  3,
  747,
  286,
  531,
  602,
  4,
  4517,
  14,
  870,
  1,
  19,
  44,
  1957,
  906,
  16,
  524,
  8058,
  7476,
  1588,
  2823,
  10,
  77,
  132,
  54,
  50,
  82,
  71,
  1,
  2876,
  1702,
  2179,
  299,
  710,
  84,
  342,
  364,
  16,
  1,
  82,
  104,
  4517,
  2279,
  11,
  301,
  3108,
  4,
  270,
  8,
  1,
  2365,
  4,
  899,
  258,
  10,
  67,
  101,
  4,
  773,
  4,
  430,
  105,
  71,
  11,
  35,
  10,
  195,
  3,
  114,
  2485,
  1,
  202,
  136,
  23,
  3,
  114,
  750,
  469,
  1,
  1060,
  6,
  547,
  21,
  73,
  2315,
  39,
  1071,
  6,
  4844,
  60,
  6,
  3,
  899,
  14,
  10,
  11,
  97,
  25,
  74,
  3,
  181,
  49,
  19,
  45,
  90,
  2877,
  7,
  7,
  1,
  362,
  1230,
  23,
  2652,
  45,
  161,
  2087,
  1,
  113,
  215,
  84,
  104,
  55,
  731,
  2280,
  714,
  4311,
  44,
  298,
  234,
  9,
  13,
  3,
  1319,
  5,
  320,
  8,
  11,
  28,
  55,
  731,
  2280,
  588,
  1339,
  269,
  151,
  79,
  28,
  55,
  731,
  2280,
  844,
  2105,
  269,
  1816,
  134,
  2682,
  1365,
  844,
  6,
  345,
  114,
  5,
  78,
  47,
  23,
  955,
  4,
  82,
  1076,
  1586,
  5,
  165,
  43,
  15,
  96,
  7,
  7,
  4517,
  6,
  1,
  88,
  1685,
  4,
  1,
  286,
  4517,
  105,
  35,
  227,
  10,
  420,
  1,
  1005,
  493,
  9,
  57,
  45,
  33,
  68,
  3,
  224,
  706,
  1,
  362,
  1898,
  455,
  149,
  335,
  148,
  3,
  19,
  41,
  3,
  1702,
  40,
  1609,
  27,
  11,
  354,
  39,
  1474,
  31,
  1,
  4517,
  5457]]

In [38]:
# Word index
word_index = tokenizer.word_index

In [39]:
type(word_index)


Out[39]:
dict

In [40]:
first10pairs = {k: word_index[k] for k in list(word_index)[:10]}

In [41]:
first10pairs


Out[41]:
{'a': 3,
 'and': 2,
 'br': 7,
 'i': 10,
 'in': 8,
 'is': 6,
 'it': 9,
 'of': 4,
 'the': 1,
 'to': 5}

In [42]:
# Padding the sequences to maxlen
data = pad_sequences(sequences, maxlen = maxlen)

In [43]:
data.shape


Out[43]:
(25000, 100)

In [44]:
labels = np.asarray(labels)

In [45]:
labels.shape


Out[45]:
(25000,)

In [46]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

In [47]:
# Splitting the data into train and validation datasets
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

In [48]:
x_train.shape


Out[48]:
(200, 100)

In [49]:
x_val.shape


Out[49]:
(10000, 100)

GloVe Embedding


In [50]:
# Importing tqdm to show a progress bar
from tqdm import tqdm

In [51]:
glove_dir = './data/Chapter 6.1.2 - Using word embeddings/glove.6B/'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), 
         encoding = 'utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], 
                       dtype = 'float32')
    embeddings_index[word] = coefs
f.close()


400000it [00:20, 19883.21it/s]

In [52]:
len(embeddings_index)


Out[52]:
400000

In [53]:
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        # Words not found in the embedding index will be represented as zeros
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
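As a quick sanity check (a sketch, not in the original notebook), row 0 should stay all zeros because index 0 is reserved for padding, while the row for a frequent word such as 'the' (index 1 in word_index) should equal its GloVe vector:

# Index 0 never receives a word, so its row remains zeros
assert not embedding_matrix[0].any()
# A word present in GloVe keeps its pretrained vector
assert np.allclose(embedding_matrix[word_index['the']], embeddings_index['the'])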

In [54]:
embedding_matrix


Out[54]:
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.13787   , -0.17727   , -0.62436002, ...,  0.35506001,
         0.33443999,  0.14436001],
       [-0.88968998,  0.55208999, -0.50498998, ..., -0.54351002,
        -0.21874   ,  0.51186001],
       [-0.17381001, -0.037609  ,  0.068837  , ..., -0.097167  ,
         1.08840001,  0.22676   ]])

Model


In [55]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [56]:
model = Sequential()
model.add(Embedding(input_dim = max_words, 
                    output_dim = embedding_dim, 
                    input_length = maxlen))
model.add(Flatten())
model.add(Dense(units = 32, 
                activation = 'relu'))
model.add(Dense(units = 1, 
                activation = 'sigmoid'))
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_3 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
=================================================================
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________

In [57]:
# Loading the pretrained GloVe vectors into the Embedding layer
model.layers[0].set_weights([embedding_matrix])
# Freezing the layer so the GloVe vectors are not updated (and potentially destroyed) during training
model.layers[0].trainable = False
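With the layer frozen, its 1,000,000 parameters count as non-trainable. A quick way to confirm this (a sketch, not part of the original notebook) is to print the summary again:

# Should now report Trainable params: 320,065 and Non-trainable params: 1,000,000
model.summary()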

In [58]:
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['acc'])

In [59]:
history = model.fit(x = x_train, 
                    y = y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_data = (x_val, y_val))


Train on 200 samples, validate on 10000 samples
Epoch 1/10
200/200 [==============================] - 1s 7ms/step - loss: 1.6337 - acc: 0.5250 - val_loss: 0.7130 - val_acc: 0.5100
Epoch 2/10
200/200 [==============================] - 1s 4ms/step - loss: 0.7565 - acc: 0.5800 - val_loss: 0.6910 - val_acc: 0.5418
Epoch 3/10
200/200 [==============================] - 1s 4ms/step - loss: 0.5956 - acc: 0.6950 - val_loss: 1.1205 - val_acc: 0.4936
Epoch 4/10
200/200 [==============================] - 1s 4ms/step - loss: 0.5335 - acc: 0.7350 - val_loss: 0.7134 - val_acc: 0.5362
Epoch 5/10
200/200 [==============================] - 1s 4ms/step - loss: 0.4713 - acc: 0.8100 - val_loss: 0.7177 - val_acc: 0.5589
Epoch 6/10
200/200 [==============================] - 1s 4ms/step - loss: 0.1448 - acc: 0.9800 - val_loss: 1.3373 - val_acc: 0.4952
Epoch 7/10
200/200 [==============================] - 1s 4ms/step - loss: 0.2545 - acc: 0.8800 - val_loss: 1.3110 - val_acc: 0.4960
Epoch 8/10
200/200 [==============================] - 1s 4ms/step - loss: 0.1102 - acc: 0.9800 - val_loss: 0.8168 - val_acc: 0.5558
Epoch 9/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0760 - acc: 0.9800 - val_loss: 1.5204 - val_acc: 0.5115
Epoch 10/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0680 - acc: 0.9850 - val_loss: 0.7458 - val_acc: 0.5759

In [60]:
model.save_weights('./saved_checkpoints/Chapter 6.1.2 - Using word embeddings/pre_trained_glove_model.h5')

Performance


In [61]:
import matplotlib.pyplot as plt

In [62]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()


With only 200 training samples, the model overfits almost immediately: training accuracy climbs toward 98% while validation accuracy hovers in the 50s.

Model without pre-trained embeddings


In [63]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
=================================================================
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________
Train on 200 samples, validate on 10000 samples
Epoch 1/10
200/200 [==============================] - 2s 8ms/step - loss: 0.6951 - acc: 0.4350 - val_loss: 0.6950 - val_acc: 0.5167
Epoch 2/10
200/200 [==============================] - 1s 4ms/step - loss: 0.5028 - acc: 0.9800 - val_loss: 0.7054 - val_acc: 0.5069
Epoch 3/10
200/200 [==============================] - 1s 4ms/step - loss: 0.2898 - acc: 0.9850 - val_loss: 0.7012 - val_acc: 0.5187
Epoch 4/10
200/200 [==============================] - 1s 4ms/step - loss: 0.1183 - acc: 1.0000 - val_loss: 0.7166 - val_acc: 0.5156
Epoch 5/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0524 - acc: 1.0000 - val_loss: 0.7150 - val_acc: 0.5288
Epoch 6/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0261 - acc: 1.0000 - val_loss: 0.7249 - val_acc: 0.5260
Epoch 7/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0141 - acc: 1.0000 - val_loss: 0.7211 - val_acc: 0.5389
Epoch 8/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0082 - acc: 1.0000 - val_loss: 0.7390 - val_acc: 0.5267
Epoch 9/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0049 - acc: 1.0000 - val_loss: 0.7283 - val_acc: 0.5393
Epoch 10/10
200/200 [==============================] - 1s 4ms/step - loss: 0.0030 - acc: 1.0000 - val_loss: 0.7476 - val_acc: 0.5313

In [64]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
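Without the pretrained embeddings, the model memorizes the 200 training samples even faster (training accuracy reaches 100% by epoch 4), and validation accuracy stays around 52-54%, a little below the roughly 57% peak reached with the frozen GloVe embeddings. With this little training data, pretrained embeddings give a small but real edge.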