In [0]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.2-understanding-recurrent-neural-networks.ipynb
In [0]:
import warnings
warnings.filterwarnings('ignore')
In [32]:
%matplotlib inline
%pylab inline
In [33]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)
In [0]:
from tensorflow import keras
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
max_features = 10000 # number of words to consider as features
maxlen = 50 # cut texts after this number of words (among top max_features most common words)
# each review is encoded as a sequence of word indexes
# indexed by overall frequency in the dataset
# output is 0 (negative) or 1 (positive)
imdb = keras.datasets.imdb.load_data(num_words=max_features)
(raw_input_train, y_train), (raw_input_test, y_test) = imdb
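In [0]:
# Added side note (not in the original notebook): the integer sequences can be decoded
# back to text via the word index. load_data shifts all word indices by 3, because
# 0, 1 and 2 are reserved for padding, start-of-sequence and unknown words.
word_index = keras.datasets.imdb.get_word_index()
reverse_word_index = {index + 3: word for word, index in word_index.items()}
print(' '.join(reverse_word_index.get(i, '?') for i in raw_input_train[0]))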
In [0]:
# tf.keras.datasets.imdb.load_data?
In [36]:
y_train.min()
Out[36]:
In [37]:
y_train.max()
Out[37]:
In [38]:
# 25000 texts
len(raw_input_train)
Out[38]:
In [39]:
# first text has 218 words
len(raw_input_train[0])
Out[39]:
In [40]:
raw_input_train[0]
Out[40]:
In [0]:
# tf.keras.preprocessing.sequence.pad_sequences?
In [0]:
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
input_train = keras.preprocessing.sequence.pad_sequences(raw_input_train, maxlen=maxlen)
input_test = keras.preprocessing.sequence.pad_sequences(raw_input_test, maxlen=maxlen)
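In [0]:
# Small added sketch of what pad_sequences does with the default settings:
# shorter sequences are left-padded with 0, longer ones are truncated from the front.
keras.preprocessing.sequence.pad_sequences([[1, 2, 3], [1, 2, 3, 4, 5, 6, 7]], maxlen=5)
# expected: [[0, 0, 1, 2, 3], [3, 4, 5, 6, 7]]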
In [43]:
input_train.shape, input_test.shape, y_train.shape, y_test.shape
Out[43]:
In [44]:
# left padded with zeros
# by convention, index 0 does not stand for a specific word; here it is used as padding
input_train[0]
Out[44]:
In [0]:
# tf.keras.layers.Embedding?
In [46]:
from tensorflow.keras.layers import Embedding, Flatten, GlobalAveragePooling1D, Dense, Dropout
embedding_dim = 2
model = keras.Sequential()
# Parameters: max_features * embedding_dim
model.add(Embedding(name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))
# Output: maxlen * embedding_dim (50 * 2 = 100)
model.add(Flatten(name='flatten'))
# ALTERNATIVE
# average of all embeddings (does not preserve sequence)
# model.add(GlobalAveragePooling1D(name='average_pooling'))
# binary classifier
# model.add(Dense(name='fc', units=32, activation='relu'))
# model.add(Dropout(0.4))
model.add(Dense(name='classifier', units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
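In [0]:
# Added sanity check (not in the original notebook): the parameter counts reported by
# model.summary() should match these back-of-the-envelope numbers.
embedding_params = max_features * embedding_dim   # 10000 * 2 = 20000
flatten_output = maxlen * embedding_dim           # 50 * 2 = 100
classifier_params = flatten_output * 1 + 1        # weights + bias = 101
embedding_params, flatten_output, classifier_params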
In [47]:
batch_size = 96
%time history = model.fit(input_train, y_train, epochs=40, batch_size=batch_size, validation_data=(input_test, y_test))
In [48]:
import pandas as pd

def plot_history(history, samples=10, init_phase_samples=None):
    epochs = history.params['epochs']
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    every_sample = int(epochs / samples)
    acc = pd.DataFrame(acc).iloc[::every_sample, :]
    val_acc = pd.DataFrame(val_acc).iloc[::every_sample, :]
    fig, ax = plt.subplots(figsize=(20, 5))
    ax.plot(acc, 'bo', label='Training acc')
    ax.plot(val_acc, 'b', label='Validation acc')
    ax.set_title('Training and validation accuracy')
    ax.legend()

plot_history(history)
In [49]:
train_loss, train_accuracy = model.evaluate(input_train, y_train, batch_size=batch_size)
train_accuracy
Out[49]:
In [50]:
test_loss, test_accuracy = model.evaluate(input_test, y_test, batch_size=batch_size)
test_accuracy
Out[50]:
In [51]:
# prediction (sigmoid output, i.e. probability of a positive review)
model.predict(input_test[0:5])
Out[51]:
In [52]:
# ground truth
y_test[0:5]
Out[52]:
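In [0]:
# Added sketch: model.predict returns probabilities for the positive class;
# thresholding at 0.5 turns them into class labels comparable to y_test.
predicted_labels = (model.predict(input_test[0:5]) > 0.5).astype(int).reshape(-1)
predicted_labels, y_test[0:5]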
In [0]:
embedding_layer = model.get_layer('embedding')
In [0]:
model_stub = keras.Model(inputs=model.input, outputs=embedding_layer.output)
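In [0]:
# Added alternative (not in the original notebook): instead of running inputs through the
# stub model, the learned embedding matrix can be read directly from the layer weights.
embedding_matrix = embedding_layer.get_weights()[0]
embedding_matrix.shape   # expected: (max_features, embedding_dim) = (10000, 2)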
In [0]:
word_to_id = keras.datasets.imdb.get_word_index()
def encode_text(text):
    input_words = text.split()
    input_tokens = np.array([word_to_id[word] for word in input_words])
    padded_input_tokens = keras.preprocessing.sequence.pad_sequences([input_tokens], maxlen=maxlen)
    return padded_input_tokens

def plot_text_embedding(model, text):
    input_words = text.split()
    input_sequence = encode_text(text)
    embeddings = model.predict(input_sequence)[0][-len(input_words):, :]
    x_coords = embeddings[:, 0]  # first latent dim
    y_coords = embeddings[:, 1]  # second latent dim
    plt.figure(figsize=(20, 20))
    plt.scatter(x_coords, y_coords)
    for i, txt in enumerate(input_words):
        plt.annotate(txt, (x_coords[i], y_coords[i]))
    plt.show()
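In [0]:
# Hedged variant (added, not in the original notebook): encode_text above looks words up
# in the raw word index, but load_data encodes word index i as i + 3 and maps
# out-of-vocabulary words to 2. An encoder that mirrors that scheme could look like this.
def encode_text_shifted(text, index_from=3, oov_char=2):
    input_words = text.split()
    input_tokens = np.array([
        word_to_id[word] + index_from
        if word in word_to_id and word_to_id[word] + index_from < max_features
        else oov_char
        for word in input_words
    ])
    return keras.preprocessing.sequence.pad_sequences([input_tokens], maxlen=maxlen)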
In [56]:
text = """good best brilliant amazing great lovely awesome
bad worst awful
art
garbage gross horrible
sad funny
beautiful ugly
movie actor male female love"""
plot_text_embedding(model_stub, text)
In [57]:
from tensorflow.keras.layers import Embedding, Flatten, GlobalAveragePooling1D, Dense, Dropout
embedding_dim = 1
model = keras.Sequential()
# Parameters: max_features * embedding_dim
model.add(Embedding(name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))
# Output: maxlen * embedding_dim (50 * 1 = 50)
model.add(Flatten(name='flatten'))
# ALTERNATIVE
# average of all embeddings (does not preserve sequence)
# model.add(GlobalAveragePooling1D(name='average_pooling'))
# binary classifier
model.add(Dense(name='fc', units=32, activation='relu'))
# model.add(Dropout(0.4))
model.add(Dense(name='classifier', units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.summary()
batch_size = 96
%time history = model.fit(input_train, y_train, epochs=40, batch_size=batch_size, validation_data=(input_test, y_test))
In [58]:
embedding_layer = model.get_layer('embedding')
model_stub = keras.Model(inputs=model.input, outputs=embedding_layer.output)

def plot_1d_text_embedding(model, text):
    input_words = text.split()
    input_sequence = encode_text(text)
    embeddings = model.predict(input_sequence)[0][-len(input_words):, :]
    plt.figure(figsize=(20, 5))
    plt.scatter(embeddings, np.zeros(len(embeddings)))
    for i, txt in enumerate(input_words):
        plt.annotate(txt, (embeddings[i], 0.004), rotation=80)
    plt.show()
text = """good best brilliant amazing great lovely awesome
bad worst awful
garbage gross horrible
sad funny
beautiful ugly"""
plot_1d_text_embedding(model_stub, text)
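In [0]:
# Added sketch: with a 1d embedding every word collapses to a single learned score, so the
# words with the smallest and largest scores can be listed straight from the embedding
# matrix (remembering the index shift of 3 used by load_data).
scores = embedding_layer.get_weights()[0][:, 0]
id_to_word = {index + 3: word for word, index in word_to_id.items()}
ranking = scores.argsort()
print('lowest scores: ', [id_to_word.get(i, '?') for i in ranking[:10]])
print('highest scores:', [id_to_word.get(i, '?') for i in ranking[-10:]])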
In [0]: