In [3]:
import os
import pandas as pd
import numpy as np
from keras.models import Model
from ch10 import construct_seq2seq_model
from nlpia.loaders import get_data, DATA_PATH
In [11]:
batch_size = 64    # Batch size for training.
epochs = 100       # Number of epochs to train for.
num_samples = 10000  # Number of samples to train on.
data_path = os.path.join(DATA_PATH, 'movie_dialog.txt')  # preprocessed CMU movie dialogue samples
In [12]:
try:
    import cPickle as pickle
except ImportError:
    import pickle
from io import open

with open("../data/characters_stats.pkl", "rb") as filehandler:
    input_characters, target_characters, input_token_index, \
        target_token_index = pickle.load(filehandler)
with open("../data/encoder_decoder_stats.pkl", "rb") as filehandler:
    num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, \
        max_decoder_seq_length = pickle.load(filehandler)
In [13]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
lines = open(data_path).read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as the "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)
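The token indexes were loaded from the pickle above; if that file isn't available, they can be rebuilt from the sorted character lists, a minimal sketch assuming the pickle used the same character-to-index convention:
In [ ]:
input_token_index = dict(
    (char, i) for i, char in enumerate(input_characters))
target_token_index = dict(
    (char, i) for i, char in enumerate(target_characters))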
In [14]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
In [15]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
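As a quick sanity check (not part of the original listing), you can invert the one-hot encoding of the first sample and confirm it round-trips to the raw text:
In [ ]:
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
recovered = ''.join(
    reverse_input_char_index[idx] for idx in
    encoder_input_data[0].argmax(axis=-1)[:len(input_texts[0])])
assert recovered == input_texts[0]  # padded timesteps (all-zero rows) are sliced off before decoding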
In [32]:
# model = construct_seq2seq_model(num_encoder_tokens, num_decoder_tokens)
from keras.layers import Input, LSTM, Dense

batch_size = 64    # batch size for training
epochs = 100       # number of epochs to train for
num_neurons = 256  # dimensionality of the LSTM hidden and cell states

encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(num_neurons, return_state=True)
_, state_h, state_c = encoder(encoder_inputs)  # discard outputs, keep final states
encoder_states = [state_h, state_c]            # the "thought vector"

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(num_neurons, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
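Before training, it's worth confirming the graph is wired as intended:
In [ ]:
model.summary()  # expect two Input layers, two LSTMs of num_neurons units, and a softmax Dense head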
In [8]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['acc'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
In [9]:
model_path = os.path.join(DATA_PATH, 'ch10_train_seq2seq_keras')
model.save(model_path + '_model.h5')
In [ ]:
model.save_weights(model_path + '_weights.h5')
In [33]:
from keras.models import load_model
model_path = os.path.join(DATA_PATH, 'ch10_train_seq2seq_keras')
model = load_model(model_path + '_model.h5')
In [34]:
model.load_weights(model_path + '_weights.h5')
In [35]:
encoder_model = Model(encoder_inputs, encoder_states)
thought_input = [
    Input(shape=(num_neurons,)), Input(shape=(num_neurons,))]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=thought_input)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs] + thought_input,
    outputs=[decoder_outputs] + decoder_states)
In [ ]:
start_token = '\t'  # the "start sequence" character prepended to every target
stop_token = '\n'   # the "end sequence" character appended to every target
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

def decode_sequence(input_seq):
    thought = encoder_model.predict(input_seq)  # encode the input into the thought vector (h, c)
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index[start_token]] = 1.  # seed the decoder with the start token
    stop_condition = False
    generated_sequence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + thought)  # one decoding step
        generated_token_idx = np.argmax(output_tokens[0, -1, :])
        generated_char = reverse_target_char_index[generated_token_idx]
        generated_sequence += generated_char
        if (generated_char == stop_token or
                len(generated_sequence) > max_decoder_seq_length):  # stop on the stop token or at max length
            stop_condition = True
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, generated_token_idx] = 1.  # feed the generated character back in
        thought = [h, c]  # carry the updated states to the next step
    return generated_sequence
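To converse with the model, a prompt has to be one-hot encoded the same way as the training inputs; a minimal sketch (the respond helper is illustrative, not from the original listing):
In [ ]:
def respond(input_text):
    input_seq = np.zeros(
        (1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    for t, char in enumerate(input_text):
        input_seq[0, t, input_token_index[char]] = 1.  # characters unseen in training raise KeyError
    return decode_sequence(input_seq)

print(respond('hi.'))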