The Unreasonable Effectiveness of Recurrent Neural Networks: http://karpathy.github.io/2015/05/21/rnn-effectiveness/
Understanding LSTM Networks, a great blog post by a Distill co-founder: https://colah.github.io/posts/2015-08-Understanding-LSTMs/
LSTM variants, hyperparameter tuning etc.: "LSTM: A Search Space Odyssey" (Greff et al.)
Minimal char-rnn in Numpy: https://gist.github.com/karpathy/d4dee566867f8291f086
Minimal char-rnn in Keras: https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py
Dataset: http://deron.meranda.us/data/census-derived-all-first.txt
More name list links: https://stackoverflow.com/questions/1803628/raw-list-of-person-names
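The code below expects the name list at data/first_names.txt. A minimal Python 3 sketch for fetching and preparing it (assuming the census URL above is still reachable and that each line starts with the name followed by frequency columns; lower-casing is an arbitrary choice here):
import os
import urllib.request

url = "http://deron.meranda.us/data/census-derived-all-first.txt"
raw = urllib.request.urlopen(url).read().decode("utf-8")
#keep only the first column (the name itself), one name per line
names = [line.split()[0].lower() for line in raw.splitlines() if line.strip()]
os.makedirs("data", exist_ok=True)
with open("data/first_names.txt", "w") as f:
    f.write("\n".join(names))
print("Saved", len(names), "names to data/first_names.txt")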
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sys
import time
import random
In [2]:
import tensorflow as tf
import tensorflow.contrib.keras as keras
from tensorflow.contrib.keras import backend as K
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense, LSTM, Activation
In [3]:
print("TensorFlow version =", tf.__version__)
#print("TF contrib Keras version =",keras.__version__) ???
print("Python version =",sys.version)
print("Keras backend =", keras.backend.backend())
In [4]:
def vector_to_char(vec, ix_to_char):
"""Returns most probable character represented by the 'one-hot' vector of probabilities."""
return ix_to_char[np.argmax(vec)]
In [5]:
filename = 'data/first_names.txt'
data = open(filename, 'r').read() #should be a simple plain-text file
print("Loaded data from", filename)
data = data.replace('\n', ' ') #change '\n' to ' ' for better readability
chars = sorted(list(set(data)))
data_size, vocab_size = len(data), len(chars)
print("Data has {} characters, {} unique.".format(data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
print("Turn an array of characters to an array of numbers:")
data_ix = [char_to_ix[char] for char in data]
print(" data[0]={} has been turned to: \n data_ix[0]={}".format(data[0], data_ix[0]))
In [6]:
seq_len = 20 #length of the character sequence fed to the RNN before it is asked to predict the next character during training
step_size = 3 #first sequence starts at index 0, next one at index 3 etc...
print("Cutting text into sequences of seq_len={} overlapping after each step_size={} characters:".format(seq_len, step_size))
sequences = [] #list of sequences
correct_next_char = [] #correct next char after each sequence - this will be used as target data to train the RNN
for i in range(0, len(data) - seq_len, step_size):
    sequences.append(data_ix[i:i+seq_len])
    correct_next_char.append(data_ix[i+seq_len])
sequences = np.array(sequences)
correct_next_char = np.array(correct_next_char)
print(" Sequences shape =", sequences.shape, sequences.dtype)
print(" correct_next_char shape =", correct_next_char.shape, correct_next_char.dtype)
In [7]:
print("Vectorizing = transforming sequences and next_char to one-hot encoding:")
seq_one_hot = np.zeros(shape=(len(sequences), seq_len, vocab_size), dtype=np.bool)
next_char_one_hot = np.zeros(shape=(len(sequences), vocab_size), dtype=np.bool)
for s in range(len(sequences)):
    for c in range(seq_len):
        idx = sequences[s,c] #index of c-th char in s-th sequence
        seq_one_hot[s,c,idx] = 1
    next_char_idx = correct_next_char[s] #index of next char after the s-th sequence
    next_char_one_hot[s,next_char_idx] = 1
x = seq_one_hot
y = next_char_one_hot
print(" sequences[0,0]={} has been turned to: \n seq_one_hot[0,0]={}".format(sequences[0,0], seq_one_hot[0,0]))
print(" correct_next_char[0]={} has been turned to: \n next_char_one_hot[0]={}".format(correct_next_char[0], next_char_one_hot[0]))
print(" x = seq_one_hot shape =", seq_one_hot.shape, seq_one_hot.dtype)
print(" y = next_char_one_hot shape =", next_char_one_hot.shape, next_char_one_hot.dtype)
In [8]:
def build_model(neurons, seq_len, vocab_size):
    print('Building single layer LSTM model with {} neurons...'.format(neurons))
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(seq_len, vocab_size)))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.RMSprop(lr=0.01))
    print("LSTM layer input shape =", (seq_len, vocab_size))
    print("Dense layer with vocab_size={} neurons and 'softmax' activation".format(vocab_size))
    return model
In [9]:
neurons = 50
model = build_model(neurons, seq_len, vocab_size)
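Keras can print the resulting architecture with its output shapes and parameter counts, which is a quick way to verify the model was built as intended:
#inspect the layer shapes and parameter counts of the model built above
model.summary()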
In [19]:
def train_model(epochs, batch_size, generate_chars, model, data, char_to_ix, ix_to_char, seq_len, vocab_size):
print("Started training for {} epochs with batch size = {}".format(epochs, batch_size))
for epoch in range(epochs):
print("")
print("-"*30)
print("Epoch", epoch)
model.fit(x, y, batch_size=batch_size, epochs=1)
seq_start_index = random.randint(0, len(data) - seq_len - 1)
sentence = data[seq_start_index: seq_start_index + seq_len] #sentence = sequence
print('----- Generating with seed: "' + sentence + '"')
for i in range(generate_chars):
x_pred = np.zeros((1, seq_len, vocab_size)) #single sequence will be passed to trained RNN
for t, char in enumerate(sentence): #turn sequence to one-hot
x_pred[0, t, char_to_ix[char]] = 1. #sequence is from data = characters not numbers
preds = model.predict(x_pred, verbose=0)[0] #get next_char prediction = vector of probabilities
#next_index = sample(preds, diversity) #TODO implement sample function supporting different diversities
next_char = vector_to_char(preds, ix_to_char)
sentence = sentence[1:] + next_char
sys.stdout.write(next_char)
sys.stdout.flush()
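The commented-out TODO above refers to temperature ("diversity") sampling as used in the Keras lstm_text_generation example linked at the top. A sketch of such a sample function (the diversity parameter is not wired into this notebook yet); in the generation loop it would replace vector_to_char(preds, ix_to_char) with ix_to_char[sample(preds, diversity)]:
def sample(preds, diversity=1.0):
    """Sample the next character index from the predicted distribution instead of taking argmax.
    Lower diversity -> more conservative output, higher diversity -> more varied output."""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / diversity  #rescale log-probabilities by the temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)     #renormalize to a valid probability distribution
    return np.argmax(np.random.multinomial(1, preds, 1))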
In [10]:
epochs = 20
batch_size = 128
generate_chars = 50 #How many characters should be generated after each epoch
print("Started training for {} epochs with batch size = {}".format(epochs, batch_size))
for epoch in range(epochs):
print("")
print("-"*30)
print("Epoch", epoch)
model.fit(x, y, batch_size=batch_size, epochs=1)
seq_start_index = random.randint(0, len(data) - seq_len - 1)
sentence = data[seq_start_index: seq_start_index + seq_len] #sentence = sequence
print('----- Generating with seed: "' + sentence + '"')
for i in range(generate_chars):
x_pred = np.zeros((1, seq_len, vocab_size)) #single sequence will be passed to trained RNN
for t, char in enumerate(sentence): #turn sequence to one-hot
x_pred[0, t, char_to_ix[char]] = 1. #sequence is from data = characters not numbers
preds = model.predict(x_pred, verbose=0)[0] #get next_char prediction = vector of probabilities
#next_index = sample(preds, diversity) #TODO implement sample function supporting different diversities
next_char = vector_to_char(preds, ix_to_char)
sentence = sentence[1:] + next_char
sys.stdout.write(next_char)
sys.stdout.flush()
In [18]:
model_backup = model #note: this only stores a reference to the same model object, not a copy
epochs = 20
batch_size = 128
generate_chars = 250 #5x more characters generated after each epoch
train_model(epochs, batch_size, generate_chars, model, data, char_to_ix, ix_to_char, seq_len, vocab_size)
In [20]:
model_backup = model #note: this only stores a reference to the same model object, not a copy
epochs = 20
batch_size = 128
generate_chars = 250 #5x more characters generated after each epoch
train_model(epochs, batch_size, generate_chars, model, data, char_to_ix, ix_to_char, seq_len, vocab_size)
In [22]:
generate_chars = 1000
sentence = data[seq_start_index: seq_start_index + seq_len] #sentence = sequence; seq_start_index is reused from the last training cell
print('----- Generating with seed: "' + sentence + '"')
for i in range(generate_chars):
    x_pred = np.zeros((1, seq_len, vocab_size)) #single sequence will be passed to trained RNN
    for t, char in enumerate(sentence): #turn sequence to one-hot
        x_pred[0, t, char_to_ix[char]] = 1. #sequence is from data = characters not numbers
    preds = model.predict(x_pred, verbose=0)[0] #get next_char prediction = vector of probabilities
    #next_index = sample(preds, diversity) #TODO implement sample function supporting different diversities
    next_char = vector_to_char(preds, ix_to_char)
    sentence = sentence[1:] + next_char
    sys.stdout.write(next_char)
    sys.stdout.flush()
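To keep the trained weights between sessions, the model can be written to disk (a sketch; the filename is arbitrary and saving to HDF5 requires the h5py package):
#persist the trained model (architecture + weights + optimizer state) to an HDF5 file
model.save('char_rnn_first_names.h5')
#later it can be restored without rebuilding:
#model = keras.models.load_model('char_rnn_first_names.h5')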
In [ ]: