This notebook trains an LSTM RNN on a corpus of Steam game store long descriptions and uses it to generate new descriptions.
Based on this blog post: http://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/ and this example: https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py
Things tried:
In [1]:
%matplotlib inline
import dataset
import keras
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
from pathlib import Path
In [2]:
db = dataset.connect(os.environ['POSTGRES_URI'])
Pull texts from our database. Limit the number of descriptions pulled so we don't run out of memory, and order randomly so we get a random sample. Keep only descriptions with Metacritic scores, which hopefully cuts out a lot of the really tiny indie games with broken English in their descriptions; the query also drops descriptions whose length falls outside the 1st–99th percentile. This biases us toward AAA games, but I think that's fine for the purpose of generating stereotypical game descriptions, and there are still plenty to choose from.
In [3]:
description_query = '''
WITH filtered_games AS (
SELECT *
FROM game_crawl
WHERE is_dlc = FALSE
AND game_name IS NOT NULL
AND metacritic_score IS NOT NULL
),
lower_length_limit AS (
SELECT percentile_cont(0.01) WITHIN GROUP (ORDER BY length(long_description)) AS lower_limit
FROM filtered_games
),
upper_length_limit AS (
SELECT percentile_cont(0.99) WITHIN GROUP (ORDER BY length(long_description)) AS upper_limit
FROM filtered_games
)
SELECT *
FROM filtered_games
WHERE length(long_description)
BETWEEN (SELECT lower_limit FROM lower_length_limit)
AND (SELECT upper_limit FROM upper_length_limit)
ORDER BY random()
LIMIT 1000
'''
corpus = [r['long_description'] for r in db.query(description_query)]
print(len(corpus))
print(corpus[:1])
Check the distribution of description lengths to make sure we didn't get any crazy outliers.
In [4]:
pd.Series(corpus).apply(len).plot(kind='hist')
Out[4]:
Apply some light cleaning (strip unusual characters, collapse repeated whitespace) to help the model out.
In [5]:
bad_char_re = re.compile(r'[^-a-zA-Z0-9 !.,?\n:()]')
multi_spaces_re = re.compile(r'(\s){2,}')
def clean_description(description):
    # Drop any characters outside the allowed set
    filtered_description = bad_char_re.sub('', description)
    # Collapse runs of two or more whitespace characters down to at most two
    filtered_description = multi_spaces_re.sub(r'\1\1', filtered_description)
    return filtered_description
cleaned_corpus = [clean_description(d) for d in corpus]
del corpus
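As a quick sanity check (this snippet and its example string are illustrative additions, not part of the original run), confirm the regexes behave as expected:
# Hypothetical example string exercising both regexes
sample_text = 'Fight  hordes™ of   enemies...\n\n\nNow with 50% more explosions!'
print(clean_description(sample_text))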
Create a mapping of unique chars to integers
In [6]:
joined_corpus = '\n'.join(cleaned_corpus)
del cleaned_corpus
chars = sorted(list(set(joined_corpus)))
print(chars)
char_to_int = dict((c, i) for i, c in enumerate(chars))
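A quick peek at the encoding (illustrative only): look up the integer codes for the first few characters of the corpus.
# Show how the first few characters of the corpus map to integers
print([(c, char_to_int[c]) for c in joined_corpus[:20]])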
Total number of characters in the corpus.
In [7]:
n_chars = len(joined_corpus)
print(n_chars)
Total number of characters in the vocab
In [8]:
n_vocab = len(chars)
print(n_vocab)
Prepare the dataset of input to output pairs encoded as integers
In [9]:
seq_length = 140
step = 1
data_x = []
data_y = []
for i in tqdm(range(0, n_chars - seq_length, step)):
    start = i
    end = i + seq_length
    seq_in = joined_corpus[start:end]
    seq_out = joined_corpus[end]
    data_x.append([char_to_int[char] for char in seq_in])
    data_y.append(char_to_int[seq_out])
n_patterns = len(data_x)
print(n_patterns)
del joined_corpus
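To verify the windowing, we can decode one training pair back to text (an optional check, not in the original notebook). Since chars is sorted, indexing into it inverts char_to_int:
# Decode the first input window and its target character back to text
print(''.join(chars[i] for i in data_x[0]))
print('-> next char: {!r}'.format(chars[data_y[0]]))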
Reshape the X array to be [samples, time steps, features], normalize, and one-hot encode the output
In [10]:
def transform_text_samples(text_samples, n_patterns, seq_length):
    return np.reshape(text_samples, (n_patterns, seq_length, 1)) / float(n_vocab)
X = transform_text_samples(data_x, n_patterns, seq_length)
y = keras.utils.np_utils.to_categorical(data_y)
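A quick look at the resulting shapes (an optional check): X should be (n_patterns, seq_length, 1), and y has one column per character that appears as a target, which will normally equal n_vocab.
print(X.shape, y.shape)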
Define the model
In [11]:
model = keras.models.Sequential()
model.add(keras.layers.LSTM(128, input_shape=(X.shape[1], X.shape[2]), return_sequences=True, implementation=2))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.LSTM(128, return_sequences=True, implementation=2))
model.add(keras.layers.LSTM(128, implementation=2))
model.add(keras.layers.Dense(y.shape[1], activation='softmax'))
# optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
checkpoint_path = Path('models', 'weights-improvement-{epoch:02d}-{loss:.4f}.hdf5')
checkpoint = keras.callbacks.ModelCheckpoint(str(checkpoint_path), monitor='loss',
                                             verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
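Optionally, inspect the layer stack and parameter counts before training (not part of the original notebook):
model.summary()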
In [12]:
model.fit(X, y, epochs=60, batch_size=128, callbacks=callbacks_list)
In [13]:
# Load the best set of weights saved by the checkpoint callback and recompile
filename = Path('models', 'weights-improvement-31-1.6567.hdf5')
model.load_weights(str(filename))
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
Generate a reverse mapping for ints to chars
In [14]:
int_to_char = dict((i, c) for i, c in enumerate(chars))
Generate predictions from a seed sequence
In [15]:
start = np.random.randint(0, len(data_x) - 1)
seed_pattern = data_x[start]
print("Seed:\n{}".format(''.join(int_to_char[value] for value in seed_pattern)))
num_generated_chars = 1000
def sample(preds, temperature=1.0):
    # Sample an index from a probability array, reweighted by temperature
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds)
    return np.argmax(probas)
for diversity in (0.05, 0.1, 0.2, 0.5, 0.75, 1.0, 1.2):
    # Restart each run from a copy of the same seed so the diversities are comparable
    pattern = list(seed_pattern)
    generated_str = ''
    for i in range(num_generated_chars):
        x = transform_text_samples(pattern, 1, len(pattern))
        prediction = model.predict(x, verbose=0)
        index = sample(prediction[0], temperature=diversity)
        result = int_to_char[index]
        generated_str += result
        # Slide the input window forward by one character
        pattern.append(index)
        pattern = pattern[1:]
    print("\n\nResult (diversity {}):\n{}".format(diversity, generated_str))