In [1]:
import numpy as np
from collections import Counter
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
In [2]:
# Load the raw text corpus.  A context manager guarantees the file handle
# is closed even if the read fails (the original left it open forever).
with open('corpus.txt', mode='r', encoding="utf8") as corpus_file:
    corpus = corpus_file.read()
# Case is deliberately preserved (no .lower()) so the model learns
# capitalisation as part of the character vocabulary.
print(corpus[:45])
In [3]:
# Build the character vocabulary.  set() iteration order depends on
# PYTHONHASHSEED, so with a bare set the char<->int mappings would differ
# between interpreter runs, silently invalidating any saved checkpoint.
# Sorting makes the encoding deterministic and reproducible.
vocab = sorted(set(corpus))
num_classes = len(vocab)
print(vocab)
print(num_classes)
# Forward (char -> id) and reverse (id -> char) lookup tables.
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = {i: c for i, c in enumerate(vocab)}
# Encode the whole corpus as a flat list of integer character ids.
encoded = [vocab_to_int[c] for c in corpus]
textsize = len(encoded)
print(encoded[:45])
print(textsize)
In [62]:
# Hyperparams
# samples = 1300
steps = 50        # input window length: characters fed in per training sample
dropout = 0.5     # dropout rate applied after the LSTM layer
epochs = 150      # number of training epochs for model.fit
batch_size = 256  # minibatch size for model.fit
In [63]:
# Build sliding-window samples: each input is `steps` consecutive character
# ids and the target is the single character that follows that window.
sample_range = range(textsize - steps)
windows = [encoded[start:start + steps] for start in sample_range]
y = [encoded[start + steps] for start in sample_range]
# Shape (num_samples, steps, 1) as the LSTM expects, scaled into [0, 1).
X = np.reshape(windows, (len(windows), steps, 1))
X = X / float(num_classes)
X_train = X
y_train = np.eye(num_classes)[y]  # one-hot encode the targets
print(X_train.shape)
print(y_train.shape)
In [64]:
Counter(y)
Out[64]:
In [65]:
# Single-layer character-level LSTM: 512 units over (steps, 1) inputs,
# dropout for regularisation, softmax over the character vocabulary.
# (Commented-out experimental layers removed — dead code in a notebook
# obscures the architecture that was actually trained.)
model = Sequential()
model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(dropout))
model.add(Dense(num_classes, activation='softmax'))
# Targets are dense one-hot vectors, hence categorical (not sparse)
# crossentropy.
model.compile(loss="categorical_crossentropy", optimizer="adam")
In [66]:
# Checkpoint callback: save weights whenever training loss improves.
# Epoch number and loss are embedded in the filename so the best
# checkpoint is easy to identify after training.
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
In [ ]:
model.fit(X_train, y_train, batch_size = batch_size, epochs=epochs, callbacks=callbacks_list)
Out[ ]:
In [68]:
# Sanity check: predict the next character for one (arbitrary) training
# window and decode it back to a character.
y_pred = model.predict(np.reshape(X_train[102,:,:],(1,steps,1)))
# print(y_pred)
y_transformed = np.argmax(y_pred)  # greedy decode: most probable class id
# print(y_transformed)
print(int_to_vocab[y_transformed])
In [74]:
# Pick a random window of the corpus as the generation seed and encode it
# exactly the way the training inputs were prepared (ids scaled to [0, 1)).
seed = np.random.randint(textsize - steps)
seed_chars = corpus[seed:seed + steps]
seed_ids = [vocab_to_int[ch] for ch in seed_chars]
x_seed = np.reshape(seed_ids, (1, steps, 1)) / float(num_classes)
print(x_seed.shape)
In [75]:
# Generate `charsize` characters: repeatedly predict the next character,
# then slide the input window one step by dropping the oldest timestep
# and appending the (rescaled) prediction.
x = x_seed
out = []
charsize = 1000
for i in range(charsize):
    y_pred = model.predict(x)
    y_transformed = np.argmax(y_pred)  # greedy decoding
    output = int_to_vocab[y_transformed]
    out.append(output)
    # Rescale the predicted class id the same way the inputs were scaled.
    x_new = y_transformed / float(num_classes)
    # BUG FIX: the slice was hard-coded as x[:, 1:100, :]; it only worked
    # because steps (50) happens to be < 100 and NumPy clamps the slice.
    # x[:, 1:, :] drops the oldest timestep for any window length.
    x = np.append(x[:, 1:, :], np.reshape(x_new, (1, 1, 1)), axis=1)
print('completed')
In [73]:
# Display the generated text.
# NOTE(review): this cell's execution count (In [73]) precedes the
# generation loop's (In [75]) — the saved output reflects a stale run;
# re-execute after Restart & Run All to get a consistent result.
print(len(out))
print(''.join(out))
In [ ]: