Inspired by https://machinelearningmastery.com



In [ ]:

    
import numpy as np
import string
import time
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences



In [81]:

    
# The 26 uppercase ASCII letters, in order; the vocabulary for the whole notebook.
alphabet = string.ascii_uppercase



In [82]:

    
# Show the vocabulary string as the cell output.
alphabet









    Out[82]:





'ABCDEFGHIJKLMNOPQRSTUVWXYZ'



In [83]:

    
# Encoder: each letter -> its 0-based position in `alphabet`.
map_to_int = dict(zip(alphabet, range(len(alphabet))))



In [84]:

    
# Decoder: 0-based position in `alphabet` -> the letter at that position.
map_to_char = dict(enumerate(alphabet))



In [85]:

    
# Generate the training data: each input is a run of consecutive letters of
# varying length (1..max_lenth); its label is the letter that follows the run.

max_lenth = 5
samples = 1000
batch_size = 1
SEED = 7

# Dedicated, seeded generator so that "Restart & Run All" rebuilds the same
# dataset without touching NumPy's global random state.
rng = np.random.RandomState(SEED)

X, y = [], []

for _ in range(samples):
    # randint's upper bound is exclusive, so start falls in 0..len(alphabet) - 2.
    # That range includes 'Y', the last letter that still has a successor.
    start = rng.randint(len(alphabet) - 1)
    # Cap end so that the input holds at most max_lenth letters and
    # alphabet[end + 1] (the label) always exists.
    end = rng.randint(start, min(start + max_lenth, len(alphabet) - 1))
    seq_in = alphabet[start: end + 1]
    seq_out = alphabet[end + 1]
    X.append([map_to_int[i] for i in seq_in])
    # Labels are kept as one-element lists, one per sample.
    y.append([map_to_int[i] for i in seq_out])



In [86]:

    
# Zero-pad every input to a fixed length of max_lenth so they stack into one array.
X = pad_sequences(X, maxlen=max_lenth, dtype='float32')
# normalization: letter codes 0..len(alphabet) - 1 are scaled into [0, 1)
X = X/len(alphabet)
# reshaping [samples, time steps, features]
X = X.reshape(X.shape[0], max_lenth, 1)
# One-hot encode the labels. The class count is fixed to the alphabet size;
# otherwise it is inferred from the largest label that happened to be sampled,
# and the output width would shrink whenever 'Z' is missing from the draw.
y = np_utils.to_categorical(y, num_classes=len(alphabet))



In [98]:

    
def build_model(time_steps=None, num_classes=None, lstm_units=32):
    """Build and compile the next-letter classifier.

    The network is a single LSTM layer followed by a softmax Dense layer,
    compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    Args:
        time_steps: Length of each input sequence. Defaults to ``X.shape[1]``
            of the notebook-level training array.
        num_classes: Width of the softmax output. Defaults to ``y.shape[1]``
            of the notebook-level one-hot label array.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    # Fall back to the notebook-level arrays so a bare build_model() call
    # behaves exactly as before.
    if time_steps is None:
        time_steps = X.shape[1]
    if num_classes is None:
        num_classes = y.shape[1]

    model = Sequential()
    # One feature per time step: the normalized letter code.
    model.add(LSTM(lstm_units, input_shape=(time_steps, 1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



In [99]:

    
# Create a fresh, untrained model; re-running this cell discards any trained weights.
model = build_model()

Training was run on a CPU (4 cores).



In [100]:

    
# Fit the model, score it on the same data it was trained on, and report
# how long the whole cell took.
train_started = time.time()

model.fit(X, y, epochs=800, batch_size=batch_size, verbose=0)

# With metrics=['accuracy'] set at compile time, index 1 of the result is the accuracy.
scores = model.evaluate(X, y, verbose=0)
accuracy_pct = scores[1] * 100
print("Model Accuracy: %.2f%%" % accuracy_pct)

elapsed_minutes = (time.time() - train_started) / 60
print("It took: {} minutes".format(elapsed_minutes))









    



Model Accuracy: 100.00%
It took: 167.41289522250494 minutes



In [129]:

    
def predict_next(letters):
    """Return the letter the trained model predicts to follow `letters`.

    The input is encoded, padded, reshaped and scaled the same way as the
    training data before it is passed to the model.
    """
    codes = [map_to_int[ch] for ch in letters]
    padded = pad_sequences([codes], maxlen=max_lenth, dtype='float32')
    # A batch of one sample: [samples, time steps, features], scaled like the training inputs.
    model_input = padded.reshape(1, max_lenth, 1) / float(len(alphabet))
    probabilities = model.predict(model_input, verbose=0)
    # The most probable class index maps straight back to a letter.
    return map_to_char[np.argmax(probabilities)]


for letters in ['B', "BCD", 'STU', 'R', 'DEFGH']:
    print("After {} comes {}".format(letters, predict_next(letters)))









    



After B comes C
After BCD comes E
After STU comes V
After R comes S
After DEFGH comes I