In [1]:
import numpy as np
from collections import Counter
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM


Using TensorFlow backend.

Reading the Corpus


In [2]:
with open('corpus.txt', mode='r', encoding='utf8') as corpus_file:
    corpus = corpus_file.read()
# corpus = corpus.lower()
print(corpus[:45])


THE BLACK CAT.

BY EDGAR ALLAN POE.


For th

In [3]:
vocab = set(corpus)
num_classes = len(vocab)
print(vocab)
print(num_classes)

# Map each character to an integer code and back.
# (Note: set iteration order is not stable across runs; sorted(vocab)
# would make the mapping reproducible.)
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = {i: c for i, c in enumerate(vocab)}

encoded = [vocab_to_int[c] for c in corpus]
textsize = len(encoded)
print(encoded[:45])
print(textsize)


{'\n', 'm', '$', 'A', ' ', 'B', ':', 'c', 'P', '7', 'I', 'W', 'R', 'b', 'V', 'N', 'O', '8', 't', 'i', '[', 'n', '*', 'k', 'J', '(', '\ufeff', 'H', 'E', 'L', ',', '!', 'K', 'o', 'M', ';', '_', 'a', 'T', 'x', '’', '9', '”', 'd', 'z', 'u', 'g', 'G', 'Y', 'p', 'j', '.', '‘', 'y', '2', '-', '?', ')', ']', 'F', 'D', 'C', 'v', 'e', '0', 'S', 'l', 'h', 'w', '6', 's', '“', '1', 'f', '3', 'r', 'U', '5', 'q'}
79
[26, 38, 27, 28, 4, 5, 29, 3, 61, 32, 4, 61, 3, 38, 51, 0, 0, 5, 48, 4, 28, 60, 47, 3, 12, 4, 3, 29, 29, 3, 15, 4, 8, 16, 28, 51, 0, 0, 0, 59, 33, 75, 4, 18, 67]
51528
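As a quick sanity check (added here, not part of the original run), the two lookup tables should round-trip the whole corpus:

# Decoding the integer codes should reproduce the original text exactly
decoded = ''.join(int_to_vocab[i] for i in encoded)
assert decoded == corpus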

In [62]:
# Hyperparameters
steps = 50        # length of each input character window
dropout = 0.5     # dropout rate applied after the LSTM layer
epochs = 150
batch_size = 256

In [63]:
X = []
y = []

# Slide a window of `steps` characters over the text; each window's target
# is the character that immediately follows it.
for i in range(textsize - steps):
    X.append(encoded[i : i + steps])
    y.append(encoded[i + steps])

# Reshape to (samples, timesteps, features) and scale the codes into [0, 1]
X = np.reshape(X, (len(X), steps, 1))
X = X / float(num_classes)

X_train = X
y_train = np.eye(num_classes)[y]  # one-hot encode the targets

print(X_train.shape)
print(y_train.shape)


(51478, 50, 1)
(51478, 79)
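Note that np.eye(num_classes)[y] builds the same one-hot matrix as Keras' own utility; a minimal equivalence check (a sketch, assuming np_utils.to_categorical is available in this Keras version):

from keras.utils import np_utils
y_train_alt = np_utils.to_categorical(y, num_classes)
assert (y_train_alt == y_train).all()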

In [64]:
Counter(y)


Out[64]:
Counter({0: 918,
         1: 1185,
         2: 3,
         3: 71,
         4: 8584,
         5: 45,
         6: 13,
         7: 1007,
         8: 32,
         9: 3,
         10: 318,
         11: 41,
         12: 11,
         13: 584,
         14: 3,
         15: 19,
         16: 26,
         17: 2,
         18: 3574,
         19: 2471,
         20: 4,
         21: 2670,
         22: 15,
         23: 288,
         24: 4,
         25: 3,
         27: 34,
         28: 22,
         29: 9,
         30: 750,
         31: 47,
         32: 10,
         33: 2942,
         34: 29,
         35: 33,
         36: 8,
         37: 3083,
         38: 108,
         39: 77,
         40: 73,
         41: 1,
         42: 74,
         43: 1949,
         44: 14,
         45: 1085,
         46: 797,
         47: 15,
         48: 17,
         49: 689,
         50: 34,
         51: 461,
         52: 11,
         53: 821,
         54: 4,
         55: 222,
         56: 23,
         57: 3,
         58: 4,
         59: 28,
         60: 14,
         61: 7,
         62: 349,
         63: 5113,
         64: 3,
         65: 53,
         66: 1549,
         67: 2423,
         68: 901,
         69: 1,
         70: 2279,
         71: 81,
         72: 2,
         73: 1007,
         74: 1,
         75: 2286,
         76: 8,
         77: 2,
         78: 28})
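The counts above show a heavy class imbalance: the space character (class 4 in this run) dominates, while several characters occur only once or twice. One optional mitigation, sketched here but not used in the training run below, is inverse-frequency class weights (class_weight is a standard argument of Keras' fit):

counts = Counter(y)
total = float(len(y))
class_weight = {c: total / (num_classes * n) for c, n in counts.items()}
# model.fit(..., class_weight=class_weight)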

In [65]:
model = Sequential()
# A deeper two-layer variant that was also tried:
# model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
# model.add(Dropout(dropout))
# model.add(LSTM(256))
# model.add(Dropout(dropout))
model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(dropout))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [66]:
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Save weights whenever the training loss improves
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
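Since there is no validation split, the checkpoint monitors the training loss. Two optional additions, sketched here but not used in the run below: early stopping on the same quantity, and restoring a previous run's best checkpoint before fitting (the filename below is hypothetical):

from keras.callbacks import EarlyStopping

# Stop if the training loss fails to improve for five epochs
early_stop = EarlyStopping(monitor='loss', patience=5, mode='min')
# callbacks_list = [checkpoint, early_stop]

# Resume from an earlier checkpoint (hypothetical filename):
# model.load_weights('weights-improvement-50-1.4144.hdf5')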

In [ ]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list)


Epoch 1/150
51478/51478 [==============================] - 32s - loss: 3.1766      
Epoch 2/150
51478/51478 [==============================] - 30s - loss: 3.1016     
Epoch 3/150
51478/51478 [==============================] - 30s - loss: 3.0714     
Epoch 4/150
51478/51478 [==============================] - 30s - loss: 3.0121     
Epoch 5/150
51478/51478 [==============================] - 30s - loss: 2.9746     
Epoch 6/150
51478/51478 [==============================] - 30s - loss: 2.9457     
Epoch 7/150
51478/51478 [==============================] - 30s - loss: 2.9212     
Epoch 8/150
51478/51478 [==============================] - 30s - loss: 2.8995     
Epoch 9/150
51478/51478 [==============================] - 30s - loss: 2.8780     
Epoch 10/150
51478/51478 [==============================] - 30s - loss: 2.8630     
Epoch 11/150
51478/51478 [==============================] - 30s - loss: 2.8456     
Epoch 12/150
51478/51478 [==============================] - 30s - loss: 2.8260     
Epoch 13/150
51478/51478 [==============================] - 30s - loss: 2.8097     
Epoch 14/150
51478/51478 [==============================] - 30s - loss: 2.7917     
Epoch 15/150
51478/51478 [==============================] - 30s - loss: 2.7761     
Epoch 16/150
51478/51478 [==============================] - 30s - loss: 2.7582     
Epoch 17/150
51478/51478 [==============================] - 30s - loss: 2.7391     
Epoch 18/150
51478/51478 [==============================] - 30s - loss: 2.7220     
Epoch 19/150
51478/51478 [==============================] - 30s - loss: 2.7056     
Epoch 20/150
51478/51478 [==============================] - 30s - loss: 2.6821     
Epoch 21/150
51478/51478 [==============================] - 30s - loss: 2.6624     
Epoch 22/150
51478/51478 [==============================] - 30s - loss: 2.6387     
Epoch 23/150
51478/51478 [==============================] - 30s - loss: 2.6133     
Epoch 24/150
51478/51478 [==============================] - 30s - loss: 2.5901     
Epoch 25/150
51478/51478 [==============================] - 30s - loss: 2.5608     
Epoch 26/150
51478/51478 [==============================] - 30s - loss: 2.5267     
Epoch 27/150
51478/51478 [==============================] - 30s - loss: 2.4899     
Epoch 28/150
51478/51478 [==============================] - 30s - loss: 2.4545     
Epoch 29/150
51478/51478 [==============================] - 30s - loss: 2.4150     
Epoch 30/150
51478/51478 [==============================] - 30s - loss: 2.3697     
Epoch 31/150
51478/51478 [==============================] - 30s - loss: 2.3268     
Epoch 32/150
51478/51478 [==============================] - 30s - loss: 2.2668     
Epoch 33/150
51478/51478 [==============================] - 30s - loss: 2.2187     
Epoch 34/150
51478/51478 [==============================] - 30s - loss: 2.1731     
Epoch 35/150
51478/51478 [==============================] - 30s - loss: 2.1136     
Epoch 36/150
51478/51478 [==============================] - 30s - loss: 2.0665     
Epoch 37/150
51478/51478 [==============================] - 30s - loss: 2.0177     
Epoch 38/150
51478/51478 [==============================] - 30s - loss: 1.9568     
Epoch 39/150
51478/51478 [==============================] - 30s - loss: 1.8990     
Epoch 40/150
51478/51478 [==============================] - 30s - loss: 1.8496     
Epoch 41/150
51478/51478 [==============================] - 30s - loss: 1.8028     
Epoch 42/150
51478/51478 [==============================] - 30s - loss: 1.7420     
Epoch 43/150
51478/51478 [==============================] - 30s - loss: 1.7019     
Epoch 44/150
51478/51478 [==============================] - 30s - loss: 1.6602     
Epoch 45/150
51478/51478 [==============================] - 30s - loss: 1.6166     
Epoch 46/150
51478/51478 [==============================] - 30s - loss: 1.5671     
Epoch 47/150
51478/51478 [==============================] - 30s - loss: 1.5197     
Epoch 48/150
51478/51478 [==============================] - 30s - loss: 1.4715     
Epoch 49/150
51478/51478 [==============================] - 30s - loss: 1.4451     
Epoch 50/150
51478/51478 [==============================] - 30s - loss: 1.4144     
Epoch 51/150
51478/51478 [==============================] - 30s - loss: 1.2323     
Epoch 56/150
51478/51478 [==============================] - 31s - loss: 1.2010     
Epoch 57/150
51478/51478 [==============================] - 30s - loss: 1.1709     
Epoch 58/150
51478/51478 [==============================] - 30s - loss: 1.1602     
Epoch 59/150
51478/51478 [==============================] - 30s - loss: 1.1171     
Epoch 60/150
51478/51478 [==============================] - 30s - loss: 1.0820     
Epoch 61/150
51478/51478 [==============================] - 30s - loss: 1.0575     
Epoch 62/150
51478/51478 [==============================] - 30s - loss: 1.0594     
Epoch 63/150
51478/51478 [==============================] - 30s - loss: 1.0145     
Epoch 64/150
51478/51478 [==============================] - 30s - loss: 0.9837     
Epoch 65/150
51478/51478 [==============================] - 30s - loss: 0.9681     
Epoch 66/150
51478/51478 [==============================] - 30s - loss: 0.9580     
Epoch 67/150
51478/51478 [==============================] - 30s - loss: 0.9321     
Epoch 68/150
51478/51478 [==============================] - 30s - loss: 0.9053     
Epoch 69/150
51478/51478 [==============================] - 30s - loss: 0.8900     
Epoch 70/150
51478/51478 [==============================] - 30s - loss: 0.8690     
Epoch 71/150
51478/51478 [==============================] - 30s - loss: 0.8493     
Epoch 72/150
51478/51478 [==============================] - 30s - loss: 0.8290     
Epoch 73/150
51478/51478 [==============================] - 30s - loss: 0.8110     
Epoch 74/150
51478/51478 [==============================] - 30s - loss: 0.8096     
Epoch 75/150
51478/51478 [==============================] - 30s - loss: 0.8047     
Epoch 76/150
51478/51478 [==============================] - 30s - loss: 0.7699     
Epoch 77/150
51478/51478 [==============================] - 30s - loss: 0.7554     
Epoch 78/150
51478/51478 [==============================] - 30s - loss: 0.7661     
Epoch 79/150
51478/51478 [==============================] - 30s - loss: 0.7228     
Epoch 80/150
51478/51478 [==============================] - 30s - loss: 0.7201     
Epoch 81/150
51478/51478 [==============================] - 30s - loss: 0.6986     
Epoch 82/150
51478/51478 [==============================] - 30s - loss: 0.6956     
Epoch 83/150
51478/51478 [==============================] - 30s - loss: 0.6702     
Epoch 84/150
51478/51478 [==============================] - 30s - loss: 0.6789     
Epoch 85/150
51478/51478 [==============================] - 30s - loss: 0.6645     
Epoch 86/150
51478/51478 [==============================] - 30s - loss: 0.6431     
Epoch 87/150
51478/51478 [==============================] - 30s - loss: 0.6106     
Epoch 88/150
51478/51478 [==============================] - 30s - loss: 0.6045     
Epoch 89/150
51478/51478 [==============================] - 30s - loss: 0.6123     
Epoch 90/150
51478/51478 [==============================] - 30s - loss: 0.6161     
Epoch 91/150
51478/51478 [==============================] - 30s - loss: 0.5986     
Epoch 92/150
51478/51478 [==============================] - 30s - loss: 0.5727     
Epoch 93/150
51478/51478 [==============================] - 30s - loss: 0.5861     
Epoch 94/150
51478/51478 [==============================] - 30s - loss: 0.5713     
Epoch 95/150
51478/51478 [==============================] - 30s - loss: 0.5588     
Epoch 96/150
51478/51478 [==============================] - 30s - loss: 0.5372     
Epoch 97/150
51478/51478 [==============================] - 30s - loss: 0.5455     
Epoch 98/150
51478/51478 [==============================] - 30s - loss: 0.5313     
Epoch 99/150
51478/51478 [==============================] - 30s - loss: 0.5051     
Epoch 100/150
51478/51478 [==============================] - 30s - loss: 0.5174     
Epoch 101/150
51478/51478 [==============================] - 30s - loss: 0.5019     
Epoch 102/150
51478/51478 [==============================] - 30s - loss: 0.5011     
Epoch 103/150
51478/51478 [==============================] - 30s - loss: 0.4973     
Epoch 104/150
51478/51478 [==============================] - 30s - loss: 0.4896     
Epoch 105/150
51478/51478 [==============================] - 30s - loss: 0.4768     
Epoch 106/150
51478/51478 [==============================] - 30s - loss: 0.4579     
Epoch 107/150
51478/51478 [==============================] - 30s - loss: 0.4914     
Epoch 108/150
51478/51478 [==============================] - 30s - loss: 0.4697     
Epoch 109/150
51478/51478 [==============================] - 30s - loss: 0.4651     
Epoch 110/150
51478/51478 [==============================] - 30s - loss: 0.4445     
Epoch 111/150
51478/51478 [==============================] - 30s - loss: 0.4348     
Epoch 112/150
51478/51478 [==============================] - 30s - loss: 0.4414     
Epoch 113/150
51478/51478 [==============================] - 30s - loss: 0.4400     
Epoch 114/150
51478/51478 [==============================] - 30s - loss: 0.4263     
Epoch 115/150
51478/51478 [==============================] - 30s - loss: 0.4291     
Epoch 116/150
51478/51478 [==============================] - 30s - loss: 0.4321     
Epoch 117/150
51478/51478 [==============================] - 30s - loss: 0.4152     
Epoch 118/150
51478/51478 [==============================] - 30s - loss: 0.4140     
Epoch 119/150
51478/51478 [==============================] - 30s - loss: 0.3805     
Epoch 120/150
51478/51478 [==============================] - 30s - loss: 0.4010     
Epoch 121/150
51478/51478 [==============================] - 30s - loss: 0.3968     
Epoch 122/150
51478/51478 [==============================] - 30s - loss: 0.3974     
Epoch 123/150
51478/51478 [==============================] - 30s - loss: 0.4324     
Epoch 124/150
51478/51478 [==============================] - 30s - loss: 0.3934     
Epoch 125/150
51478/51478 [==============================] - 30s - loss: 0.3762     
Epoch 126/150
51478/51478 [==============================] - 30s - loss: 0.3859     
Epoch 127/150
51478/51478 [==============================] - 30s - loss: 0.3633     
Epoch 128/150
51478/51478 [==============================] - 30s - loss: 0.3723     
Epoch 129/150
51478/51478 [==============================] - 30s - loss: 0.3563     
Epoch 130/150
51478/51478 [==============================] - 30s - loss: 0.3436     
Epoch 131/150
51478/51478 [==============================] - 30s - loss: 0.3501     
Epoch 132/150
51478/51478 [==============================] - 30s - loss: 0.3653     
Epoch 133/150
51478/51478 [==============================] - 30s - loss: 0.3702     
Epoch 134/150
51478/51478 [==============================] - 31s - loss: 0.3619     
Epoch 135/150
51478/51478 [==============================] - 30s - loss: 0.3482     
Epoch 136/150
51478/51478 [==============================] - 30s - loss: 0.3372     
Epoch 137/150
51478/51478 [==============================] - 30s - loss: 0.3432     
Epoch 138/150
51478/51478 [==============================] - 30s - loss: 0.3540     
Epoch 139/150
51478/51478 [==============================] - 30s - loss: 0.3248     
Epoch 140/150
51478/51478 [==============================] - 30s - loss: 0.3235     
Epoch 141/150
51478/51478 [==============================] - 30s - loss: 0.3556     
Epoch 142/150
51478/51478 [==============================] - 30s - loss: 0.3198     
Epoch 143/150
51478/51478 [==============================] - 30s - loss: 0.3077     
Epoch 144/150
51478/51478 [==============================] - 30s - loss: 0.3355     
Epoch 145/150
51478/51478 [==============================] - 30s - loss: 0.3136     
Epoch 146/150
51478/51478 [==============================] - 30s - loss: 0.3120     
Epoch 147/150
51478/51478 [==============================] - 30s - loss: 0.3360     
Epoch 148/150
51478/51478 [==============================] - 30s - loss: 0.3180     
Epoch 149/150
51478/51478 [==============================] - 30s - loss: 0.3093     
Epoch 150/150
51478/51478 [==============================] - 30s - loss: 0.2935     
Out[ ]:
<keras.callbacks.History at 0x1a7617a3940>
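model.fit returns a keras.callbacks.History whose history['loss'] list records the curve shown above; a minimal sketch for plotting it, assuming matplotlib is available and the fit call is re-run with its return value captured:

import matplotlib.pyplot as plt

history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    callbacks=callbacks_list)
plt.plot(history.history['loss'])
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.show()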

In [68]:
# Predict the character that follows training window 102
y_pred = model.predict(np.reshape(X_train[102, :, :], (1, steps, 1)))
y_transformed = np.argmax(y_pred)
print(int_to_vocab[y_transformed])


d
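To see which context produced this character, the corresponding window can be read straight from the corpus, since X_train[102] was built from encoded[102 : 102 + steps]:

print(repr(corpus[102 : 102 + steps]))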

In [74]:
# Pick a random window from the corpus as the generation seed
seed = np.random.randint(textsize - steps)
x_seed = corpus[seed : seed + steps]
x_seed = [vocab_to_int[char] for char in x_seed]
x_seed = np.reshape(x_seed, (1, steps, 1))
x_seed = x_seed / float(num_classes)
print(x_seed.shape)


(1, 50, 1)

In [75]:
x = x_seed
out = []
charsize = 1000

for i in range(charsize):
    # Predict the next character and record it
    y_pred = model.predict(x)
    y_transformed = np.argmax(y_pred)
    out.append(int_to_vocab[y_transformed])
    # Slide the window: drop the oldest character, append the new prediction
    # (the original sliced x[:, 1:100, :]; NumPy clips that to 1:steps anyway)
    x_new = y_transformed / float(num_classes)
    x = np.append(x[:, 1:, :], np.reshape(x_new, (1, 1, 1)), axis=1)

print('completed')


completed
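Greedy argmax decoding always picks the single most likely character, which is why the generated text below tends to loop on common patterns. A common alternative, sketched here but not used above, is to sample from the softmax output with a temperature:

def sample_char(preds, temperature=1.0):
    # Rescale the predicted distribution and draw a character index from it
    preds = np.asarray(preds, dtype='float64')
    preds = np.log(preds + 1e-8) / temperature
    probs = np.exp(preds) / np.sum(np.exp(preds))
    return np.random.choice(len(probs), p=probs)

# e.g. inside the loop above:
# y_transformed = sample_char(y_pred[0], temperature=0.5)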

In [73]:
print(len(out))
print(''.join(out))


1000
owardice, instead of running to the girl’s aid, had hailed a party of
miners who were returning from their mid-day meal through a field near
by hiee, to wave tle ihe leaoreo a dreas pad to beser, and inotserd mat the fpol hh the wise toe it torser. Bo  tntse tt a yareot ond tous boumexer hid hengh  fine ale wolls as aheaca theica
leich dodn thes widlsd of e serr war wtten gnt to anotcerang Ig tos wire. the cors of singw ttt alali and tnomoely Iu t
s sold found, spile ina tie bure toet as the brane of the aeiosties oh aat
the hopes is the serile ur core wocsenng hith my thows ont
tnrore to heak ho sising said siih the mariss tf the oeling cohe oo nhat s serl sound of his anfy tall Io tets ante-aard seal ani his sape ware ioen dithtiy sf mhe wised himo sith ahe chiig antea ttd sas atfanget de a teaat bean dy lad sednced Yee  sisdy weod a wedn end ceaa io dos ao beenr Ieseed cr ihe eaneeh, alee watli of sis ayssicanl 
“eret he hed eoved io anser tos lace tide aotil serp conntse add 
tye e
