In [ ]:
#!/usr/bin/env python3

Hello World! Python Workshops @ Think Coffee

3-5pm, 7/30/17

Day 3, Alice NLP generator

@python script author (original content): Rahul

@jupyter notebook converted tutorial author: Nick Giangreco

Notebook version of the Python script in the same directory. Building an RNN character-level language model based on Lewis Carroll's Alice in Wonderland text.

Importing modules


In [1]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, Input
from keras.layers.merge import concatenate
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.layers.normalization import BatchNormalization
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.decomposition import PCA
from keras.utils import plot_model
import numpy as np
import random
import sys
import csv
import os
import h5py
import time


Using TensorFlow backend.

Setting params for model setup and build.


In [10]:
embeddings_path = "./glove.840B.300d-char.txt" # http://nlp.stanford.edu/data/glove.840B.300d.zip
embedding_dim = 300
batch_size = 32
use_pca = False
lr = 0.001
lr_decay = 1e-4
maxlen = 300
consume_less = 2   # 0 for cpu, 2 for gpu

Loading and reading the Alice.txt corpus, collecting its unique characters (letters, punctuation, and whitespace) into a sorted list, and building two dictionaries that map between each character and its position in that list (one keyed by character, the other keyed by index).


In [3]:
text = open('./Alice.txt').read()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


corpus length: 148545
total chars: 73
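
A minimal sketch of what the two lookup dictionaries contain, using a toy string in place of the corpus:


In [ ]:
# Toy illustration: the same mapping built from a short string
toy = "hello"
toy_chars = sorted(list(set(toy)))                        # ['e', 'h', 'l', 'o']
toy_char_indices = dict((c, i) for i, c in enumerate(toy_chars))
toy_indices_char = dict((i, c) for i, c in enumerate(toy_chars))
print(toy_char_indices)   # {'e': 0, 'h': 1, 'l': 2, 'o': 3}
print(toy_indices_char)   # {0: 'e', 1: 'h', 2: 'l', 3: 'o'}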

Cutting the text into semi-redundant sequences: sliding a window of maxlen (300) characters over the corpus with a step size of 3, so each entry in the sentences list overlaps heavily with the previous one. For each window, next_chars stores the single character that immediately follows it, which is the target the model will learn to predict.


In [4]:
# cut the text in semi-redundant sequences of maxlen characters

step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))


nb sequences: 49415
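
A minimal sketch of the same slicing on a toy string, with a much smaller window (5 characters) and the same step of 3, to make the overlap visible:


In [ ]:
# Toy illustration of the windowing: window of 5 characters, step of 3
toy_text = "alice was beginning"
toy_maxlen, toy_step = 5, 3
toy_sentences, toy_next = [], []
for i in range(0, len(toy_text) - toy_maxlen, toy_step):
    toy_sentences.append(toy_text[i: i + toy_maxlen])
    toy_next.append(toy_text[i + toy_maxlen])
print(toy_sentences)  # ['alice', 'ce wa', 'was b', ' begi', 'eginn']
print(toy_next)       # [' ', 's', 'e', 'n', 'n']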

Making X an integer array of shape (number of sequences, maxlen); each row will hold the index of every character in one sequence, since the Embedding layer expects integer character indices rather than one-hot vectors.

Making y a boolean (all-False) array of shape (number of sequences, number of unique characters in the document).

Then, going through each sequence and each character in it, writing the character indices into the corresponding row of X, and setting a single True in y at the position of that sequence's next character (one-hot encoding the prediction target).


In [5]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen), dtype=np.int)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t] = char_indices[char]
    y[i, char_indices[next_chars[i]]] = 1


Vectorization...
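
A minimal sketch of what one row of X and y encodes, using a hypothetical four-character alphabet rather than the corpus:


In [ ]:
# Toy vectorization: sequence "hel" followed by the character "l",
# over the toy alphabet ['e', 'h', 'l', 'o']
toy_chars = ['e', 'h', 'l', 'o']
toy_char_indices = {c: i for i, c in enumerate(toy_chars)}

toy_X = np.zeros((1, 3), dtype=int)               # one sequence of length 3
toy_y = np.zeros((1, len(toy_chars)), dtype=bool)
for t, char in enumerate("hel"):
    toy_X[0, t] = toy_char_indices[char]
toy_y[0, toy_char_indices["l"]] = 1

print(toy_X)  # [[1 0 2]]  -> integer indices, fed to the Embedding layer
print(toy_y)  # [[False False  True False]]  -> one-hot target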

Defining helper functions.


In [6]:
# test code to sample on 10% for functional model testing

def random_subset(X, y, p=0.1):

    idx = np.random.randint(X.shape[0], size=int(X.shape[0] * p))
    X = X[idx, :]
    y = y[idx]
    return (X, y)


# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
def generate_embedding_matrix(embeddings_path):
    print('Processing pretrained character embeds...')
    embedding_vectors = {}
    with open(embeddings_path, 'r') as f:
        for line in f:
            line_split = line.strip().split(" ")
            vec = np.array(line_split[1:], dtype=float)
            char = line_split[0]
            embedding_vectors[char] = vec

    embedding_matrix = np.zeros((len(chars), 300))
    #embedding_matrix = np.random.uniform(-1, 1, (len(chars), 300))
    for char, i in char_indices.items():
        #print ("{}, {}".format(char, i))
        embedding_vector = embedding_vectors.get(char)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Use PCA from sklearn to reduce 300D -> 50D
    if use_pca:
        pca = PCA(n_components=embedding_dim)
        pca.fit(embedding_matrix)
        embedding_matrix_pca = np.array(pca.transform(embedding_matrix))
        embedding_matrix_result = embedding_matrix_pca
        print (embedding_matrix_pca)
        print (embedding_matrix_pca.shape)
    else:
        embedding_matrix_result = embedding_matrix
    return embedding_matrix_result

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-6) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
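
A quick sketch of how the temperature argument reshapes a toy probability vector before sampling; it assumes the sample function from the cell above and uses made-up probabilities:


In [ ]:
# Illustrative only: see how temperature changes what gets sampled.
toy_preds = np.array([0.6, 0.3, 0.08, 0.02])
for temp in [0.2, 1.0, 1.2]:
    draws = [sample(toy_preds, temp) for _ in range(1000)]
    counts = np.bincount(draws, minlength=len(toy_preds))
    print(temp, counts / 1000.0)
# Low temperature concentrates almost all draws on index 0; higher
# temperatures spread them across the less likely characters.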

Building the character embedding matrix and the RNN model. This is what differentiates this tutorial from tutorial 03.

  • Input layer

  • Embedding layer - initialized with the pretrained embedding matrix as weights

  • RNN layer - an LSTM with 256 units

  • Dense layers - 2 hidden layers, each followed by batch normalization and a ReLU activation

  • Activation (softmax) layers converting to output probabilities - a main output on top of the hidden layers and an auxiliary output taken directly from the LSTM

The full layer table is given below.


In [11]:
print('Build model...')
main_input = Input(shape=(maxlen,))
embedding_matrix = generate_embedding_matrix(embeddings_path)
embedding_layer = Embedding(
    len(chars), embedding_dim, input_length=maxlen,
    weights=[embedding_matrix])
# embedding_layer = Embedding(
#     len(chars), embedding_dim, input_length=maxlen)
embedded = embedding_layer(main_input)

# RNN Layer
rnn = LSTM(256, implementation=consume_less)(embedded)

aux_output = Dense(len(chars))(rnn)
aux_output = Activation('softmax', name='aux_out')(aux_output)

# Hidden Layers
hidden_1 = Dense(512, use_bias=False)(rnn)
hidden_1 = BatchNormalization()(hidden_1)
hidden_1 = Activation('relu')(hidden_1)

hidden_2 = Dense(256, use_bias=False)(hidden_1)
hidden_2 = BatchNormalization()(hidden_2)
hidden_2 = Activation('relu')(hidden_2)

main_output = Dense(len(chars))(hidden_2)
main_output = Activation('softmax', name='main_out')(main_output)

model = Model(inputs=main_input, outputs=[main_output, aux_output])

optimizer = Adam(lr=lr, decay=lr_decay)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer, loss_weights=[1., 0.2])
model.summary()

#plot_model(model, to_file='model.png', show_shapes=True)


if not os.path.exists('./output'):
    os.makedirs('./output')

f = open('./log.csv', 'w')
log_writer = csv.writer(f)
log_writer.writerow(['iteration', 'batch', 'batch_loss',
                     'epoch_loss', 'elapsed_time'])

checkpointer = ModelCheckpoint(
    "./output/model.hdf5", monitor='main_out_loss', save_best_only=True)


Build model...
Processing pretrained character embeds...
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_3 (InputLayer)             (None, 300)           0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 300, 300)      21900       input_3[0][0]                    
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 256)           570368      embedding_3[0][0]                
____________________________________________________________________________________________________
dense_10 (Dense)                 (None, 512)           131072      lstm_3[0][0]                     
____________________________________________________________________________________________________
batch_normalization_5 (BatchNorm (None, 512)           2048        dense_10[0][0]                   
____________________________________________________________________________________________________
activation_5 (Activation)        (None, 512)           0           batch_normalization_5[0][0]      
____________________________________________________________________________________________________
dense_11 (Dense)                 (None, 256)           131072      activation_5[0][0]               
____________________________________________________________________________________________________
batch_normalization_6 (BatchNorm (None, 256)           1024        dense_11[0][0]                   
____________________________________________________________________________________________________
activation_6 (Activation)        (None, 256)           0           batch_normalization_6[0][0]      
____________________________________________________________________________________________________
dense_12 (Dense)                 (None, 73)            18761       activation_6[0][0]               
____________________________________________________________________________________________________
dense_9 (Dense)                  (None, 73)            18761       lstm_3[0][0]                     
____________________________________________________________________________________________________
main_out (Activation)            (None, 73)            0           dense_12[0][0]                   
____________________________________________________________________________________________________
aux_out (Activation)             (None, 73)            0           dense_9[0][0]                    
====================================================================================================
Total params: 895,006
Trainable params: 893,470
Non-trainable params: 1,536
____________________________________________________________________________________________________
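
As a sanity check on the summary above, the parameter counts can be reproduced by hand (a quick sketch using the shapes shown: 73 characters, 300-dimensional embeddings, a 256-unit LSTM):


In [ ]:
# Reproduce the parameter counts from the model summary above.
n_chars, emb_dim, lstm_units = 73, 300, 256

embedding = n_chars * emb_dim                                   # 21,900
lstm = 4 * (emb_dim * lstm_units + lstm_units**2 + lstm_units)  # 570,368
dense_512 = 256 * 512                                           # no bias: 131,072
bn_512 = 4 * 512                                                # gamma, beta, moving mean/var: 2,048
dense_256 = 512 * 256                                           # no bias: 131,072
bn_256 = 4 * 256                                                # 1,024
main_out = 256 * n_chars + n_chars                              # 18,761
aux_out = 256 * n_chars + n_chars                               # 18,761

total = (embedding + lstm + dense_512 + bn_512 +
         dense_256 + bn_256 + main_out + aux_out)
print(total)            # 895006
print(2 * (512 + 256))  # 1536 non-trainable parameters (BatchNorm moving statistics)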

Defining a BatchLossLogger callback that records the main-output loss every 50 batches, together with the running epoch mean and elapsed time, and writes them to log.csv during training.


In [17]:
class BatchLossLogger(Callback):
    # Logs the main-output loss every 50 batches, along with the running
    # mean for the epoch and the elapsed wall-clock time. Relies on the
    # globals `iteration`, `start_time`, and `log_writer` defined in the
    # surrounding cells.

    def on_epoch_begin(self, epoch, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('main_out_loss'))
        if batch % 50 == 0:
            log_writer.writerow([iteration, batch,
                                 logs.get('main_out_loss'),
                                 np.mean(self.losses),
                                 round(time.time() - start_time, 2)])

Model training. Here one epoch per iteration is used instead of ten to keep the runtime manageable.


In [20]:
ep = 1

In [22]:
start_time = time.time()
for iteration in range(1, 20):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    logger = BatchLossLogger()
    # X_train, y_train = random_subset(X, y)
    # history = model.fit(X_train, [y_train, y_train], batch_size=batch_size,
    #                     epochs=1, callbacks=[logger, checkpointer])
    history = model.fit(X, [y, y], batch_size=batch_size,
                        epochs=ep, callbacks=[logger, checkpointer])
    loss = str(history.history['main_out_loss'][-1]).replace(".", "_")

    f2 = open('./output/iter-{:02}-{:.6}.txt'.format(iteration, loss), 'w')

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)
        f2.write('----- diversity:' + ' ' + str(diversity) + '\n')

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        f2.write('----- Generating with seed: "' + sentence + '"' + '\n---\n')
        sys.stdout.write(generated)

        for i in range(1200):
            x = np.zeros((1, maxlen), dtype=np.int)
            for t, char in enumerate(sentence):
                x[0, t] = char_indices[char]

            preds = model.predict(x, verbose=0)[0][0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        f2.write(generated + '\n')
        print()
    f2.close()

    # Write embeddings for current characters to file
    # The second layer has the embeddings.

    embedding_weights = model.layers[1].get_weights()[0]
    f3 = open('./output/char-embeddings.txt', 'w')
    for char in char_indices:
        if ord(char) < 128:
            embed_vector = embedding_weights[char_indices[char], :]
            f3.write(char + " " + " ".join(str(x)
                                           for x in embed_vector) + "\n")
    f3.close()

f.close()


--------------------------------------------------
Iteration 1
Epoch 1/1
49415/49415 [==============================] - 2062s - loss: 2.7572 - main_out_loss: 2.1911 - aux_out_loss: 2.8303  

----- diversity: 0.2
----- Generating with seed: "e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he s"
e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he she saice, and the she she saide the the saice the was she was she was the waid the wis the she was ince the she saide the as ince haid the saide the licked the the came the waid the the saide the she saide the she came the waide the was in the dooke the saide the waid the waid the saide the was she was all the it the the was waide the was ince the she was all caice the saide the the saide the was the came itsee inche the caice the was the came the saide the waid the saide the castell the said the saide the she she came the said the was ince the was the was inge the was of the saide the the saide the the came and the she was waide the she she case the she was ince the saide it the was ince the saide the saide the the waid the she saide the waid the she came the came the was inge the saide the she came the was ince of the waid the she came the the was waid the was ince the waide the waid ittell the the came the was ince all the she waid the came the was ince the saide the wise the she came the she was the saice the she she was the saide the was ince the waid the waid the was inge the saide the the said the the saide the she came the she saide the the said the saide the came the saide

----- diversity: 0.5
----- Generating with seed: "e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he s"
e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he she Rase waided, it heas whid the faice intte waide the
saided.  `You she she meell the all of heak

  `I was waike the all to becke the said the wise all ass veese caice The paicke the bease the licke in off uell itteed the shating one the beeare waise the camet it the METTE TEETE TOTTEE TOATETOETENTEES Lall gusise thee the the or was daide the dookeng one waiding the it haid the was of ink waise the waid the silk the came them wasing `I wall ittle said the inow the saide of in the what waide the waid the saide all the saice the wis witeld the fhicke the keailse, and is waid the call istell them bese she she thise it the puches waild thing weas is the Greme of bee of as the licked the was ill was as the wis inseee, `I wis was was istle the whied incesid.  `I wit heaking, and time ill ittell the the she sell outen all the it ink the wise aboce ance the said.'

  `You deaid the dooke and heam Alice und, and the the saide itsle the saided wis istlick in the heat wall of, and waice the waide the she paice inow the was Alice us waid the was the sicke saide the and, and she she was waid and the she saice.  `You waid--
                                                                      

----- diversity: 1.0
----- Generating with seed: "e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he s"
e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he saokige.

 `Bustle Labk-ino ing neiw-j:
  IN, rame onk Ratteir the veotesers, `it Cal came puchh, whe saiig lell et'me teir KIEN Ale Alice womed.'

  The, shevumely.'

   `You Mly she gustlige `Allege Alece heak
theen ther, `paway
grilltgry
uo.-

  `-
                      Thimee-saisey
is the in allockete istlee, as she doret ing, andwez's the
sOREE TITT TTiee. 
  `'nd'l didiokedy aicee:  `Yed bkeme.

  `Dusle
Ill Coce tive's DaAl 
                   the un!  Raide them abike thig, it Alice yougselng.

  The her.  Tnokese thes nime theahs oneking ittle of comee daidenou--"Ands awo 'sjeonged at ine istelf.'

  `You?   littlen?  Here them UTESETKiu--hat hat breves bep?' 
  `YUch daid.


  ` snouking hand quet, 
`Yey' laiging the salle soo?'

                  Th Boee nmel once dookeng,  I bis whi may theme conken's and, (is neppe ilseoted it allsngey'y
leawene,' rendied Ill efpy iddy urcave to , herdy's ROINTRE's Rref theme thaide them Bucend ifmieed Alice, `Oum bame, it the ind.  `Nuok daide thaily aid; `noug Alice on, Quet I'm--tre I Al waid the  apgilly Aliceing peio ' laidong of, was'

  She whatteger!' said theid---'

  `Yhe worde, pIT I
NElD allrucelfnlow the camn_ as ne! he is

----- diversity: 1.2
----- Generating with seed: "e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he s"
e same thing with you,' said the Hatter, and here the
conversation dropped, and the party sat silent for a minute,
while Alice thought over all she could remember about ravens and
writing-desks, which wasn't much.

  The Hatter was the first to break the silence.  `What day of
the month is it?' he siose buce HEYNTTET DOYT OUEV LORDHELWE CLU O heas you on osm,
yelf Maje
MAAS Tooke---Heny
agybut it! illle.

  I'mb
"aeLodsay. fuy gas it; yucking,  I le, Fuy (Bucch) ta
keast-uninga":  see thald' iot'bong iefiou AlSILTE MEULbiS (Yais,' said to laiseas, `indmenme--`YOTT SARLO TAINl-MO fubet: theoke, whe kise it
bs ligessong?'

  ``Howc RhALE ag--is
ixwa giwatioo?'

  I'vid.
  `Nelvce,  andyeh:  puncse's it)a wLAHTHGE
iapusingednuged thelilf vee in toe haid it'll--inomeor. (Aliee of lattwe) olkez'ss; buitskam--a
(wand All !' Asie fuel onckliok--`IN snawnle otplbaice notle bepetncoomedsH
teck Oas-Se sheb--WOOET
sMat naw in-zAmE quie Tuicc mem--co maice, saidiy--o
draotes?'

 `Bow the!
"no pavbieeds,' she as nasem-'"'


 Ticke Alice: Shy whir
cawiall bce jum, FO ATEATTEIRH TORT han'tpsiokeded, cim deatted JeRAUTATTE
TORE TOR OTADENE
       `YES -O EeEVTK joineet, `Sou MOSEB" urie-sowe. yOw ixteem.  Lheurver--aele ceall nece, and ther nele ter, and as.) `I MON AE CHTER LNEF VETTOPTEI DOLNKTTEEL INE AT
kEEAOTEESEYL ILA RNSENTEHTTENTET eAs
OOALUSS YWI RBE  Shik-bid; thee vacg up tmear.'

 Shaice briknuttundiot
Foquee.  `
Yhis.  Whem Tnougedd Thike.'

  `Yuy'm--

              bodye dhikl

--------------------------------------------------
Iteration 2
Epoch 1/1
  704/49415 [..............................] - ETA: 2096s - loss: 2.5736 - main_out_loss: 2.0791 - aux_out_loss: 2.4725
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-22-6634de9a3d9c> in <module>()
     10     #                     epochs=1, callbacks=[logger, checkpointer])
     11     history = model.fit(X, [y, y], batch_size=batch_size,
---> 12                         epochs=ep, callbacks=[logger, checkpointer])
     13     loss = str(history.history['main_out_loss'][-1]).replace(".", "_")
     14 

/anaconda/lib/python3.6/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
   1428                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
   1429                               callback_metrics=callback_metrics,
-> 1430                               initial_epoch=initial_epoch)
   1431 
   1432     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

/anaconda/lib/python3.6/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch)
   1077                 batch_logs['size'] = len(batch_ids)
   1078                 callbacks.on_batch_begin(batch_index, batch_logs)
-> 1079                 outs = f(ins_batch)
   1080                 if not isinstance(outs, list):
   1081                     outs = [outs]

/anaconda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2266         updated = session.run(self.outputs + [self.updates_op],
   2267                               feed_dict=feed_dict,
-> 2268                               **self.session_kwargs)
   2269         return updated[:len(self.outputs)]
   2270 

/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    787     try:
    788       result = self._run(None, fetches, feed_dict, options_ptr,
--> 789                          run_metadata_ptr)
    790       if run_metadata:
    791         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    995     if final_fetches or final_targets:
    996       results = self._do_run(handle, final_targets, final_fetches,
--> 997                              feed_dict_string, options, run_metadata)
    998     else:
    999       results = []

/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1130     if handle is None:
   1131       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1132                            target_list, options, run_metadata)
   1133     else:
   1134       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1137   def _do_call(self, fn, *args):
   1138     try:
-> 1139       return fn(*args)
   1140     except errors.OpError as e:
   1141       message = compat.as_text(e.message)

/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1119         return tf_session.TF_Run(session, options,
   1120                                  feed_dict, fetch_list, target_list,
-> 1121                                  status, run_metadata)
   1122 
   1123     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

Increasing the diversity (sampling temperature) flattens the model's predicted character distribution: low temperatures give repetitive but more word-like output, while higher temperatures give more varied but increasingly garbled text, as the samples above show.


In [ ]: