In [1]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM, TimeDistributed, Conv1D
from keras.layers.wrappers import Bidirectional
from keras.utils import vis_utils
In [2]:
# from seya.layers.ntm import NeuralTuringMachine as NTM
In [40]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM, TimeDistributed
from keras.layers import Conv1D, MaxPool1D
from keras.layers.wrappers import Bidirectional
class ConfigurableNetwork:
    defaultfile = 'default.cfg'

    def __init__(self, modelname):
        self._modelname = modelname
        # Set up the environment: load the config file if it is there; if not, create it from the default.
        # Load the weights.
        # Existential question: should this contain all of the handling for fitting and callbacks?
        # Yeah, I think that makes sense. One-stop shop for configs, weights, and logging.

    def setup(self):
        pass

    @property
    def modelname(self):
        return self._modelname
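# Hedged sketch of the config handling described above (the helper name and file layout
# are assumptions, not part of the original design): copy defaultfile to '<modelname>.cfg'
# when the per-model file is missing, then read it with the stdlib configparser.
import configparser
import os
import shutil

def load_or_create_config(modelname, defaultfile='default.cfg'):
    cfgfile = modelname + '.cfg'
    if not os.path.exists(cfgfile):
        shutil.copyfile(defaultfile, cfgfile)  # create the per-model config from the default
    cfg = configparser.ConfigParser()
    cfg.read(cfgfile)
    return cfg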
class QueryableNet:
    def __init__(self, vocab_size=22, story_maxlen=68, query_maxlen=4):
        self.model = None
        self.vocab_size = vocab_size
        self.query_maxlen = query_maxlen
        self.story_maxlen = story_maxlen

    def query(self, storyvec, queryvec):
        storyvec = storyvec.reshape((-1, self.story_maxlen))
        queryvec = queryvec.reshape((-1, self.query_maxlen))
        ans = self.model.predict([storyvec, queryvec])
        return ans
class DeepMemNet(QueryableNet):
    """
    DeepMemNet for the Facebook bAbI context task.

    Model notes (accuracy/val_acc):

    Single context task:
        Regular LSTM:
            Run 1
                47/50% @ 7 epochs
                72/70% @ 49 epochs
                86/80% @ 61 epochs
                95/90% @ 88 epochs
            Run 2
        Bidirectional LSTM:
            Run 1
                50/50% @ 6
                81/80% @ 34
                87/86% @ 48 (peak val_acc)
                90/86% @ 60 epochs - minor overfitting
                (I think the parameters were not configured right; these might be single-LSTM runs. Need to rerun everything >_<)
            Run 2
                73/71% @ 44
                83/80% @ 53
                94/90% @ 75
            Run 3
                71/70% @ 41
                83/80% @ 50
                94/90% @ 76
        Bidirectional + extra forward LSTM:
            55/??% @ 80 (stalls)
        TDDense + Bidirectional:
            Run 1:
                76/73% @ 37
                80/80% @ 41
                91/90% @ 54 - new record!!

    Double context task:
        Regular:
            50/??% @ 35 epochs
            67/??% @ 80 epochs
            70/??% @ 100
            80/??% @ 192
            84.7/??% @ 260
        Bidirectional:
            50/??% @ 26 epochs
            70/??% @ 48 epochs - improvement!
            80/??% @ 68 epochs - super improvement!
            90/??% @ 110 epochs - smokin'!
            95/??% @ 148 epochs - starting to level off
            97/??% @ 200 epochs - I think it's starting to overfit
    """
    # todo: add performance logging
    def __init__(self, vocab_size=22, story_maxlen=68, query_maxlen=4, n_lstm=32, bidirect=True, tdd=True,
                 matchconv=False, permute=False):
        """
        DeepMemNet

        Param note: changing parameters will require a new model file (duh) - this isn't automatic yet.

        :param vocab_size: number of distinct tokens in the vocabulary
        :param story_maxlen: maximum story length, in tokens
        :param query_maxlen: maximum question length, in tokens
        :param n_lstm: number of LSTM units per recurrent layer
        :param bidirect: add the extra bidirectional LSTM pass before the readout LSTM
        :param tdd: apply a TimeDistributed Dense layer before the RNN
        :param matchconv: apply a Conv1D over the match matrix
        :param permute: permute the answer tensor before the RNN
        """
        # todo: config file for model hyperparams with logging link
        # self.vocab_size = vocab_size
        # self.story_maxlen = story_maxlen
        # self.query_maxlen = query_maxlen
        super().__init__(vocab_size=vocab_size, story_maxlen=story_maxlen, query_maxlen=query_maxlen)
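        # Architecture note: this follows the end-to-end memory-network pattern from the
        # Keras babi_memnn example - embed the story twice (memory + response), match the
        # memory embedding against the question, then read the combined tensor out with an RNN.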
        # placeholders
        input_sequence = Input((story_maxlen,), name='InputSeq')
        question = Input((query_maxlen,), name='Question')

        # Encoders - initial encoders are pretty much just projecting the input into a useful space,
        # not much need to optimize here really
        input_encoder_m = Sequential(name='InputEncoderM')
        input_encoder_m.add(Embedding(input_dim=vocab_size,
                                      output_dim=64, name='InEncM_Embed'))
        input_encoder_m.add(Dropout(0.3))
        # output: (samples, story_maxlen, embedding_dim)

        # embed the input into a sequence of vectors of size query_maxlen
        input_encoder_c = Sequential(name='InputEncoderC')
        input_encoder_c.add(Embedding(input_dim=vocab_size,
                                      output_dim=query_maxlen, name='InEncC_Embed'))
        input_encoder_c.add(Dropout(0.3))
        # output: (samples, story_maxlen, query_maxlen)

        # embed the question into a sequence of vectors
        question_encoder = Sequential(name='QuestionEncoder')
        question_encoder.add(Embedding(input_dim=vocab_size,
                                       output_dim=64,
                                       input_length=query_maxlen, name='QuesEnc_Embed'))
        question_encoder.add(Dropout(0.3))
        # output: (samples, query_maxlen, embedding_dim)

        # encode input sequence and questions (which are indices)
        # to sequences of dense vectors
        input_encoded_m = input_encoder_m(input_sequence)
        input_encoded_c = input_encoder_c(input_sequence)
        question_encoded = question_encoder(question)

        # compute a 'match' between the first input vector sequence
        # and the question vector sequence
        # shape: `(samples, story_maxlen, query_maxlen)`
        match = dot([input_encoded_m, question_encoded], axes=(2, 2), name='Match')
        match = Activation('softmax')(match)
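        # Shape check with the defaults: input_encoded_m is (None, 68, 64) and question_encoded
        # is (None, 4, 64), so the dot over axes (2, 2) gives match of shape
        # (None, story_maxlen=68, query_maxlen=4); the softmax normalizes over the last axis,
        # i.e. over the query positions for each story position.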
        if matchconv:
            match = Conv1D(query_maxlen, 4, padding='same')(match)

        # add the match matrix to the second input vector sequence
        response = add([match, input_encoded_c], name='ResponseAdd')  # (samples, story_maxlen, query_maxlen)
        response = Permute((2, 1), name='ResponsePermute')(response)  # (samples, query_maxlen, story_maxlen)

        # concatenate the match matrix with the question vector sequence
        answer = concatenate([response, question_encoded], name='AnswerConcat')
        # Trying to feed in the long axis as the timestep makes the GPU very angry;
        # it appears to start thrashing memory.
        if permute:
            answer = Permute((2, 1), name='AnswerPermute')(answer)  # (samples, story_maxlen + embed_dim, query_maxlen)
        # Let's try with a time-distributed dense before the RNN
        if tdd:
            answer = TimeDistributed(Dense(n_lstm, name='Answer_TDD'))(answer)

        # Bidirectional LSTM for better context recognition, plus an additional one for flavor
        lstm_rev = Bidirectional(LSTM(n_lstm, return_sequences=True, name='Ans_LSTM_reverse'))
        lstm_for = Bidirectional(LSTM(n_lstm, return_sequences=False, name='Ans_LSTM_forward'))
        if bidirect:
            answer = lstm_rev(answer)  # "reverse" pass goes first
        answer = lstm_for(answer)
        # answer = LSTM(n_lstm, name='Ans_LSTM_3')(answer)  # An extra LSTM completely runs out of steam at 55% acc! Bidirectional seems to help.

        # one regularization layer -- more would probably be needed
        answer = Dropout(0.3, name='Answer_Drop')(answer)
        answer = Dense(vocab_size, name='Answer_Dense')(answer)  # (samples, vocab_size)
        # we output a probability distribution over the vocabulary
        answer = Activation('softmax')(answer)

        # build the final model
        model = Model([input_sequence, question], answer)
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                      metrics=['accuracy'])
        self.model = model

    # def query(self, storyvec, queryvec):
    #     storyvec = storyvec.reshape((-1, self.story_maxlen))
    #     queryvec = queryvec.reshape((-1, self.query_maxlen))
    #     ans = self.model.predict([storyvec, queryvec])
    #     return ans
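# Hedged training sketch (the data variable names are assumptions, not from this notebook):
# one way runs like those noted in the DeepMemNet docstring could be driven, with a CSVLogger
# to cover the "add performance logging" todo.
#
# from keras.callbacks import CSVLogger, ModelCheckpoint
# dmn = DeepMemNet(bidirect=True, tdd=True)
# dmn.model.fit([stories_train, queries_train], answers_train,
#               batch_size=32, epochs=100,
#               validation_data=([stories_test, queries_test], answers_test),
#               callbacks=[CSVLogger('deepmemnet_run.csv'),
#                          ModelCheckpoint('deepmemnet_weights.h5', save_best_only=True)])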
class ConvoLSTM(QueryableNet):
    def __init__(self, vocab_size=22, story_maxlen=68, query_maxlen=4, n_lstm=32, bidirect=True, tdd=True,
                 matchconv=False, permute=False):
        super().__init__(vocab_size=vocab_size, story_maxlen=story_maxlen, query_maxlen=query_maxlen)
        dropout_rate = 0.2
        embed_vector_len = 64
        n_filter = 120
        filter_length = 5

        input_sequence = Input((story_maxlen,), name='InputSeq')
        question = Input((query_maxlen,), name='Question')
        # input_sequence = Activation('linear')(input_sequence)
        # question = Activation('linear')(question)
        # model = Model()

        # concatenate the story and question index sequences, then embed them together
        model = concatenate([input_sequence, question], name='AnswerConcat')
        model = Embedding(vocab_size, embed_vector_len)(model)
        model = Conv1D(filters=n_filter, kernel_size=filter_length, padding='valid', activation='relu')(model)
        model = MaxPool1D(pool_size=2)(model)
        model = Dropout(dropout_rate)(model)
        # model = LSTM(n_lstm)(model)

        # Let's try with a time-distributed dense before the RNN
        if tdd:
            model = TimeDistributed(Dense(n_lstm, name='Answer_TDD'))(model)
        # Bidirectional LSTM for better context recognition, plus an additional one for flavor
        lstm_rev = Bidirectional(LSTM(n_lstm, return_sequences=True, name='Ans_LSTM_reverse'))
        lstm_for = Bidirectional(LSTM(n_lstm, return_sequences=False, name='Ans_LSTM_forward'))
        if bidirect:
            model = lstm_rev(model)  # "reverse" pass goes first
        model = lstm_for(model)
        # model = LSTM(n_lstm)(model)

        model = Dropout(dropout_rate)(model)
        output = Dense(vocab_size, activation='sigmoid')(model)

        model = Model([input_sequence, question], output)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model = model
In [41]:
# dmn = DeepMemNet(bidirect=False, tdd=False)
dmn = ConvoLSTM(bidirect=False, tdd=False)
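# Sanity check on shapes with random word indices (the weights are untrained, so the output
# distribution is meaningless): query() reshapes its inputs to (1, story_maxlen) and
# (1, query_maxlen) and should return a (1, vocab_size) array.
import numpy as np
story_idx = np.random.randint(1, dmn.vocab_size, size=dmn.story_maxlen)
query_idx = np.random.randint(1, dmn.vocab_size, size=dmn.query_maxlen)
print(dmn.query(story_idx, query_idx).shape)  # expect (1, 22) with the defaults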
In [42]:
vis_utils.plot_model(dmn.model, 'model.png', show_shapes=True)
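# note: plot_model needs the pydot package and a graphviz install to render the diagram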
In [ ]: