In [1]:
import numpy
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import GatedRecurrent
from blocks.bricks.sequence_generators import (SequenceGenerator, Readout, SoftmaxEmitter, LookupFeedback)
from blocks.graph import ComputationGraph
import blocks.algorithms
from blocks.algorithms import GradientDescent
from blocks.initialization import Orthogonal, IsotropicGaussian, Constant
from blocks.model import Model
from blocks.monitoring import aggregation
from blocks.extensions import FinishAfter, Printing
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.monitoring import TrainingDataMonitoring
from blocks.main_loop import MainLoop
import blocks.serialization
from blocks.select import Selector
import logging
import pprint
logger = logging.getLogger(__name__)
theano.config.floatX = 'float32'
print(theano.config.device)
In [2]:
# Dictionaries
import string
all_chars = [a for a in string.printable] + ['<UNK>']
code2char = dict(enumerate(all_chars))
char2code = {v: k for k, v in code2char.items()}
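A quick round trip through the two dictionaries (an illustrative sketch; the .get fallback to the '<UNK>' code is an assumption here, since the loader below indexes the dictionary directly):
In [ ]:
# Sketch: encode a string to codes and decode it back, mapping any
# character outside the dictionary to '<UNK>' (hypothetical usage).
unk = char2code['<UNK>']
codes = [char2code.get(c, unk) for c in "To be, or not to be"]
print(''.join(code2char[c] for c in codes))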
In [3]:
if False:
    data_file = 'Shakespeare.poetry.txt'
    dim = 32
    hidden_state_dim = 32
    feedback_dim = 32
else:
    data_file = 'Shakespeare.plays.txt'
    dim = 64
    hidden_state_dim = 64
    feedback_dim = 64
seq_len = 256 # The input file is learned in chunks of text this large
# Network parameters
num_states = len(char2code)  # Size of the one-hot input and softmax output layers
batch_size = 100  # Mini-batch size: helps optimize GPU workload
num_epochs = 100  # Number of passes through the corpus during training
data_path = '../data/' + data_file
save_path = '../models/' + data_file + '.model'
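One thing worth checking up front (an illustrative sanity check, not part of the original pipeline): the data loader below indexes char2code directly, so any character outside string.printable would raise a KeyError rather than being mapped to '<UNK>':
In [ ]:
# Sketch: verify the corpus only contains characters the dictionary knows.
with open(data_path, 'r') as f:
    unmapped = set(f.read()) - set(char2code)
print("Characters outside the dictionary: %r" % (unmapped or "none",))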
In [4]:
#from fuel.datasets import Dataset
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.datasets import Dataset
#from fuel.datasets import TextFile
#dataset = TextFile([data_file], bos_token=None, eos_token=None, level="character", dictionary=char2code)
#data_stream = DataStream(dataset, iteration_scheme=ConstantScheme(batch_size))
class CharacterTextFile(Dataset):
    """Reads a text file sequentially in fixed-size chunks of characters."""
    provides_sources = ("data",)

    def __init__(self, fname, chunk_len, dictionary, **kwargs):
        self.fname = fname
        self.chunk_len = chunk_len
        self.dictionary = dictionary
        super(CharacterTextFile, self).__init__(**kwargs)

    def open(self):
        return open(self.fname, 'r')

    def get_data(self, state, request):
        # Returns one (chunk_len, request) matrix of character codes,
        # with one chunk of text per column
        assert isinstance(request, int)
        x = numpy.zeros((self.chunk_len, request), dtype='int64')
        for i in range(request):
            txt = state.read(self.chunk_len)
            if len(txt) < self.chunk_len:
                raise StopIteration
            x[:, i] = [self.dictionary[c] for c in txt]
        return (x,)

    def close(self, state):
        state.close()
dataset = CharacterTextFile(data_path, chunk_len=seq_len, dictionary=char2code)
data_stream = DataStream(dataset, iteration_scheme=ConstantScheme(batch_size))

# Smoke test: pull a batch of 10 chunks and decode the first column back to text
a = data_stream.get_data(10)
''.join(code2char[v] for v in a[0][:, 0])
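Each request to the stream yields one int64 array of shape (seq_len, batch_size). A sketch of pulling batches through Fuel's standard epoch iterator instead of calling get_data directly (assuming the usual DataStream API):
In [ ]:
# Sketch: the epoch iterator drives get_data with the ConstantScheme's
# batch size, so each batch should be (seq_len, batch_size) = (256, 100).
it = data_stream.get_epoch_iterator()
batch, = next(it)
print(batch.shape)
print(''.join(code2char[v] for v in batch[:, 0])[:80])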
In [5]:
transition = GatedRecurrent(name="transition", dim=hidden_state_dim, activation=Tanh())
generator = SequenceGenerator(
    Readout(readout_dim=num_states, source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(num_states, feedback_dim, name='feedback'),
            name="readout"),
    transition,
    weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
    name="generator")
generator.push_initialization_config()
transition.weights_init = Orthogonal()  # recurrent weights get an orthogonal initialization
generator.initialize()
print(generator.readout.emitter.readout_dim)
That's the underlying network defined. Now we need to build the infrastructure that iteratively improves it:
In [6]:
# Log the shape of every model parameter, to give an idea of what's going on.
logger.info("Parameters:\n" + pprint.pformat(
    [(key, value.get_value().shape) for key, value in Selector(generator).get_params().items()],
    width=120))
#logger.info("Markov chain entropy: {}".format(MarkovChainDataset.entropy))
#logger.info("Expected min error: {}".format( -MarkovChainDataset.entropy * seq_len))
In [7]:
# Build the cost computation graph.
x = tensor.lmatrix('data')
cost = aggregation.mean(generator.cost_matrix(x).sum(), x.shape[1])
cost.name = "sequence_log_likelihood"
model = Model(cost)
algorithm = GradientDescent(
    cost=cost, params=list(Selector(generator).get_params().values()),
    step_rule=blocks.algorithms.CompositeRule([
        blocks.algorithms.StepClipping(10.0),
        blocks.algorithms.Scale(0.01)]))
# tried: blocks.algorithms.Scale(0.001), blocks.algorithms.RMSProp(), blocks.algorithms.AdaGrad()
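The comment above mentions trying other step rules; here is a sketch of swapping RMSProp in while keeping gradient clipping (the constructor arguments are assumptions, so check them against the installed Blocks version):
In [ ]:
# Illustrative alternative, not the configuration used for training:
algorithm_rmsprop = GradientDescent(
    cost=cost, params=list(Selector(generator).get_params().values()),
    step_rule=blocks.algorithms.CompositeRule([
        blocks.algorithms.StepClipping(10.0),
        blocks.algorithms.RMSProp(learning_rate=0.002, decay_rate=0.95)]))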
In [8]:
# from IPython.display import SVG
# SVG(theano.printing.pydotprint(cost, return_image=True, format='svg'))
#from IPython.display import Image
#Image(theano.printing.pydotprint(cost, return_image=True, format='png'))
In [9]:
main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=data_stream,
    model=model,
    extensions=[
        FinishAfter(after_n_epochs=num_epochs),
        TrainingDataMonitoring([cost], prefix="this_step", after_batch=True),
        TrainingDataMonitoring([cost], prefix="average", every_n_batches=100),
        Checkpoint(save_path, every_n_batches=1000),
        Printing(every_n_batches=500),
    ])
In [12]:
main_loop.run()
In [ ]:
## Continuing saved models (the new serialization method is not cPickle):
# https://groups.google.com/forum/#!topic/blocks-users/jns-KKWTtko
# http://blocks.readthedocs.org/en/latest/serialization.html?highlight=load
## To inspect the contents of a saved/Checkpoint-ed file:
# unzip -t models/Shakespeare.poetry.txt.model
#from six.moves import cPickle
#main_loop = cPickle.load(open(save_path, "rb"))
#blocks.serialization.load(save_path)
#def author(input):
#    pass
# Read the trained parameters (including the generator's) back in from disk
if False:
    model.set_param_values(blocks.serialization.load_parameter_values(save_path))
#generator = main_loop.model
In [16]:
output_length = 1000  # in characters
sampler = ComputationGraph(
    generator.generate(n_steps=output_length, batch_size=1, iterate=True))
sample = sampler.get_theano_function()
states, outputs, costs = [data[:, 0] for data in sample()]
numpy.set_printoptions(precision=3, suppress=True)
print("Generation cost:\n{}".format(costs.sum()))
#print(numpy.shape(states))
#print(numpy.shape(outputs))
#print(outputs[:])
print(''.join(code2char[c] for c in outputs))
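The commented-out author() stub in the earlier cell was never filled in; here is a sketch of what it might look like as a self-contained helper (hypothetical, and note it recompiles the Theano function on every call, which is slow):
In [ ]:
# Hypothetical helper: sample n_chars characters from the trained
# generator and return them as a string.
def author(n_chars=1000):
    cg = ComputationGraph(generator.generate(
        n_steps=n_chars, batch_size=1, iterate=True))
    states, outputs, costs = [data[:, 0] for data in cg.get_theano_function()()]
    return ''.join(code2char[c] for c in outputs)

print(author(200))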
In [ ]:
## Continue training from a checkpoint (note: in some Blocks versions this helper lives in blocks.scripts rather than blocks.serialization):
#from blocks.serialization import continue_training
#blocks.serialization.continue_training(save_path)