In [90]:
import numpy as np
import pandas as pd
import zipfile
from theano import tensor
import fuel
import h5py
from fuel.datasets.hdf5 import H5PYDataset
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.bricks import Linear, Softmax
from blocks.bricks.lookup import LookupTable
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.initialization import IsotropicGaussian, Constant
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.main_loop import MainLoop
from blocks.algorithms import GradientDescent, AdaGrad
from blocks.extensions import FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
from blocks_extras.extensions.plot import Plot
%matplotlib inline
In [60]:
from blocks.extensions import SimpleExtension
class SaveWeights(SimpleExtension):
    """Dump each tracked parameter to a .npy file after every epoch."""
    def __init__(self, layers, prefixes, **kwargs):
        kwargs.setdefault("after_epoch", True)
        super(SaveWeights, self).__init__(**kwargs)
        self.step = 1
        self.layers = layers
        self.prefixes = prefixes

    def do(self, callback_name, *args):
        # one file per tracked parameter, numbered by epoch
        for i in xrange(len(self.layers)):
            filename = "%s_%d.npy" % (self.prefixes[i], self.step)
            np.save(filename, self.layers[i].get_value())
        self.step += 1
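Blocks fires the do callback itself once the extension is attached to a main loop, but the extension can be sanity-checked in isolation by invoking do by hand on a throw-away shared variable. A minimal sketch; the ./w_test prefix is a made-up name:
In [ ]:
import theano
w = theano.shared(np.zeros((2, 2), dtype=np.float32), name="w")
saver = SaveWeights(layers=[w], prefixes=["./w_test"])
saver.do("after_epoch")        # writes ./w_test_1.npy and bumps step to 2
print np.load("w_test_1.npy")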
In [19]:
filename = "./data/text8.zip"
def read_data(filename):
    # read the first member of the archive and tokenize it on whitespace;
    # the with-block closes the archive (the old trailing close() was unreachable)
    with zipfile.ZipFile(filename) as f:
        for name in f.namelist():
            return f.read(name).split()
#words = read_data(filename)
#print 'Data size', len(words)
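read_data pulls the first member out of the zip archive and splits it on whitespace. A self-contained check with a throw-away archive (the path ./data/tiny.zip is hypothetical) would look like this:
In [ ]:
with zipfile.ZipFile("./data/tiny.zip", "w") as zf:
    zf.writestr("corpus.txt", "the quick brown fox")
print read_data("./data/tiny.zip")   # ['the', 'quick', 'brown', 'fox']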
In [50]:
from fuel.datasets.base import Dataset
class WordWindow(Dataset):
    """Fuel dataset of (left, right) context word ids and the centre word id."""
    def __init__(self, **kwargs):
        self.provides_sources = ('features', 'targets')
        # Fuel expects this attribute to exist, even when it is unused
        self.axis_labels = None
        with self.open() as fh:
            self.corpus = fh.read().split()
        #print self.corpus
        self.vocabulary_size = len(set(self.corpus))
        # pd.factorize returns (codes, uniques): integer ids plus the id -> word mapping
        self.vocabulary = pd.factorize(self.corpus)
        codes = self.vocabulary[0]
        # one instance per interior word: features = (left, right), target = centre;
        # build the arrays once here instead of on every get_data call
        data = [((codes[i], codes[i + 2]), codes[i + 1])
                for i in xrange(len(self.corpus) - 2)]
        x, y = zip(*data)
        self.features = np.array(x, dtype=np.int32)
        self.targets = np.array(y, dtype=np.int32)
        self.num_instances = len(data)
        super(WordWindow, self).__init__(**kwargs)

    def get_data(self, state=None, request=None):
        if request is None:
            return self.features, self.targets
        # honour the example indices handed over by the iteration scheme
        return self.features[request], self.targets[request]

    def open(self):
        return open('./data/small')

    def close(self, fh):
        fh.close()
dataset = WordWindow()
print dataset.vocabulary_size
print dataset.vocabulary
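pd.factorize returns a pair (codes, uniques): codes[i] is the integer id of the i-th token, and uniques maps ids back to words. On a toy corpus, the windowing used above reads as follows (a pure illustration, no files involved):
In [ ]:
codes, uniques = pd.factorize(["the", "cat", "sat", "the", "cat"])
print codes     # [0 1 2 0 1]
print uniques   # ['the' 'cat' 'sat']
print [((codes[i], codes[i + 2]), codes[i + 1])
       for i in xrange(len(codes) - 2)]
# [((0, 2), 1), ((1, 0), 2), ((2, 1), 0)]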
In [51]:
dataset.get_data()
Out[51]:
In [75]:
VOCAB_DIM = dataset.vocabulary_size
EMBEDDING_DIM = min(5,VOCAB_DIM)
CONTEXT = 1
def makeGraph():
    Xs = tensor.lmatrix("features")
    y = tensor.ivector('targets')
    # input embeddings: one EMBEDDING_DIM vector per vocabulary entry
    w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM)
    # output projection back onto the vocabulary
    w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM)
    # CBOW: average the context embeddings, then predict the centre word
    hidden = tensor.mean(w1.apply(Xs), axis=1)
    y_hat = Softmax().apply(w2.apply(hidden))
    w1.weights_init = w2.weights_init = IsotropicGaussian(0.01)
    w1.biases_init = w2.biases_init = Constant(0)
    w1.initialize()
    w2.initialize()
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    # L2 penalty on both weight matrices
    cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
    cost.name = "loss"
    return cg, (W1, W2), cost
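Before wiring the graph into a main loop it is worth compiling the cost once and feeding it a hand-made batch, just to confirm the shapes line up. A sanity-check sketch, not part of training; the batch of four all-zero windows is arbitrary:
In [ ]:
import theano
cg_chk, _, cost_chk = makeGraph()
inputs = dict((v.name, v) for v in cg_chk.inputs)
f = theano.function([inputs["features"], inputs["targets"]], cost_chk)
Xs_demo = np.zeros((4, 2 * CONTEXT), dtype=np.int64)   # four (left, right) windows
y_demo = np.zeros(4, dtype=np.int32)                   # four centre-word ids
print f(Xs_demo, y_demo)                               # a single scalar loss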
In [92]:
cg, (W1, W2), cost = makeGraph()
main = MainLoop(
    data_stream=DataStream(
        dataset,
        iteration_scheme=SequentialScheme(dataset.num_instances, batch_size=50)),
    algorithm=GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=AdaGrad()),
    extensions=[
        ProgressBar(),
        FinishAfter(after_n_epochs=10),
        #Printing(),
        TrainingDataMonitoring(variables=[cost], after_batch=True),
        SaveWeights(layers=[W1, W2], prefixes=["./w1", "./w2"]),
    ])
main.run()
In [93]:
#np.load("w1_1.npy")
In [94]:
#np.load("w1_2.npy")
In [95]:
word_vectors = (W1.get_value() + W2.get_value().T) / 2
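With the input and output embeddings averaged into a single matrix, nearest neighbours under cosine similarity give a quick qualitative check. A sketch; the query word "the" is only a placeholder for any token that actually occurs in ./data/small:
In [ ]:
def nearest(word, k=5):
    idx = list(dataset.vocabulary[1]).index(word)
    # normalize rows so a dot product equals cosine similarity
    normed = word_vectors / np.linalg.norm(word_vectors, axis=1, keepdims=True)
    sims = normed.dot(normed[idx])
    best = np.argsort(-sims)[1:k + 1]   # skip the word itself
    return [(dataset.vocabulary[1][j], sims[j]) for j in best]
print nearest("the")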
In [96]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
low_dim_embs = pca.fit_transform(word_vectors)
labels = dataset.vocabulary[1]
plt.figure(figsize=(18, 18))  # in inches
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
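Since PCA down to two components necessarily discards structure, it is worth checking how much of the embedding variance the plot actually retains:
In [ ]:
print pca.explained_variance_ratio_        # variance captured by each plotted axis
print pca.explained_variance_ratio_.sum()  # total variance kept by the 2-D view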