Importing required packages

In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

Using TensorFlow backend.

In [2]:
# Scratch sanity check that numpy is importable and working (expected: 2.0).
np.array([1, 2, 3]).mean()


Instantiate Embeddings

In [3]:
# Build the project-local Embeddings helper. The meaning of the four positional
# arguments is not visible in this notebook — presumably word2vec
# hyper-parameters, the first being the vector size (the embedding layer in the
# recorded model summary is 100 wide) — TODO confirm against embeddings.Embeddings.
embeddings = Embeddings(100, 4, 1, 4)

Loading the embeddings from the cache

getting data from preprocessing

In [4]:
# Pull the preprocessed artefacts out of the Embeddings helper.
# Embedding matrix; passed to the Keras Embedding layer as its initial weights.
word2vec_weights = embeddings.get_weights()
# Lookup tables between words and their integer indices.
word2index, index2word = embeddings.get_vocabulary()
# Word2vec model object; queried later via similar_by_vector / similarity.
word2vec_model = embeddings.get_model()
# Iterated below as one sequence of word indices per sentence.
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

generating training data

In [5]:
# Vocabulary size, taken from the word -> index lookup table.
vocab_size = len(word2index)
# Number of consecutive word indices that make up one input sample.
window_size = 5


Defining model

In [6]:
# Directory that holds the weight files written for this experiment.
model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
# Create the directory on first run so that writing weights into it later
# cannot fail on a missing path.
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)

In [7]:
seq_in = []
seq_out = []

# generating dataset
# Slide a window of `window_size` word indices over every sentence: each window
# is one input sample and the word right after it is the prediction target.
# Samples are kept grouped per sentence; later cells flatten this nesting.
for sentence in tokenized_indexed_sentences:
    sentence_seq_in = []
    sentence_seq_out = []
    # NOTE(review): because of the extra "- 1" the last word of a sentence is
    # never used as a target. Kept unchanged so the sample count stays the
    # same — confirm whether this is intentional.
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        sentence_seq_in.append(x)
        # The target is stored as an embedding vector rather than a word
        # index: the network ends in a Dense layer as wide as the embedding,
        # is trained with an MSE loss, and targets are later passed to
        # similar_by_vector. Assumes row `y` of word2vec_weights is the vector
        # of word index `y` — TODO confirm against Embeddings.
        sentence_seq_out.append(word2vec_weights[y])
    seq_in.append(sentence_seq_in)
    seq_out.append(sentence_seq_out)

# converting seq_in and seq_out into numpy array
# NOTE(review): the per-sentence lists generally differ in length, so this
# relies on numpy building an object array from ragged input; recent numpy
# versions reject that unless dtype=object is passed.
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
# Counts the per-sentence groups, not the individual windows.
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)

Number of samples :  18473

In [8]:
# Length of every per-sentence entry of seq_in.
subsamples = np.array(list(map(len, seq_in)))


In [9]:
# Flatten the per-sentence nesting so that each row is one individual sample.
subsamples_in = np.array([
    window
    for sentence_windows in seq_in
    for window in sentence_windows
])
subsamples_out = np.array([
    target
    for sentence_targets in seq_out
    for target in sentence_targets
])

Train Model

In [10]:
# Scratch inspection: the very first input sample shown as a column.
np.expand_dims(seq_in[0][0], 1)

       [  8],
       [ 25]])

In [11]:
# Number of batches when roughly 256 samples go into each one (floor division
# gives the same integer as int(n / 256) for a non-negative sample count).
total_batches = subsamples_in.shape[0] // 256

In [12]:
# Batch `i` is the strided slice subsamples_in[i::total_batches] — the same
# slice the training loop takes. Those slices are not all equally long, while
# the model is built with a fixed batch size, so every batch is later cut down
# to the length of the shortest one.
batch_len = []
for i in range(total_batches):
    batch_len.append(len(subsamples_in[i::total_batches]))
min_batch_len = min(batch_len)

In [18]:
# Changes to the model to be done here
# Embedding layer initialised from the word2vec weights, followed by two stacked
# stateful LSTMs and a Dense head as wide as one embedding vector.
# NOTE(review): the recorded model summary lists a Dropout layer after each
# LSTM, which this cell does not add — confirm which variant is intended.
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0],
                    output_dim=word2vec_weights.shape[1],
                    weights=[word2vec_weights],
                    # Stateful layers need a fixed batch size. The sequence
                    # length is the window size used to build the samples, so
                    # it is taken from window_size instead of repeating 5.
                    batch_input_shape=(min_batch_len, window_size)))
model.add(LSTM(512, return_sequences=True, stateful=True))
model.add(LSTM(512, stateful=True))
# NOTE(review): sigmoid limits every output component to (0, 1); if the target
# embedding vectors contain negative components they can never be matched —
# confirm the value range of word2vec_weights.
model.add(Dense(word2vec_weights.shape[1], activation='sigmoid'))
# NOTE(review): the 'accuracy' metric is reported next to an MSE loss on
# vector-valued targets; interpret it with care.
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])

Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (256, 5, 100)             13218400  
lstm_5 (LSTM)                (256, 5, 512)             1255424   
dropout_5 (Dropout)          (256, 5, 512)             0         
lstm_6 (LSTM)                (256, 512)                2099200   
dropout_6 (Dropout)          (256, 512)                0         
dense_3 (Dense)              (256, 100)                51300     
Total params: 16,624,324
Trainable params: 16,624,324
Non-trainable params: 0

In [33]:
# Manual training loop: batches are strided slices of the flattened samples,
# each cut to min_batch_len to match the model's fixed batch size.
n_epochs = 15
for epoch in range(n_epochs):
    print("Epoch {0}/{1}".format(epoch+1, n_epochs))
    # Row j of batch i+1 is the sample directly after row j of batch i, so the
    # LSTM state is carried from batch to batch inside an epoch. Batch 0 does
    # not continue the last batch of the previous epoch, hence the reset here.
    model.reset_states()
    mean_tr_accuracy = []
    mean_tr_loss = []
    for i in range(total_batches):
        # print("Done with {0}/{1} batches".format(i, total_batches))
        # With metrics configured, train_on_batch returns the scalars in the
        # order of model.metrics_names: the loss first, then the metric.
        train_loss, train_accuracy = model.train_on_batch(subsamples_in[i::total_batches][:min_batch_len], subsamples_out[i::total_batches][:min_batch_len])
        mean_tr_loss.append(train_loss)
        mean_tr_accuracy.append(train_accuracy)
    mean_accuracy = np.mean(mean_tr_accuracy)
    mean_loss = np.mean(mean_tr_loss)
    print("Mean Accuracy", mean_accuracy)
    print("Mean Loss", mean_loss)
    # File name carries epoch, mean accuracy and mean loss (the loss was passed
    # to format() before but had no placeholder, so it was silently dropped).
    filepath = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful/weights-{0}-{1}-{2}".format(epoch+1, mean_accuracy, mean_loss)
    # Make sure the target directory exists, then persist this epoch's weights.
    weights_dir = os.path.dirname(filepath)
    if not os.path.exists(weights_dir):
        os.makedirs(weights_dir)
    model.save_weights(filepath)

Epoch 1/15
model predict

In [19]:
# For one strided batch, print every input window as words, followed by the
# word the model predicts next (prefixed with "*").
start = 20
samples = subsamples_in[start::total_batches][:min_batch_len]
predictions = model.predict_on_batch(samples)
for window, predicted_vector in zip(samples, predictions):
    context_words = [index2word[word_index] for word_index in window]
    print(' '.join(context_words))
    # First entry returned by the word2vec lookup for the predicted vector.
    pred_word = word2vec_model.similar_by_vector(predicted_vector)[0][0]
    sys.stdout.write("*"+pred_word+" \n")

afc champion denver broncos defeated
in the pro bowl thomas
of the turf collapsed under
and post-game coverage while martin
super bowl record 61-yard return
freely and royal carps in
host a permanent exhibition of
the prince of płock bolesław
the plain vistula terraces flooded
a registration number depends on
the rivers of france evolved
force of franks into the
of arguments as to whether
did not have the rich
his earlier illnessin 1875 tesla
tesla 's patent would probably
of the tesla coil the
the letter s dotdotdot in
their former star inventor was
said that he had been
invented or envisioned by tesla
not intended as a practical
which are defined using quantum
would be a major breakthrough
be responsible for some or
demanding university entrance examinations receive
line of work including occupational
a standard bachelor 's degree
the holy roman emperor charles
soul from purgatory also attested
assembly of the estates of
the murderous thieving hordes of
and marriage servicesluther and his
speratus this and seven other
to us in his word
and bucer citing as a
that ignored other contributory factors
for the greater southern california
a heavily developed urban environment
home to los angeles international
rename its skydrive cloud storage
can get carriage on a
settlement the area now constituting
schools and just over 311800
arrived on 6 april 1652
the origin of the name
exodus huguenots remained in large
in 1530 william farel was
most cases was used for
in the case of model
with reduction gearing although the
1606 in 1698 thomas savery
and submarines either use a
were monatomic and that the
of welding and cutting metal
the decay of these organisms
materials in the synthesis of
for arms purchases that exacerbated
the saudis were forced to
including avionics telecommunications and computersthe
him on the condition that
descent propellant surface stay consumables
with the lunar module eliminating
walking distance of the surveyor
in lunar orbit the degree
to join because of lack
its work ' but it
justice has the final say
second a citizen or company
rights as general principle of
that a free trade area
36 or as a mandatory
hours a week in germany
fake laser gun services from
277 million tons 15 of
that the rainforest could be
has long been debated and
of the epidermis and have
their body size than adults
beroids have cydippid-like larvae it
the tower district were developed
san joaquin river parkway 's
in cities along sr 99
but still has the dlci
to a pad or by
the mid 14th-century epidemic as
by over 100 as no
more than 125 million deaths
to date sections of rock
of these experiments horizontal layers
elizabeth in 1589 a 25-foot
market itself was opened in
canny a versatile word meaning
another gangster film the 1988
joining up routes that are
the museum is a non-departmental
the refreshment rooms reinstated as
in the art library in
examples of ceramics especially iznik
and cataloged must be audited
received the talbot hughes collection
church of castello at fontignano
sculptors such as dalou who
the entire nbc blue network
the morning news program good
with its first broadcast running
september 2005 rumors circulated that
a war where the battlefield
the weekly budget for abc
and all of mca 's
program on the network 's
roone arledge which featured a
by a development fund for
a third of the revenue
that the abc network became
secret history of the mongols
turned over to temüjin by
remnants of the khwarezmid empire
khan decided to give the
fighting in central asia and
and historical figures such as
towards the use of trained
medications in 2013 being specialty
innate immune system provides an
not require activation in order
some autoimmune diseases such as
calcidiol into the steroid hormone
mainly on the observation that
the majority may be powerful
very man i have to
breaking the law for self-gratification
the design team a number
average contractor employed fewer than
to operate outside of government
who wish to pursue collegetechnical
was initially called new college
when charles william eliot a
program has been continuously among
populous city proper in florida
film production center ended the
households had children under the
growthanother cause is the rate
an example income inequality did
run-up in consumption inequality has
notarize transaction documents or having
life process of time lords
a television programme while remaining
of regeneration to permit the
mary tamm and lalla ward
the theme was released in
before it aired on fox
doctor who executive producer russell
the fourth doctor in the
of freedom of inquiry and
faculty and visiting scholars to
hunt to model un in
liberalism defender friedrich hayek meteorologist
modeling his government on the
and continue much of the
tibetan buddhism flourished although taoism
damaging to the mongol nation
used woodblocks to print paper
north-west ethiopia to the north
signified the ultimate defeat of
inquiry the waki commission commended
on kenya 's capital fm
comprising eight years in primary
climate change its potential impacts
group ii said that what
lockstep situation of the ipcc
live in shallow water have
approximately 150000 base pair genome
in terms of function the
only protein complex needed for
other types of plastids which
's list of primes up
chebyshev which states that there
for these insectsthe concept of
from flowing into the open
of the second east-west shipping
the rhone and danube drained
depended on whether a state
the terms of the scotland
recorded in text form in
including borrowing powers and some
the constituency and the member
the detriment of progressive moderates
as semi-legal and was the
and in 2007 it drove
and 1969 in jordan and
area it is not unusual
imperialism was largely focused on
europe in the middle period
manifest destiny through policies such
the baltimore christmas conference of
liturgical and charismatic and between
senior pastor 's right to
the council also determines whether
course of study at an
and the main effort by
november 1749 it went up
had sent a company of
so he ordered an attack
of the seven years '
of constant velocity unless acted
a parallelogram gives an equivalent
the feynman diagram represents any
the sources of the fields
way that the direction and
the spring meetings of the
oldest quarterback to play in
the 50 given to the
denver score at the end
the fighters of the warsaw
offer tesla to redesign a
kind of memory was tesla
used to convey the continuum
type of accountant other than
english translation of the bible
hymn from depths of woe
region of california is palm
uk limited is formerly known
what was required of huguenot
engines became popular for power
in what year did lavoisier
from the us became 5
crew members were required to
defendant in the case of
member state nationals by the
live plants were found to
where is the neighborhood of
who did internet2 partner with
to precisely date rocks within
newcastle provides the majority of
the va library 's collection
what is the former name
night football premiered to which
the merger between abc and
did jochi try to protect
conflict of interest involving doctors
of infection involves inserting a
thoreau was not a well
is an example of an
did harvard stadium become the
is the united states at
the vast disparities in wealth
the main reason for the
in the fall quarter of
decided not to come visit
what do ftsz1 and ftsz2
and 1 what would be
when did europe slowly begin
four years are the ordinary
the ottoman caliphate is believed
who were two of the
did king louis xv respond


In [20]:
def accuracy(start=27, threshold=0.85):
    """Print and return a similarity-based accuracy for one strided batch.

    The batch is ``subsamples_in[start::total_batches][:min_batch_len]``. For
    every sample the predicted vector and the target vector are each mapped to
    a word through ``word2vec_model.similar_by_vector``; the prediction counts
    as correct when ``word2vec_model.similarity`` of the two words is at least
    ``threshold``.

    Args:
        start: Offset of the strided batch to evaluate (default 27, the value
            that used to be hard-coded).
        threshold: Minimum word similarity for a prediction to count as
            correct (default 0.85, the value that used to be hard-coded).

    Returns:
        The fraction of correct predictions as a float; 0.0 when the batch is
        empty.
    """
    batch_in = subsamples_in[start::total_batches][:min_batch_len]
    ytrue = subsamples_out[start::total_batches][:min_batch_len]
    predictions = model.predict_on_batch(batch_in)
    count = 0
    correct = 0
    for index, prediction in enumerate(predictions):
        pred_word = word2vec_model.similar_by_vector(prediction)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue[index])[0][0]
        sim = word2vec_model.similarity(pred_word, true_word)
        if sim >= threshold:
            correct += 1
        count += 1
    # Convert before dividing so Python 2 does not truncate the ratio to 0,
    # and avoid dividing by zero when there was nothing to evaluate.
    accur = float(correct) / count if count else 0.0
    print('accuracy = ', float(accur))
    return accur

In [21]:
# n = no. of predictions
# The recorded output of this cell is an accuracy value, so the call that
# produces it is restored here.
accuracy()

accuracy =  0.1484375

In [ ]: