In [1]:
from __future__ import division, print_function, absolute_import
import json
from pprint import pprint
import pickle
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import math
import csv
import numpy as np

In [2]:
# setting the paths
filepath = '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/hi.json'
percentage_split = .7
GLOVE_SIZE = 50

In [3]:
import os
path_to_glove = os.environ.get("GLV_HOME")

In [4]:
def glove2dict(src_filename):
    """GloVe Reader.
    
    Parameters
    ----------
    src_filename : str
        Full path to the GloVe file to be processed.

    Returns
    -------
    dict
        Mapping words to their GloVe vectors.
    
    """
    reader = csv.reader(open(src_filename), delimiter=' ', quoting=csv.QUOTE_NONE)    
    return {line[0]: np.array(list(map(float, line[1: ]))) for line in reader}

In [5]:
glove = glove2dict(os.path.join(path_to_glove, 
            'glove.6B.%dd.txt' % GLOVE_SIZE))

In [6]:
vocab = set()
def parse_data_using_glove(json_data, num_examples_to_read=10000, num_words_in_longest_sentence=82):
    Y = []
    
    X = np.random.rand(num_examples_to_read, 82, 50)
    for i, d in enumerate(json_data):
        if i >= num_examples_to_read:
            break
        current_attribute_list = np.random.rand(82, 50)
        tokenized_and_lowercase = word_tokenize(d['example'].lower())
        for j,w in enumerate(tokenized_and_lowercase):
            current_attribute_list[j,:] = np.array(glove.get(w))
            vocab.add(w)

        for j in range(len(tokenized_and_lowercase), num_words_in_longest_sentence):
            current_attribute_list[j,:] = np.zeros(50);
        
        X[i,:, :] = current_attribute_list
        Y.append(d['label'])

    return (X, np.array(Y))

In [7]:
data = []
with open(filepath) as f:
    for line in f:
        data.append(json.loads(line))
    X, Y = parse_data_using_glove(data)
word_count = len(vocab)
print("Vocab size:", word_count)


Vocab size: 5430

In [8]:
print(X.shape)
print (np.array(X)[1][0].shape)


(10000, 82, 50)
(50,)

In [9]:
data_length_list = [len(eg) for eg in X]
num_words_in_longest_sentence = max(data_length_list)

In [10]:
print("Length of the biggest sentence:", num_words_in_longest_sentence)


Length of the biggest sentence: 82

In [11]:
num_training_examples = int(math.ceil(len(X) * percentage_split))
print("number of training examples:", num_training_examples)
npX = np.array(X)
npY = np.array(Y)
trainX = npX[:num_training_examples]
trainY = npY[:num_training_examples]

testX = X[num_training_examples:]
testY = Y[num_training_examples:]


print(trainX.shape)
print(trainY.shape)


number of training examples: 7000
(7000, 82, 50)
(7000,)

In [12]:
# Data preprocessing
# Sequence padding 
# trainX = pad_sequences(trainX, maxlen=num_words_in_longest_sentence, value=0.)
# testX = pad_sequences(testX, maxlen=num_words_in_longest_sentence, value=0.)

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
word_count = len(vocab)
print("num_words_in_longest_sentence:", num_words_in_longest_sentence)
print("GLOVE_SIZE:", GLOVE_SIZE)
print("word_count",  word_count)
print("dimension of X", len(X), "where each element is", X[0].size)


num_words_in_longest_sentence: 82
GLOVE_SIZE: 50
word_count 5430
dimension of X 10000 where each element is 4100

In [13]:
# Network building
net = tflearn.input_data(shape=[None, 82, 50],name='input')
net = tflearn.lstm(net, 82, return_seq=True)
net = tflearn.dropout(net,0.5)
net = tflearn.lstm(net, 82)
net = tflearn.dropout(net,0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam',
                         loss='categorical_crossentropy')

In [14]:
# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(trainX, trainY), show_metric=True,
          batch_size=128)


Training Step: 550  | total loss: nan
| Adam | epoch: 010 | loss: nan - acc: 0.4969 | val_loss: nan - val_acc: 0.5000 -- iter: 7000/7000
Training Step: 550  | total loss: nan
| Adam | epoch: 010 | loss: nan - acc: 0.4969 | val_loss: nan - val_acc: 0.5000 -- iter: 7000/7000
--

In [ ]: