In [ ]:
__author__ = "Pujun Bhatnagar"
__version__ = "Stanford, Spring 2016"

In [ ]:
from __future__ import division, print_function, absolute_import
import json
from pprint import pprint
import pickle
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import math
import csv
import numpy as np
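
In [ ]:
# word_tokenize relies on NLTK's 'punkt' tokenizer models; this one-time download
# is only needed if they are not already installed locally (a hedged setup step).
nltk.download('punkt')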

In [ ]:
# setting the paths
filepath = '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/snli_sick.jsonl'
percentage_split = .7
GLOVE_SIZE = 50
num_epoch = 10
saved_model_name = "SNLI_SICK_GloVe_embeddings_FC2"

In [ ]:
import os
# Directory holding the pre-trained GloVe files; set GLV_HOME in the shell environment.
path_to_glove = os.environ.get("GLV_HOME")
assert path_to_glove is not None, "GLV_HOME environment variable is not set"

In [ ]:
def glove2dict(src_filename):
    """GloVe Reader.
    
    Parameters
    ----------
    src_filename : str
        Full path to the GloVe file to be processed.

    Returns
    -------
    dict
        Mapping words to their GloVe vectors.
    
    """
    with open(src_filename) as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        return {line[0]: np.array(list(map(float, line[1:]))) for line in reader}

In [ ]:
glove = glove2dict(os.path.join(path_to_glove, 
            'glove.6B.%dd.txt' % GLOVE_SIZE))
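
In [ ]:
# Quick sanity check on the loaded embeddings (a minimal sketch; 'the' is assumed
# to be present in the GloVe 6B vocabulary).
print("Number of GloVe vectors:", len(glove))
print("Dimensionality of 'the':", glove['the'].shape)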

In [ ]:
vocab = set()
def parse_data_using_glove(json_data, num_words_in_longest_sentence=512):
    """Map each example to a (num_words_in_longest_sentence, GLOVE_SIZE) matrix of GloVe vectors."""
    Y = []
    X = np.random.rand(len(json_data), num_words_in_longest_sentence, GLOVE_SIZE)
    for i, d in enumerate(json_data):
        current_attribute_list = np.random.rand(num_words_in_longest_sentence, GLOVE_SIZE)
        # Truncate overly long examples so they fit the padded length.
        tokenized_and_lowercase = word_tokenize(d['example'].lower())[:num_words_in_longest_sentence]
        for j, w in enumerate(tokenized_and_lowercase):
            # Fall back to a random vector for tokens missing from GloVe.
            current_attribute_list[j, :] = glove.get(w, np.random.rand(GLOVE_SIZE))
            vocab.add(w)

        # Pad the remaining positions with a constant vector.
        for j in range(len(tokenized_and_lowercase), num_words_in_longest_sentence):
            current_attribute_list[j, :] = np.ones(GLOVE_SIZE) * 0.5

        X[i, :, :] = current_attribute_list
        Y.append(d['label'])

    return (X, np.array(Y))
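
In [ ]:
# Minimal usage sketch of parse_data_using_glove on a single hand-written record.
# The 'example'/'label' field names mirror the JSONL schema assumed above; note that
# this also adds the toy tokens to the global `vocab` set.
toy_record = [{'example': 'A man is playing a guitar.', 'label': 0}]
toy_X, toy_Y = parse_data_using_glove(toy_record)
print(toy_X.shape, toy_Y.shape)  # expected: (1, 512, 50) (1,)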

In [ ]:
data = []
with open(filepath) as f:
    for line in f:
        data.append(json.loads(line))

X, Y = parse_data_using_glove(data)
print("X shape: %s, Y shape: %s" % (X.shape, Y.shape))
word_count = len(vocab)
print("Vocab size:", word_count)

In [ ]:
print(X.shape)
print(X[1][0].shape)  # GloVe vector for the first token of the second example

In [ ]:
# X is already padded, so this simply recovers the (common) padded length.
data_length_list = [len(eg) for eg in X]
num_words_in_longest_sentence = max(data_length_list)

In [ ]:
print("Length of the biggest sentence:", num_words_in_longest_sentence)

In [ ]:
num_training_examples = int(math.ceil(len(X) * percentage_split))
print("number of training examples:", num_training_examples)
npX = np.array(X)
npY = np.array(Y)
trainX = npX[:num_training_examples]
trainY = npY[:num_training_examples]

testX = npX[num_training_examples:]
testY = npY[num_training_examples:]

print(trainX.shape)
print(trainY.shape)

In [ ]:
# Data preprocessing
# Sequence padding 
# trainX = pad_sequences(trainX, maxlen=num_words_in_longest_sentence, value=0.)
# testX = pad_sequences(testX, maxlen=num_words_in_longest_sentence, value=0.)

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
word_count = len(vocab)
print("num_words_in_longest_sentence:", num_words_in_longest_sentence)
print("GLOVE_SIZE:", GLOVE_SIZE)
print("word_count",  word_count)
print("dimension of X", len(X), "where each element is", X[0].size)

In [ ]:
# Network building
layer_input = tflearn.input_data(shape=[None, num_words_in_longest_sentence, GLOVE_SIZE], name='input')
lstm_layer = tflearn.lstm(layer_input, 512)
dropout = tflearn.dropout(lstm_layer, 0.5)
softmax = tflearn.fully_connected(dropout, 2, activation='softmax')
net = tflearn.regression(softmax, optimizer='adam',
                         loss='categorical_crossentropy', learning_rate=1e-7)

In [ ]:
# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
model.fit(trainX, trainY, n_epoch=num_epoch, validation_set=(testX, testY), show_metric=True,
          batch_size=32)
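
In [ ]:
# Hedged post-training check: tflearn.DNN.evaluate returns the model's metric
# (accuracy here) on a held-out set; this simply reuses the validation split from above.
print("Test accuracy:", model.evaluate(testX, testY))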

In [ ]:
model.save(saved_model_name)
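
In [ ]:
# To restore the trained weights later, rebuild the same network graph and load the
# saved file, e.g. (sketch, left commented out to avoid re-running by accident):
# model.load(saved_model_name)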