In [ ]:
__author__ = "Pujun Bhatnagar"
__version__ = "Stanford, Spring 2016"

In [ ]:
from __future__ import division, print_function, absolute_import
import json
from pprint import pprint
import pickle
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import math
import csv
import numpy as np

In [ ]:
# file paths, data split, and training settings
filepath = '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/snli_sick.jsonl'
percentage_split = .7
num_epoch = 10
saved_model_name = "SNLI_SICK_custom_embeddings_w_extended_vocab_FC2"

In [ ]:
# vocab maps each token to a unique integer id; ids start at 1 so that 0 stays free for padding
vocab = {}
word_count = 1

def parse_data(json_data):
    """Convert each JSON record into a list of vocab indices (X) and collect its label (Y)."""
    global word_count
    
    X = []
    Y = []
    for d in json_data:
        current_attribute_list = []
        words = word_tokenize(d['example'].lower())  # tokenize and lowercase the example text
        for w in words:
            if w not in vocab:
                vocab[w] = word_count
                word_count += 1
            current_attribute_list.append(vocab[w])
        X.append(current_attribute_list)
        Y.append(d['label'])

    return (X, Y)
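
In [ ]:
# Optional sanity check on parse_data with a tiny, hypothetical record in the same
# shape as the jsonl data ('example' text plus a 'label'). Note that this also adds
# the demo tokens to the global vocab, which is harmless but worth knowing.
demo_records = [{'example': 'A man is playing a guitar.', 'label': 0}]
demo_X, demo_Y = parse_data(demo_records)
print(demo_X, demo_Y)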

In [ ]:
raw_filepath = '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/'
rawDataPath = raw_filepath + "raw_data.txt"

# extend the vocabulary with tokens from the raw text file as well
with open(rawDataPath) as f:
    lines = [line.strip() for line in f]

    for l in lines:
        words = word_tokenize(l)
        for w in words:
            if w not in vocab:
                vocab[w] = word_count
                word_count += 1
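
In [ ]:
# Quick check: how large the vocabulary is after the raw-data pass.
print("Vocabulary size so far:", len(vocab))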

In [ ]:
# load the SNLI/SICK data: one JSON object per line
data = []
with open(filepath) as f:
    for line in f:
        data.append(json.loads(line))
    X, Y = parse_data(data)

In [ ]:
print("Number of examples:", len(X))
print("Number of distinct words:", word_count)

In [ ]:
data_length_list = [len(eg) for eg in X]
num_words_in_longest_sentence = max(data_length_list)

In [ ]:
print("Length of the biggest sentence:", num_words_in_longest_sentence)

In [ ]:
num_words_in_longest_sentence = 512  # the longest example is 392 tokens; pad everything to a fixed length of 512
num_training_examples = int(math.ceil(len(X) * percentage_split))
print("Number of training examples:", num_training_examples)
trainX = X[:num_training_examples]
trainY = Y[:num_training_examples]

testX = X[num_training_examples:]
testY = Y[num_training_examples:]
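
In [ ]:
# Sanity check on the 70/30 split: the two halves should add up to len(X).
print("Training examples:", len(trainX), "| Test examples:", len(testX))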

In [ ]:
# Data preprocessing
# Sequence padding 
trainX = pad_sequences(trainX, maxlen=num_words_in_longest_sentence, value=0.)
testX = pad_sequences(testX, maxlen=num_words_in_longest_sentence, value=0.)

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
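
In [ ]:
# After padding and one-hot encoding, inputs and labels are dense numpy arrays;
# expect shapes (num_examples, 512) for X and (num_examples, 2) for Y.
print("trainX:", trainX.shape, "trainY:", trainY.shape)
print("testX:", testX.shape, "testY:", testY.shape)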

In [ ]:
# Network building
print("num_words_in_longest_sentence:", num_words_in_longest_sentence)
layer_input = tflearn.input_data([None, num_words_in_longest_sentence])
embedding = tflearn.embedding(layer_input, input_dim=word_count, output_dim=128, name='my_embedding')
lstm = tflearn.lstm(embedding, 128)
dropout = tflearn.dropout(lstm, 0.5)
softmax = tflearn.fully_connected(dropout, 2, activation='softmax')
net = tflearn.regression(softmax, optimizer='adam',
                         loss='categorical_crossentropy')

In [ ]:
# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)  # clip_gradients=0. disables gradient clipping
model.fit(trainX, trainY, n_epoch=num_epoch, validation_set=(testX, testY),
          show_metric=True, batch_size=128)
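
In [ ]:
# Optional: report held-out accuracy explicitly via tflearn's evaluate()
# (the same accuracy metric that show_metric=True prints during training).
print("Test accuracy:", model.evaluate(testX, testY, batch_size=128))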

In [ ]:
model.save(saved_model_name)

In [ ]:
# dumping the vocab
with open('lstm_vocab_with_augmented_vocab', 'wb') as f:  # binary mode is required for pickle
    pickle.dump(vocab, f)
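
In [ ]:
# Sketch of loading the vocab back later (e.g. at inference time); the file must be
# opened in binary mode to match the binary dump above.
with open('lstm_vocab_with_augmented_vocab', 'rb') as f:
    reloaded_vocab = pickle.load(f)
print("Reloaded vocab entries:", len(reloaded_vocab))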

In [ ]:
# re-wrap the network output in a fresh DNN for inference only, then restore the saved weights
model = tflearn.DNN(softmax)

In [ ]:
model.load(saved_model_name)
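
In [ ]:
# Minimal inference sketch (assumptions: unknown words map to the padding index 0,
# and the meaning of the two output classes matches however d['label'] was encoded).
def predict_example(text):
    tokens = word_tokenize(text.lower())
    indices = [vocab.get(t, 0) for t in tokens]
    padded = pad_sequences([indices], maxlen=num_words_in_longest_sentence, value=0.)
    return model.predict(padded)

print(predict_example("A man is playing a guitar."))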