Installation

Execute the following commands to install the requirements. First, install pip:

Ubuntu/Linux 64-bit

sudo apt-get install python-pip python-dev

Mac OS X

sudo easy_install pip

Then install the TensorFlow 0.8.0 wheel for your platform.

Ubuntu/Linux 64-bit, CPU only:

sudo pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.8.0-cp27-none-linux_x86_64.whl

Ubuntu/Linux 64-bit, GPU enabled. Requires CUDA toolkit 7.5 and cuDNN v4 (for
other versions, see the TensorFlow "Installing from sources" instructions):

sudo pip install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.8.0-cp27-none-linux_x86_64.whl

Mac OS X, CPU only:

sudo easy_install --upgrade six
sudo pip install --upgrade https://storage.googleapis.com/tensorflow/mac/tensorflow-0.8.0-py2-none-any.whl

Finally, install TFLearn from GitHub:

pip install git+https://github.com/tflearn/tflearn.git
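TensorFlow and TFLearn should now import cleanly; a quick, optional sanity check (any error here points at an installation problem):

In [ ]:
# optional sanity check: both imports should succeed without errors
import tensorflow as tf
import tflearn
print("TensorFlow and TFLearn imported successfully")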


In [1]:
from __future__ import division, print_function, absolute_import
import json
from pprint import pprint
import pickle
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import math
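word_tokenize relies on NLTK's Punkt tokenizer models; if they are not installed yet, a one-time download is required:

In [ ]:
# one-time download of the tokenizer models used by word_tokenize
nltk.download('punkt')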

In [ ]:
# path to the SNLI-style JSONL file and the train/test split ratio
filepath = '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/transfer_learning_snli.jsonl'
percentage_split = .7

In [2]:
vocab = {}
word_count = 1  # start at 1 so that index 0 stays free for sequence padding

def parse_data(json_data):
    # Convert each example into a list of word indices and collect its label.
    global word_count

    X = []
    Y = []
    for d in json_data:
        word_indices = []
        words = word_tokenize(d['example'].lower())
        for w in words:
            if w not in vocab:
                vocab[w] = word_count
                word_count += 1
            word_indices.append(vocab[w])
        X.append(word_indices)
        Y.append(d['label'])

    return (X, Y)
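For reference, each line of the JSONL file is expected to be a JSON object with an 'example' field holding the sentence and a 'label' field holding the 0/1 class used later by to_categorical; a made-up line for illustration:

{"example": "A man is sleeping.", "label": 0}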

In [3]:
data = []
with open(filepath) as f:
    # one JSON object per line (JSON Lines format)
    for line in f:
        data.append(json.loads(line))

X, Y = parse_data(data)

In [4]:
print("Number of examples:", len(X))
print("Number of distinct words:", word_count)


Number of examples: 380226
Vocabulary size (including the padding index): 24257

In [5]:
with open('SNLIdata', 'wb') as f:
    pickle.dump(data, f)
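The pickled examples can be reloaded later without re-parsing the JSONL file:

In [ ]:
# reload the parsed examples (matches the binary mode used above)
with open('SNLIdata', 'rb') as f:
    data = pickle.load(f)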

In [6]:
data_length_list = [len(eg) for eg in X]
num_words_in_longest_sentence = max(data_length_list)

In [7]:
print("Length of the biggest sentence:", num_words_in_longest_sentence)


Length of the longest sentence: 82
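The split in the next cell simply takes the first 70% of examples in file order; if the file happens to be sorted (for example by label), shuffling X and Y together first keeps the training and test sets representative. A minimal sketch, assuming the ordering is not already random:

In [ ]:
# optional: shuffle examples and labels together before splitting
import random
random.seed(0)
combined = list(zip(X, Y))
random.shuffle(combined)
X, Y = map(list, zip(*combined))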

In [11]:
num_training_examples = int(math.ceil(len(X) * percentage_split))
print(num_training_examples)
trainX = X[:num_training_examples]
trainY = Y[:num_training_examples]

testX = X[num_training_examples:]
testY = Y[num_training_examples:]


266159

In [12]:
# Data preprocessing
# Sequence padding 
trainX = pad_sequences(trainX, maxlen=num_words_in_longest_sentence, value=0.)
testX = pad_sequences(testX, maxlen=num_words_in_longest_sentence, value=0.)

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
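As a small, made-up illustration of these two helpers: pad_sequences pads each index list with the value 0 up to the fixed length, and to_categorical turns each 0/1 label into a one-hot vector.

In [ ]:
# tiny illustration of the preprocessing steps on made-up input
demo_X = pad_sequences([[3, 1, 4]], maxlen=6, value=0.)
print(demo_X.shape)  # (1, 6): one row padded with zeros to length 6
demo_Y = to_categorical([0, 1], nb_classes=2)
print(demo_Y)        # one-hot vectors: [[1. 0.], [0. 1.]]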

In [13]:
# Network building
net = tflearn.input_data([None, num_words_in_longest_sentence])
# input_dim=word_count covers indices 0..word_count-1 (0 is the padding index)
net = tflearn.embedding(net, input_dim=word_count, output_dim=128)
net = tflearn.lstm(net, 128)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam',
                         loss='categorical_crossentropy')

In [ ]:
# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=128)


Training Step: 3884  | total loss: 0.28453
| Adam | epoch: 001 | loss: 0.28453 - acc: 0.8805 -- iter: 230912/266159
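After training, the model can be saved and used to score new sentences that go through the same preprocessing. A minimal sketch (sentence_to_indices is a hypothetical helper, the filename is illustrative, and out-of-vocabulary words are simply dropped):

In [ ]:
# sketch: persist the trained weights and score one new sentence
model.save('snli_lstm.tflearn')

def sentence_to_indices(sentence):
    # hypothetical helper: map known words to their indices, skip unknown ones
    return [vocab[w] for w in word_tokenize(sentence.lower()) if w in vocab]

new_X = pad_sequences([sentence_to_indices("A man is sleeping.")],
                      maxlen=num_words_in_longest_sentence, value=0.)
print(model.predict(new_X))  # probabilities for the two classes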
