In [1]:
from __future__ import division, print_function, absolute_import
import json
from pprint import pprint
import pickle
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # word_tokenize needs the 'punkt' data: nltk.download('punkt')
import math
import csv
import numpy as np
In [2]:
# paths and hyperparameters
filepath = '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/hi.json'
percentage_split = .7   # fraction of examples used for training
GLOVE_SIZE = 50         # dimensionality of the GloVe vectors
In [3]:
import os
# GLV_HOME should point at the directory containing the pre-trained GloVe files
path_to_glove = os.environ.get("GLV_HOME")
In [4]:
def glove2dict(src_filename):
    """GloVe reader.

    Parameters
    ----------
    src_filename : str
        Full path to the GloVe file to be processed.

    Returns
    -------
    dict
        Mapping from words to their GloVe vectors.
    """
    reader = csv.reader(open(src_filename), delimiter=' ', quoting=csv.QUOTE_NONE)
    return {line[0]: np.array(list(map(float, line[1:]))) for line in reader}
In [5]:
glove = glove2dict(os.path.join(path_to_glove,
'glove.6B.%dd.txt' % GLOVE_SIZE))
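A quick sanity check on the loaded embeddings (a minimal sketch; it assumes the common word 'the' is in the 6B vocabulary, which it should be):
In [ ]:
print("GloVe vocabulary size:", len(glove))
print("vector dimension:", glove['the'].shape)  # expected: (GLOVE_SIZE,)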
In [6]:
vocab = set()

def parse_data_using_glove(json_data, num_examples_to_read=10000, num_words_in_longest_sentence=82):
    """Turn JSON records into (X, Y), where each sentence becomes a
    num_words_in_longest_sentence x GLOVE_SIZE matrix of GloVe vectors."""
    Y = []
    X = np.random.rand(num_examples_to_read, num_words_in_longest_sentence, GLOVE_SIZE)
    for i, d in enumerate(json_data):
        if i >= num_examples_to_read:
            break
        current_attribute_list = np.random.rand(num_words_in_longest_sentence, GLOVE_SIZE)
        tokenized_and_lowercase = word_tokenize(d['example'].lower())
        for j, w in enumerate(tokenized_and_lowercase):
            if w in glove:
                current_attribute_list[j, :] = glove[w]
            # out-of-vocabulary words keep their random initialization
            vocab.add(w)
        # zero-pad the positions after the last token
        for j in range(len(tokenized_and_lowercase), num_words_in_longest_sentence):
            current_attribute_list[j, :] = np.zeros(GLOVE_SIZE)
        X[i, :, :] = current_attribute_list
        Y.append(d['label'])
    return (X, np.array(Y))
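A minimal usage sketch on a hand-built record, just to show the expected {'example': ..., 'label': ...} shape of each JSON line (the sentence and label here are made up, and note that this also adds the toy tokens to the global vocab set):
In [ ]:
toy_data = [{'example': 'This movie was great.', 'label': 1}]
toy_X, toy_Y = parse_data_using_glove(toy_data, num_examples_to_read=1)
print(toy_X.shape, toy_Y.shape)  # (1, 82, 50) (1,)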
In [7]:
data = []
with open(filepath) as f:
    for line in f:
        data.append(json.loads(line))
X, Y = parse_data_using_glove(data)
word_count = len(vocab)
print("Vocab size:", word_count)
In [8]:
print(X.shape)        # (num_examples, 82, 50)
print(X[1][0].shape)  # GloVe vector for the first token of the second example: (50,)
In [9]:
# every example was already padded to 82 rows above, so this just recovers that length
data_length_list = [len(eg) for eg in X]
num_words_in_longest_sentence = max(data_length_list)
In [10]:
print("Length of the biggest sentence:", num_words_in_longest_sentence)
In [11]:
num_training_examples = int(math.ceil(len(X) * percentage_split))
print("number of training examples:", num_training_examples)
# X and Y are already NumPy arrays, so slice them directly
trainX = X[:num_training_examples]
trainY = Y[:num_training_examples]
testX = X[num_training_examples:]
testY = Y[num_training_examples:]
print(trainX.shape)
print(trainY.shape)
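The split above assumes the JSON lines are not ordered by label; if they might be, shuffling before the split is safer. A minimal sketch (the seed is arbitrary):
In [ ]:
rng = np.random.RandomState(0)
perm = rng.permutation(len(X))
X_shuf, Y_shuf = X[perm], Y[perm]
trainX, trainY = X_shuf[:num_training_examples], Y_shuf[:num_training_examples]
testX, testY = X_shuf[num_training_examples:], Y_shuf[num_training_examples:]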
In [12]:
# Data preprocessing
# Sequence padding was already done inside parse_data_using_glove, so the
# tflearn pad_sequences step is not needed here:
# trainX = pad_sequences(trainX, maxlen=num_words_in_longest_sentence, value=0.)
# testX = pad_sequences(testX, maxlen=num_words_in_longest_sentence, value=0.)
# Converting labels to binary (one-hot) vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
word_count = len(vocab)
print("num_words_in_longest_sentence:", num_words_in_longest_sentence)
print("GLOVE_SIZE:", GLOVE_SIZE)
print("word_count:", word_count)
print("number of examples:", len(X), "where each example holds", X[0].size, "values")
In [13]:
# Network building
net = tflearn.input_data(shape=[None, num_words_in_longest_sentence, GLOVE_SIZE], name='input')
# return_seq=True so the full output sequence is fed to the second LSTM layer
net = tflearn.lstm(net, 82, return_seq=True)
net = tflearn.dropout(net, 0.5)
net = tflearn.lstm(net, 82)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam',
                         loss='categorical_crossentropy')
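If the stacked model proves hard to train, a shallower single-LSTM baseline is a common alternative. This is a sketch only; the 128 units and the 0.8 keep-probability are arbitrary choices, not values from this notebook:
In [ ]:
net1 = tflearn.input_data(shape=[None, num_words_in_longest_sentence, GLOVE_SIZE])
net1 = tflearn.lstm(net1, 128, dropout=0.8)  # single LSTM layer, so no return_seq needed
net1 = tflearn.fully_connected(net1, 2, activation='softmax')
net1 = tflearn.regression(net1, optimizer='adam', loss='categorical_crossentropy')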
In [14]:
# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
# validate on the held-out split rather than on the training data itself
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=128)
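After training, the held-out split can be scored and the weights persisted; a minimal sketch using tflearn.DNN's evaluate and save (the filename is a made-up example):
In [ ]:
print("test accuracy:", model.evaluate(testX, testY, batch_size=128))
model.save('lstm_nlu_model.tflearn')  # hypothetical filename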
In [ ]: