In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
vocabulary_size = 200
sentence_start_token = "START"
sentence_end_token = "END"
f = open('data/ratings_train.txt', 'r')
lines = f.readlines()
for i in range(len(lines)):
    lines[i] = lines[i].replace("\n", "")
reader = []
for line in lines:
    # Each line of ratings_train.txt is tab-separated; the review text is the second field.
    line_document = line.split("\t")[1]
    reader.append(line_document)
f.close()
In [3]:
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in reader[:1000]]
In [4]:
from konlpy.tag import Twitter
pos_tagger = Twitter()
def tokenize(doc):
    # Return tokens of the form "surface/POS" (normalized and stemmed by the Twitter tagger).
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
tokenized_sentences = [tokenize(row) for row in sentences]
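Each entry of tokenized_sentences is a list of these "surface/POS" strings. A quick check of the format on the first sentence:
In [ ]:
# Print the first few POS-tagged tokens of the first sentence.
print(tokenized_sentences[0][:5])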
In [5]:
vocab = [t for d in tokenized_sentences for t in d]
In [6]:
Verb_Noun_Adjective_Alpha_in_text = []
index = 0
for text in tokenized_sentences:
    Verb_Noun_Adjective_Alpha_in_text.append([])
    for word in text:
        parts_of_speech = word.split("/")
        if parts_of_speech[1] in ["Noun", "Verb", "Adjective"]:
            Verb_Noun_Adjective_Alpha_in_text[index].append(parts_of_speech[0])
        elif parts_of_speech[1] == "Alpha" and len(parts_of_speech[0]) in (3, 5):
            # Keep the Alpha-tagged sentence markers: START (5 chars) and END (3 chars).
            Verb_Noun_Adjective_Alpha_in_text[index].append(parts_of_speech[0])
    index += 1
In [7]:
Verb_Noun_Adjective_Alpha_in_text_tokens = [t for d in Verb_Noun_Adjective_Alpha_in_text for t in d]
In [8]:
import nltk
real_tokens = nltk.Text(Verb_Noun_Adjective_Alpha_in_text_tokens, name='RNN')
In [9]:
real_tokens_freq = real_tokens.vocab().most_common(vocabulary_size-1)
In [10]:
index_to_word = [x[0] for x in real_tokens_freq]
index_to_word.append("unknown")
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
In [11]:
for i, sent in enumerate(Verb_Noun_Adjective_Alpha_in_text):
    tokenized_sentences[i] = [w if w in word_to_index else "unknown" for w in sent]
In [12]:
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
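The targets are simply the inputs shifted left by one position, i.e. each word is trained to predict the next one. A quick sanity check on the first sentence (a sketch):
In [ ]:
# Every target should equal the following input token.
print(all(X_train[0][t+1] == y_train[0][t] for t in range(len(y_train[0]) - 1)))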
In [13]:
X_train[0]
Out[13]:
In [13]:
class RNNNumpy:
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Initialize parameters uniformly in [-1/sqrt(n), 1/sqrt(n)], where n is the fan-in.
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
In [14]:
def forward_propagation(self, x):
    T = len(x)
    # s holds the hidden states; s[-1] is the initial state (all zeros).
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # o holds the output distribution over the vocabulary at each time step.
    o = np.zeros((T, self.word_dim))
    for t in np.arange(T):
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation
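For reference, the forward pass above implements the standard vanilla-RNN recurrence, where x_t is the index of the t-th input word (so multiplying U by the one-hot input reduces to selecting column x_t of U):

$$ s_t = \tanh(U x_t + W s_{t-1}), \qquad o_t = \mathrm{softmax}(V s_t) $$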
In [15]:
def predict(self, x):
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict
tokenized_sentences[0]
Out[15]:
In [16]:
np.random.seed(100)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[0])
print (o.shape)
print (s.shape)
In [17]:
X_train[0]
Out[17]:
In [18]:
y_train[0]
Out[18]:
In [19]:
predictions = model.predict(X_train[0])
print (predictions.shape)
print (predictions)
In [20]:
def calculate_total_loss(self, x, y):
    L = 0
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # Pick out the predicted probability of each correct next word.
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    # Average the total loss over the number of training words.
    N = sum(len(y_i) for y_i in y)
    return self.calculate_total_loss(x, y) / N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss
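The quantity computed by calculate_loss is the averaged cross-entropy: for N training words with targets y and predicted distributions o,

$$ L(y, o) = -\frac{1}{N} \sum_{n=1}^{N} \log o_{n,\,y_n} $$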
In [21]:
# Limit to 1000 examples to save time
print ("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print ("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))
In [22]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])
            dLdU[:, x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt
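The truncated-BPTT gradients above can be verified against numerical gradients. Below is a minimal gradient-check sketch (the gradient_check function and the tiny test model are illustrative additions, not part of the original notebook); it assumes bptt_truncate is set larger than the sequence length so the truncated gradients match the full ones.
In [ ]:
def gradient_check(model, x, y, h=0.001, error_threshold=0.01):
    # Analytic gradients from BPTT.
    bptt_gradients = model.bptt(x, y)
    for pidx, pname in enumerate(['U', 'V', 'W']):
        parameter = getattr(model, pname)
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            original_value = parameter[ix]
            # Central-difference estimate of dL/d(parameter[ix]).
            parameter[ix] = original_value + h
            gradplus = model.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus) / (2 * h)
            parameter[ix] = original_value
            backprop_gradient = bptt_gradients[pidx][ix]
            relative_error = np.abs(backprop_gradient - estimated_gradient) / \
                             (np.abs(backprop_gradient) + np.abs(estimated_gradient) + 1e-12)
            if relative_error > error_threshold:
                print("Gradient check failed: parameter=%s index=%s relative error=%e" % (pname, ix, relative_error))
                return
            it.iternext()
    print("Gradient check passed for U, V and W.")

# Use a tiny vocabulary and hidden size so the numerical loop stays fast.
grad_check_model = RNNNumpy(100, hidden_dim=10, bptt_truncate=1000)
gradient_check(grad_check_model, [0, 1, 2, 3], [1, 2, 3, 4])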
In [23]:
def numpy_sgd_step(self, x, y, learning_rate):
    # One SGD step: compute the BPTT gradients and update the parameters.
    dLdU, dLdV, dLdW = self.bptt(x, y)
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sgd_step
In [24]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss every few epochs.
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
        # One SGD step per training sentence.
        for i in range(len(y_train)):
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    return losses
In [25]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)
In [26]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)
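The (num_examples_seen, loss) pairs collected above can be visualized with the matplotlib import from the first cell; a minimal plotting sketch:
In [ ]:
# Plot the training-loss curve returned by train_with_sgd.
plt.plot([n for n, l in losses], [l for n, l in losses], marker='o')
plt.xlabel('examples seen')
plt.ylabel('average cross-entropy loss')
plt.show()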
In [27]:
from rnn_theano import RNNTheano, gradient_check_theano
In [28]:
from utils import load_model_parameters_theano, save_model_parameters_theano
model = RNNTheano(vocabulary_size, hidden_dim=100)
train_with_sgd(model, X_train, y_train, nepoch=50)
save_model_parameters_theano('./data/trained-model-sion_consider.npz', model)
load_model_parameters_theano('./data/trained-model-sion_consider.npz', model)
In [29]:
print(len(model.V.get_value()))
In [39]:
def generate_sentence(model):
    # Start with the START token and sample words until the END token is produced.
    new_sentence = [word_to_index[sentence_start_token]]
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index["unknown"]
        # Keep sampling until we draw something other than the "unknown" token.
        while sampled_word == word_to_index["unknown"]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

num_sentences = 2
senten_min_length = 5
for i in range(num_sentences):
    sent = []
    # Keep generating until the sentence is at least senten_min_length words long.
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))
In [ ]: