In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
vocabulary_size = 200
sentence_start_token = "START"
sentence_end_token = "END"

f = open('data/ratings_train.txt', 'r')
lines = f.readlines()
for i in range(len(lines)):
    lines[i] = lines[i].replace("/n","").replace("\n","")
reader = []
for line in lines:
    line_document = line.split("\t")[1]
    reader.append(line_document)
f.close()

In [3]:
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in reader[:1000]]

In [4]:
from konlpy.tag import Twitter
pos_tagger = Twitter()
def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
tokenized_sentences = [tokenize(row) for row in sentences]

In [5]:
vocab = [t for d in tokenized_sentences for t in d]

In [6]:
Verb_Noun_Adjective_Alpha_in_text = []
index = 0
for text in tokenized_sentences:
    Verb_Noun_Adjective_Alpha_in_text.append([])
    for word in text:
        parts_of_speech = word.split("/")
        if parts_of_speech[1] in ["Noun","Verb","Adjective"] :
            Verb_Noun_Adjective_Alpha_in_text[index].append(word.split("/")[0])
        elif parts_of_speech[1] in ["Alpha"] and len(parts_of_speech[0]) ==3 or len(parts_of_speech[0]) ==5:
            Verb_Noun_Adjective_Alpha_in_text[index].append(word.split("/")[0])            
    index += 1

In [7]:
Verb_Noun_Adjective_Alpha_in_text_tokens = [t for d in Verb_Noun_Adjective_Alpha_in_text for t in d]

In [8]:
import nltk
real_tokens = nltk.Text(Verb_Noun_Adjective_Alpha_in_text_tokens, name='RNN')

In [9]:
real_tokens_freq = real_tokens.vocab().most_common(vocabulary_size-1)

In [10]:
index_to_word = [x[0] for x in real_tokens_freq]
index_to_word.append("unknown")
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [11]:
for i, sent in enumerate(Verb_Noun_Adjective_Alpha_in_text):
    tokenized_sentences[i] = [w if w in word_to_index else "unknown" for w in sent]

Make model


In [12]:
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [13]:
X_train[0]


Out[13]:
[0, 199, 9, 84, 98, 199]

In [13]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [14]:
def forward_propagation(self, x):

    T = len(x)
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    o = np.zeros((T, self.word_dim))

    for t in np.arange(T):
        s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

In [15]:
def predict(self, x):
   
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict
tokenized_sentences[0]


Out[15]:
['START', 'unknown', '진짜', '짜증', '나다', 'unknown', 'END']

In [16]:
np.random.seed(100)
model = RNNNumpy(vocabulary_size)
#for i in range(100):
o, s = model.forward_propagation(X_train[0])
print (o.shape)
print (s.shape)


(6, 200)
(7, 100)

In [17]:
X_train[0]


Out[17]:
[0, 199, 9, 86, 94, 199]

In [18]:
y_train[0]


Out[18]:
[199, 9, 86, 94, 199, 1]

In [19]:
predictions = model.predict(X_train[0])
print (predictions.shape)
print (predictions)


(6,)
[ 52  91 192  67  13 186]

In [20]:
def calculate_total_loss(self, x, y):
    L = 0
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [21]:
# Limit to 1000 examples to save time
print ("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print ("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))


Expected Loss for random predictions: 5.298317
Actual loss: 5.301008

In [22]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [23]:
def numpy_sdg_step(self, x, y, learning_rate):
    
    dLdU, dLdV, dLdW = self.bptt(x, y)
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sdg_step

In [24]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print ("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
        for i in range(len(y_train)):
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    print(model)

In [25]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)


1000 loops, best of 3: 968 µs per loop

In [26]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)


2017-01-20 14:52:31: Loss after num_examples_seen=0 epoch=0: 5.304216
2017-01-20 14:52:31: Loss after num_examples_seen=100 epoch=1: 3.442786
2017-01-20 14:52:31: Loss after num_examples_seen=200 epoch=2: 3.131067
2017-01-20 14:52:32: Loss after num_examples_seen=300 epoch=3: 3.017680
2017-01-20 14:52:32: Loss after num_examples_seen=400 epoch=4: 2.958618
2017-01-20 14:52:32: Loss after num_examples_seen=500 epoch=5: 2.930790
2017-01-20 14:52:33: Loss after num_examples_seen=600 epoch=6: 2.916276
2017-01-20 14:52:33: Loss after num_examples_seen=700 epoch=7: 2.905613
2017-01-20 14:52:33: Loss after num_examples_seen=800 epoch=8: 2.895897
2017-01-20 14:52:34: Loss after num_examples_seen=900 epoch=9: 2.886424
<__main__.RNNNumpy object at 0x7f7a9c25ac50>

In [27]:
from rnn_theano import RNNTheano, gradient_check_theano

In [28]:
from utils import load_model_parameters_theano, save_model_parameters_theano

model = RNNTheano(vocabulary_size, hidden_dim=100)
train_with_sgd(model, X_train, y_train, nepoch=50)

save_model_parameters_theano('./data/trained-model-sion_consider.npz', model)
load_model_parameters_theano('./data/trained-model-sion_consider.npz', model)


2017-01-20 14:52:41: Loss after num_examples_seen=0 epoch=0: 5.301648
2017-01-20 14:52:46: Loss after num_examples_seen=5000 epoch=5: 3.193938
2017-01-20 14:52:52: Loss after num_examples_seen=10000 epoch=10: 3.181974
2017-01-20 14:52:58: Loss after num_examples_seen=15000 epoch=15: 3.177310
2017-01-20 14:53:03: Loss after num_examples_seen=20000 epoch=20: 3.171975
2017-01-20 14:53:10: Loss after num_examples_seen=25000 epoch=25: 3.154365
2017-01-20 14:53:15: Loss after num_examples_seen=30000 epoch=30: 3.145856
2017-01-20 14:53:20: Loss after num_examples_seen=35000 epoch=35: 3.141625
2017-01-20 14:53:26: Loss after num_examples_seen=40000 epoch=40: 3.172117
2017-01-20 14:53:31: Loss after num_examples_seen=45000 epoch=45: 3.152063
<rnn_theano.RNNTheano object at 0x7f7add1e0710>
Saved model parameters to ./data/trained-model-sion_consider.npz.
Loaded model parameters from ./data/trained-model-sion_consider.npz. hidden_dim=100 word_dim=200

In [29]:
print(len(model.V.get_value()))


200

In [39]:
def generate_sentence(model):
    new_sentence = [word_to_index[sentence_start_token]]
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index["unknown"]
        while sampled_word == word_to_index["unknown"]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

num_sentences = 2
senten_min_length = 5

for i in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print (" ".join(sent))


점 감동 현실 왜 재밌다
재미없다 말 끝 어떻다 받다

In [ ]: