Sample Hangul RNN


In [1]:
# -*- coding: utf-8 -*-
# Import Packages
import numpy as np
import tensorflow as tf
import collections
import string
import argparse
import time
import os
from six.moves import cPickle
from TextLoader import *
from Hangulpy import *
print ("Packages Imported")


Packages Imported

Load dataset using TextLoader


In [2]:
corpus_name = "invisible_dragon" # "nine_dreams"
# corpus_name = "nine_dreams"

data_dir    = "data/" + corpus_name
batch_size  = 10
seq_length  = 100
data_loader = TextLoader(data_dir, batch_size, seq_length)
# This makes "vocab.pkl" and "data.npy" in "data/nine_dreams"   
#  from "data/nine_dreams/input.txt" 
vocab_size = data_loader.vocab_size
vocab = data_loader.vocab
chars = data_loader.chars
print ( "type of 'data_loader' is %s, length is %d" 
       % (type(data_loader.vocab), len(data_loader.vocab)) )
print ( "\n" )
print ("data_loader.vocab looks like \n%s " %
       (data_loader.vocab))
print ( "\n" )
print ( "type of 'data_loader.chars' is %s, length is %d" 
       % (type(data_loader.chars), len(data_loader.chars)) )
print ( "\n" )
print ("data_loader.chars looks like \n%s " % (data_loader.chars,))


loading preprocessed files
type of 'data_loader' is <type 'dict'>, length is 84


data_loader.vocab looks like 
{u'_': 81, u'6': 63, u'|': 83, u'\n': 6, u'\r': 7, u',': 74, u'x': 75, u';': 73, u'[': 71, u'\u3144': 60, u'!': 28, u' ': 2, u'#': 68, u'"': 34, u'\u1d25': 0, u"'": 65, u')': 50, u'(': 51, u'+': 76, u'*': 82, u']': 72, u'\u3133': 58, u'/': 45, u'.': 24, u'\u3131': 5, u'0': 27, u'3': 54, u'2': 36, u'5': 61, u'\u3134': 4, u'\u3137': 11, u'\u3136': 49, u'\u3139': 8, u'\u3138': 31, u'\u3156': 52, u':': 40, u'\u313c': 67, u'?': 44, u'4': 59, u'\u3141': 14, u'\u3140': 77, u'\u3143': 57, u'\u3142': 21, u'\u3145': 15, u'7': 47, u'\u3147': 1, u'\u3146': 22, u'\u3149': 38, u'\u3148': 16, u'\u314b': 26, u'\u314a': 30, u'\u314d': 33, u'\u314c': 25, u'\u314f': 3, u'\u314e': 19, u'\u3151': 32, u'\u3150': 18, u'\u3153': 13, u'\u3152': 69, u'\u3155': 20, u'\u3154': 23, u'\u3157': 12, u'8': 46, u'\u3159': 62, u'\u3158': 41, u'\u315b': 29, u'\u315a': 53, u'\u315d': 48, u'\u315c': 17, u'\u315f': 39, u'^': 64, u'\u3161': 10, u'\u3160': 55, u'\u3163': 9, u'\u3162': 43, u'k': 78, u'9': 35, u'\u313a': 70, u'1': 56, u'\u3132': 42, u'%': 66, u'}': 80, u'<': 79, u'~': 37} 


type of 'data_loader.chars' is <type 'tuple'>, length is 84


data_loader.chars looks like 
(u'\u1d25', u'\u3147', u' ', u'\u314f', u'\u3134', u'\u3131', u'\n', u'\r', u'\u3139', u'\u3163', u'\u3161', u'\u3137', u'\u3157', u'\u3153', u'\u3141', u'\u3145', u'\u3148', u'\u315c', u'\u3150', u'\u314e', u'\u3155', u'\u3142', u'\u3146', u'\u3154', u'.', u'\u314c', u'\u314b', u'0', u'!', u'\u315b', u'\u314a', u'\u3138', u'\u3151', u'\u314d', u'"', u'9', u'2', u'~', u'\u3149', u'\u315f', u':', u'\u3158', u'\u3132', u'\u3162', u'?', u'/', u'8', u'7', u'\u315d', u'\u3136', u')', u'(', u'\u3156', u'\u315a', u'3', u'\u3160', u'1', u'\u3143', u'\u3133', u'4', u'\u3144', u'5', u'\u3159', u'6', u'^', u"'", u'%', u'\u313c', u'#', u'\u3152', u'\u313a', u'[', u']', u';', u',', u'x', u'+', u'\u3140', u'k', u'<', u'}', u'_', u'*', u'|') 

Define Network


In [3]:
rnn_size   = 128
num_layers = 2
grad_clip  = 5.

_batch_size = 1
_seq_length = 1

vocab_size = data_loader.vocab_size

with tf.device("/cpu:0"):
    # Select RNN Cell
    def unit_cell():
        return tf.contrib.rnn.BasicLSTMCell(rnn_size,state_is_tuple=True,reuse=tf.get_variable_scope().reuse)
    cell = tf.contrib.rnn.MultiRNNCell([unit_cell() for _ in range(num_layers)])

    # Set paths to the graph 
    input_data = tf.placeholder(tf.int32, [_batch_size, _seq_length])
    targets    = tf.placeholder(tf.int32, [_batch_size, _seq_length])
    initial_state = cell.zero_state(_batch_size, tf.float32)

    # Set Network
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
            inputs = tf.split(tf.nn.embedding_lookup(embedding, input_data), _seq_length, 1)
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
            
    # Loop function for seq2seq
    def loop(prev, _):
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)
    # Output of RNN 
    outputs, last_state = tf.contrib.rnn.static_rnn(cell,inputs, initial_state
                                , scope='rnnlm')
    output = tf.reshape(tf.concat(outputs,1), [-1, rnn_size])
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    # Next word probability 
    probs = tf.nn.softmax(logits)
    final_state = last_state

print ("Network Ready")


Network Ready

In [4]:
# Sample ! 
def sample( sess, chars, vocab, __probs, num=200, prime=u'ㅇㅗᴥㄴㅡㄹᴥ '):
    state = sess.run(cell.zero_state(1, tf.float32))
    _probs = __probs
    prime = list(prime)
    for char in prime[:-1]:
        x = np.zeros((1, 1))
        x[0, 0] = vocab[char]
        feed = {input_data: x, initial_state:state}
        [state] = sess.run([last_state], feed)

    def weighted_pick(weights):
        weights = weights / np.sum(weights) 
        t = np.cumsum(weights)
        s = np.sum(weights)
        return(int(np.searchsorted(t, np.random.rand(1)*s)))

    ret = prime
    char = prime[-1]
    for n in range(num):
        x = np.zeros((1, 1))
        x[0, 0] = vocab[char]
        feed = {input_data: x, initial_state:state}
        [_probsval, state] = sess.run([_probs, final_state], feed)
        p = _probsval[0]
        sample = int(np.random.choice(len(p), p=p))
        #sample = weighted_pick(p)
        pred = chars[sample]
        ret += pred
        char = pred
    return ret
print ("sampling function done.")


sampling function done.

Sample


In [7]:
save_dir = 'data/' + corpus_name
prime = decompose_text(u" ")

print ("Prime Text : %s => %s" % (automata(prime), "".join(prime)))
n = 4000

sess = tf.Session()
sess.run(tf.initialize_all_variables())
saver = tf.train.Saver(tf.all_variables())
ckpt = tf.train.get_checkpoint_state(save_dir)

# load_name = u'data/nine_dreams/model.ckpt-0'
load_name = os.path.join(save_dir,'model.ckpt-20000')

print (load_name)

if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, load_name)
    sampled_text = sample(sess, chars, vocab, probs, n, prime)
    #print ("")
#     print (u"SAMPLED TEXT = %s" % sampled_text)
#     print ("")
    print ("-- RESULT --")
    print (automata("".join(sampled_text)))


Prime Text :   =>  
WARNING:tensorflow:From /home/yj/.virtualenvs/lecture/local/lib/python2.7/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
WARNING:tensorflow:From <ipython-input-7-7917c18aafae>:9: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
data/invisible_dragon/model.ckpt-20000
INFO:tensorflow:Restoring parameters from data/invisible_dragon/model.ckpt-20000
-- RESULT --
 거가 비생업니곡도 채려 픽시만 투명드래곤은
사실을 입을새분명펏하지?

 

그랬다... 투명드래곤은 투명햇다 굖굖

 

 

 

콜밥:헉! 깨티는거 보면서 뒤크.... 넌빌대니
이정돈드 아쳐움에슬7?!"

 

기백명:죽마니빠이어. 윽사도정있었다 그때였다... 콜바샘으로불샤한대 굨굨굨굨"

투명드래곤은 그때였다

 

투명드래곤은 놀밥니다 진자 굖굖

 

투명드래곤은 하도마니까는 자살이 아니?

 

뒤크는 날 전부터나 힘들은 ㄱㅇ려하구죽마르터느는 투명드래곤이 처제이려햇던 그때 조갈기 때문이었던것 이엇다
하지만 투명드래곤의 초최강
선튀ㅇㄴ..................

 

하들어만 나삼으로 변주를 이어웠다.

 

투명드래곤:왜!!

 

사신하시라 그래서 변그래서 막투명하고강그래서
안막건그몰라~~~

 

미우자고 잘사이게 군주가눈애 아녀서떠니가한대다 이대려!! 허마하몄어. 어떠나 감박 다음바로 안다
뒤크는 만데코버로버렸다



 

???????????8셔:자
투명드래곤은 마음을때였다 그래서 투명드래곤은 또 사랑!

 

하지만 아직도락해봐그 힘들어 그었따
그르라져전이려햇다


꼐속


(헐 말한전테 하고 나 떨어져선에 2%00개나에겡한대 투명드래곤은졸라 그상이마삼으라서
모이 부몄는 자기라는 사실을...

 


콜밥:자 이번엔 전투!!!
하하하 닌드레고 나는 공가어 마시몰라와???????"
늠니
여기 못시바?!! 

 

 

 

15 (2002/07/27)
님들아 이재사장을 1만려?

 내보이다은 내가 하다시자 왜냐하면

꼐속


5 (2002/07/25)
사람으로변해고므럼 조기마는건데?! 넌 창조자는 상처야첨욚아!!

투명드래곤의 눈알빔하네고 그랬지

 

투명드래곤은 너무 막저기술어버렸다
투명드래곤은 개밥사실

 

빙실 써야
머리하던 나자 뒤크 을지도년다 굓괴그 만 아니가 딱공기글이없었다

 

뒤크:으하하! 나도 이제우져! 99ㅁ2ㅇ는
잰나

 

투명드래곤:누구는 날아라 하하"

 

그러자 가지?"

 

뒤크:빠에역! 씨발... 주거써 썅"

 

그때말했엇기 명즐로기셧냐 한주요 너무냐.

 

콜밥:오 알았어요! 20래 정금이'.. 나무리!!"

 

투명드래곤은 투명드래곤이 짱이였다 왜냐하면 그다개굖그검들글얼텁했엇던 것이다... 굖굖굖
궁격공그래서 투명드래곤은다으모새어선 말있따
너마나 인간이랑 오버로드만몰랏지마다마니한 전투여버릿던 것이엇다
하지만 투명드래곤은 존나쌔서
피겟엇다 하지만
뒤크는그래슨 마음은막맛이쳤다

 

"쿠아여야!! 어떠나 뒤크은 날아라 으도만죽으로 뒤크엿다 진짜 잼박만데 그때였다
콜밥을 조해서무러자다업청쓸 플퍼났다
콜밥이 알어자? 아 크고
냄은
뒤크에신준아 가였다!!

 

샹아:알았어요 히씬이이이이이이이이이이이이이이이이이이여!!! 보이지도 안쳤다 너신이 니도 9909999999998
주ㅇㄴ아어딨냐 한마디망흔빙든 음!써지는거다!

 

투투명드래곤도 존나쌧네' 하지만 투명드래곤에겐
슴릴잣다

 

5008ㄱ케가 업을대일꺼가 나 콜밥이 다 내가다면것이다

"푸츰쌍흥했는데 그러니 끝이다! 오캐요하하하하하"

투명드래곤은 너무짐했을지도 

It takes long time to train!


In [ ]: