Table of Contents


In [1]:
import collections
import os
import sys
import codecs
import numpy as np
import tensorflow as tf

print tf.__version__

stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

start_token = '['
end_token = ']'

data_dir = '~/Documents/github/fastai-notes/deeplearning1/rnn/text/'
%cd $data_dir
filename = 'gucheng_compiled.txt'

poems = []
with codecs.open(filename, 'r', 'utf-8') as f:
    for line in f.readlines():
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            if len(content) <= 10:
                continue
            content = start_token + content + end_token
            poems.append(content)
        except ValueError as e:
            pass
%cd ..
poems = sorted(poems, key=lambda line: len(line))


1.3.0
/Users/yingchipei/Documents/github/fastai-notes/deeplearning1/rnn/text

In [2]:
print poems[0]


[连睡梦的路,都难以到达]

In [3]:
all_words = []
for poem in poems:
    all_words += [word for word in poem]
# 这里根据包含了每个字对应的频率
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

In [4]:
# 取前多少个常用字
words = words[:len(words)] + (' ',)
# 每个字映射为一个数字ID
word_int_map = dict(zip(words, range(len(words))))
word2idfunc = lambda word:  word_int_map.get(word,len(words))
poems_vector = [list(map(word2idfunc, poem)) for poem in poems]

In [5]:
print [ w for w in words[:5]]


[u'\uff0c', u'\u7684', u'\u5728', u'\u4e00', u'[']

In [6]:
#batch-wise padding:do padding to the same size(sequence length) of each batch
batch_size = 1
n_batch = (len(poems_vector)-1) // batch_size
X_data,Y_data = [],[]
for i in range(n_batch):
    cur_vecs = poems_vector[i*batch_size:(i+1)*batch_size]
    current_batch_max_length = max(map(len,cur_vecs))
    batch_matrix = np.full((batch_size,current_batch_max_length),word2idfunc(" "),np.int32)
    for j in range(batch_size):
        batch_matrix[j,:len(cur_vecs[j])] = cur_vecs[j]
    x = batch_matrix
    X_data.append(x)
    y = np.copy(x)
    y[:,:-1] = x[:,1:]
    # print x
    # print y
    Y_data.append(y)

In [7]:
#build rnn
tf.reset_default_graph()
vocab_size = len(words)+1
#input_size:(batch_size,feature_length)
input_sequences = tf.placeholder(tf.int32,shape=[batch_size,None])
output_sequences = tf.placeholder(tf.int32,shape=[batch_size,None])

In [8]:
def build_rnn(hidden_units=128, layers=2):
    # embedding
    with tf.variable_scope("embedding"):
        embedding = tf.get_variable("embedding", [vocab_size, hidden_units], dtype=tf.float32)
        input = tf.nn.embedding_lookup(embedding, input_sequences)
        
    basic_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, state_is_tuple=True)
    stack_cell = tf.nn.rnn_cell.MultiRNNCell([basic_cell]*layers)
    _initial_state = stack_cell.zero_state(batch_size, tf.float32)
    outputs, state = tf.nn.dynamic_rnn(stack_cell, input, initial_state=_initial_state, dtype=tf.float32)
    outputs = tf.reshape(outputs, [-1, hidden_units])
    
    with tf.variable_scope('softmax'):
        softmax_w = tf.get_variable('softmax_w', [hidden_units, vocab_size])
        softmax_b = tf.get_variable('softmax_b', [vocab_size])
        logits = tf.matmul(outputs, softmax_w)+softmax_b
        
    probs = tf.nn.softmax(logits)
    return logits, probs, stack_cell, _initial_state, state

In [9]:
def train(reload=True):
    logits, probs,_,_,_ = build_rnn()

    targets = tf.reshape(output_sequences,[-1])

    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [targets], 
        [tf.ones_like(targets, dtype=tf.float32)],len(words))
    cost = tf.reduce_mean(loss)

    learning_rate = tf.Variable(0.002, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    global_step = 0
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)

        if reload:
            module_file = tf.train.latest_checkpoint('.')
            sess = saver.restore(module_file)
            print "reload sess"

        for epoch in range(50):
            print "learning_rate decrease"
            if global_step%80==0:
                sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            epoch_steps =  len(zip(X_data,Y_data))
            for step,(x,y) in enumerate(zip(X_data,Y_data)):
                global_step = epoch * epoch_steps + step
                _, los  = sess.run([train_op, cost], feed_dict={
                    input_sequences:x,
                    output_sequences:y,
                    })
            if global_step//1000 >= 1:
                print "epoch:%d steps:%d/%d loss:%3f" % (epoch,step,epoch_steps,los)
                print "save model"
                saver.save(sess,"peotry",global_step=epoch)


def write_poem():

    def to_word(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1)*s))
        print "sample:",sample
        print "len Words:",len(words)
        return words[sample]

    logits, probs,stack_cell, _initial_state, last_state = build_rnn()
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
        module_file = tf.train.latest_checkpoint('model/')
        print "load:",module_file
        saver.restore(sess,module_file)

        _state = sess.run(stack_cell.zero_state(1,dtype=tf.float32))

        x = np.array([[word2idfunc('[')]])

        prob_, _state = sess.run([probs,last_state],feed_dict={input_sequences:x,_initial_state:_state})

        word = to_word(prob_)

        poem = ''

        import time
        while word != ']':
            poem += word
            x = np.array([[word2idfunc(word)]])
            [probs_, _state] = sess.run([probs, last_state], feed_dict={input_sequences: x, _initial_state: _state})
            word = to_word(probs_)
            # time.sleep(1)

    return poem

In [10]:
# print(write_poem())

In [11]:
# train(False)

In [12]:
print(write_poem())


WARNING:tensorflow:From /Users/yingchipei/anaconda3/envs/py27/lib/python2.7/site-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
load: ./peotry-49
INFO:tensorflow:Restoring parameters from ./peotry-49
sample: 73
len Words: 3047
sample: 21
len Words: 3047
sample: 54
len Words: 3047
sample: 17
len Words: 3047
sample: 893
len Words: 3047
sample: 556
len Words: 3047
sample: 10
len Words: 3047
sample: 793
len Words: 3047
sample: 865
len Words: 3047
sample: 0
len Words: 3047
sample: 165
len Words: 3047
sample: 147
len Words: 3047
sample: 2
len Words: 3047
sample: 38
len Words: 3047
sample: 103
len Words: 3047
sample: 195
len Words: 3047
sample: 15
len Words: 3047
sample: 310
len Words: 3047
sample: 55
len Words: 3047
sample: 0
len Words: 3047
sample: 378
len Words: 3047
sample: 214
len Words: 3047
sample: 70
len Words: 3047
sample: 308
len Words: 3047
sample: 176
len Words: 3047
sample: 405
len Words: 3047
sample: 7
len Words: 3047
sample: 0
len Words: 3047
sample: 554
len Words: 3047
sample: 562
len Words: 3047
sample: 520
len Words: 3047
sample: 255
len Words: 3047
sample: 0
len Words: 3047
sample: 239
len Words: 3047
sample: 6
len Words: 3047
sample: 1
len Words: 3047
sample: 1575
len Words: 3047
sample: 928
len Words: 3047
sample: 608
len Words: 3047
sample: 2186
len Words: 3047
sample: 285
len Words: 3047
sample: 55
len Words: 3047
sample: 1
len Words: 3047
sample: 533
len Words: 3047
sample: 0
len Words: 3047
sample: 131
len Words: 3047
sample: 20
len Words: 3047
sample: 1
len Words: 3047
sample: 99
len Words: 3047
sample: 2104
len Words: 3047
sample: 0
len Words: 3047
sample: 147
len Words: 3047
sample: 146
len Words: 3047
sample: 15
len Words: 3047
sample: 2121
len Words: 3047
sample: 0
len Words: 3047
sample: 738
len Words: 3047
sample: 39
len Words: 3047
sample: 7
len Words: 3047
sample: 655
len Words: 3047
sample: 179
len Words: 3047
sample: 0
len Words: 3047
sample: 25
len Words: 3047
sample: 90
len Words: 3047
sample: 1065
len Words: 3047
sample: 1987
len Words: 3047
sample: 1
len Words: 3047
sample: 2752
len Words: 3047
sample: 384
len Words: 3047
sample: 0
len Words: 3047
sample: 73
len Words: 3047
sample: 1538
len Words: 3047
sample: 68
len Words: 3047
sample: 133
len Words: 3047
sample: 66
len Words: 3047
sample: 320
len Words: 3047
sample: 203
len Words: 3047
sample: 248
len Words: 3047
sample: 13
len Words: 3047
sample: 1
len Words: 3047
sample: 1008
len Words: 3047
sample: 8
len Words: 3047
sample: 393
len Words: 3047
sample: 180
len Words: 3047
sample: 147
len Words: 3047
sample: 45
len Words: 3047
sample: 1
len Words: 3047
sample: 516
len Words: 3047
sample: 1006
len Words: 3047
sample: 847
len Words: 3047
sample: 484
len Words: 3047
sample: 0
len Words: 3047
sample: 823
len Words: 3047
sample: 68
len Words: 3047
sample: 70
len Words: 3047
sample: 1084
len Words: 3047
sample: 74
len Words: 3047
sample: 425
len Words: 3047
sample: 1
len Words: 3047
sample: 576
len Words: 3047
sample: 711
len Words: 3047
sample: 1
len Words: 3047
sample: 632
len Words: 3047
sample: 40
len Words: 3047
sample: 0
len Words: 3047
sample: 476
len Words: 3047
sample: 23
len Words: 3047
sample: 697
len Words: 3047
sample: 158
len Words: 3047
sample: 0
len Words: 3047
sample: 14
len Words: 3047
sample: 42
len Words: 3047
sample: 62
len Words: 3047
sample: 0
len Words: 3047
sample: 239
len Words: 3047
sample: 12
len Words: 3047
sample: 723
len Words: 3047
sample: 77
len Words: 3047
sample: 7
len Words: 3047
sample: 0
len Words: 3047
sample: 1238
len Words: 3047
sample: 72
len Words: 3047
sample: 350
len Words: 3047
sample: 169
len Words: 3047
sample: 9
len Words: 3047
sample: 369
len Words: 3047
sample: 1534
len Words: 3047
sample: 1
len Words: 3047
sample: 7
len Words: 3047
sample: 0
len Words: 3047
sample: 97
len Words: 3047
sample: 108
len Words: 3047
sample: 61
len Words: 3047
sample: 11
len Words: 3047
sample: 23
len Words: 3047
sample: 74
len Words: 3047
sample: 186
len Words: 3047
sample: 146
len Words: 3047
sample: 665
len Words: 3047
sample: 45
len Words: 3047
sample: 1
len Words: 3047
sample: 244
len Words: 3047
sample: 384
len Words: 3047
sample: 117
len Words: 3047
sample: 11
len Words: 3047
sample: 3
len Words: 3047
sample: 24
len Words: 3047
sample: 444
len Words: 3047
sample: 299
len Words: 3047
sample: 1
len Words: 3047
sample: 56
len Words: 3047
sample: 115
len Words: 3047
sample: 121
len Words: 3047
sample: 108
len Words: 3047
sample: 0
len Words: 3047
sample: 346
len Words: 3047
sample: 706
len Words: 3047
sample: 0
len Words: 3047
sample: 305
len Words: 3047
sample: 170
len Words: 3047
sample: 1388
len Words: 3047
sample: 119
len Words: 3047
sample: 1
len Words: 3047
sample: 341
len Words: 3047
sample: 1318
len Words: 3047
sample: 0
len Words: 3047
sample: 395
len Words: 3047
sample: 320
len Words: 3047
sample: 29
len Words: 3047
sample: 2283
len Words: 3047
sample: 403
len Words: 3047
sample: 1
len Words: 3047
sample: 78
len Words: 3047
sample: 954
len Words: 3047
sample: 0
len Words: 3047
sample: 415
len Words: 3047
sample: 255
len Words: 3047
sample: 5
len Words: 3047
被大海中覆盖着晴眠,蓝金在就明深地毛动,森林无法流耀了,拉越乱失,当我的丈夫工委转动的街,很小的头颅,金黄地柄,旧开了桥气,它用规律的砧层,被堵出最阳留给墙上的盐。万对金色的破示严净,溅出无惑空雾的非厚的目光,秋天即呵,你只说,当不脱样了,初心般所有形态的了,从笑也是天空细黄颜色的波层;是一个奇国的风!微笑,全部,终于戴鸟的游戏,蜜留像锐快的长猪,消失

In [ ]: