In [1]:
import collections
import os
import sys
import codecs
import numpy as np
import tensorflow as tf
print tf.__version__
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout
start_token = '['
end_token = ']'
data_dir = '~/Documents/github/fastai-notes/deeplearning1/rnn/text/'
%cd $data_dir
filename = 'gucheng_compiled.txt'
poems = []
with codecs.open(filename, 'r', 'utf-8') as f:
    for line in f.readlines():
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            if len(content) <= 10:
                continue
            content = start_token + content + end_token
            poems.append(content)
        except ValueError as e:
            pass
%cd ..
poems = sorted(poems, key=lambda line: len(line))
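Each line of gucheng_compiled.txt is expected in a title:content form; a hypothetical line would be parsed like this (a small sketch mirroring the loop above, with made-up input):
sample_line = u'一代人:黑夜给了我黑色的眼睛,我却用它寻找光明\n'  # hypothetical input line
title, content = sample_line.strip().split(':')
print start_token + content.replace(' ', '') + end_token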
In [2]:
print poems[0]
In [3]:
all_words = []
for poem in poems:
    all_words += [word for word in poem]
# counter holds the frequency of each character
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)
In [4]:
# Keep the most frequent characters (here the whole vocabulary) and append a blank used for padding
words = words[:len(words)] + (' ',)
# Map each character to an integer ID
word_int_map = dict(zip(words, range(len(words))))
# Characters not in the map fall back to the out-of-vocabulary id len(words)
word2idfunc = lambda word: word_int_map.get(word, len(words))
poems_vector = [list(map(word2idfunc, poem)) for poem in poems]
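As a quick sanity check of the mapping (a minimal sketch; it assumes 'A' does not occur in the corpus, so it falls through to the out-of-vocabulary id):
print word2idfunc(poems[0][1])          # id of the first character after the '[' start token
print word2idfunc(u'A'), len(words)     # unseen character maps to len(words)
print word2idfunc(' '), len(words) - 1  # the appended blank is the last in-vocabulary id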
In [5]:
print [ w for w in words[:5]]
In [6]:
# Batch-wise padding: pad every poem in a batch to the length of the longest poem in that batch
batch_size = 1
n_batch = (len(poems_vector) - 1) // batch_size
X_data, Y_data = [], []
for i in range(n_batch):
    cur_vecs = poems_vector[i*batch_size:(i+1)*batch_size]
    current_batch_max_length = max(map(len, cur_vecs))
    batch_matrix = np.full((batch_size, current_batch_max_length), word2idfunc(" "), np.int32)
    for j in range(batch_size):
        batch_matrix[j, :len(cur_vecs[j])] = cur_vecs[j]
    x = batch_matrix
    X_data.append(x)
    # Targets are the inputs shifted left by one character (next-character prediction)
    y = np.copy(x)
    y[:, :-1] = x[:, 1:]
    Y_data.append(y)
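A quick look at one padded batch (a small sketch over the arrays built above): each target row is its input row shifted left by one position, which is exactly the next-character objective the loss later trains for.
x0, y0 = X_data[0], Y_data[0]
print x0.shape                           # (batch_size, length of the longest poem in this batch)
print (y0[:, :-1] == x0[:, 1:]).all()    # True: targets are the inputs shifted by one step
print ''.join(words[i] for i in x0[0])   # decode the first row back into text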
In [7]:
# Build the RNN graph
tf.reset_default_graph()
vocab_size = len(words) + 1
# Input/output placeholders of shape (batch_size, sequence_length)
input_sequences = tf.placeholder(tf.int32, shape=[batch_size, None])
output_sequences = tf.placeholder(tf.int32, shape=[batch_size, None])
In [8]:
def build_rnn(hidden_units=128, layers=2):
    # Embed character ids into dense vectors
    with tf.variable_scope("embedding"):
        embedding = tf.get_variable("embedding", [vocab_size, hidden_units], dtype=tf.float32)
        input = tf.nn.embedding_lookup(embedding, input_sequences)
    # Stack LSTM layers; build a separate cell instance for each layer
    stack_cell = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.BasicLSTMCell(hidden_units, state_is_tuple=True) for _ in range(layers)])
    _initial_state = stack_cell.zero_state(batch_size, tf.float32)
    outputs, state = tf.nn.dynamic_rnn(stack_cell, input, initial_state=_initial_state, dtype=tf.float32)
    # Flatten to (batch_size * time_steps, hidden_units) so a single softmax covers every position
    outputs = tf.reshape(outputs, [-1, hidden_units])
    with tf.variable_scope('softmax'):
        softmax_w = tf.get_variable('softmax_w', [hidden_units, vocab_size])
        softmax_b = tf.get_variable('softmax_b', [vocab_size])
    logits = tf.matmul(outputs, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, probs, stack_cell, _initial_state, state
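tf.nn.embedding_lookup in build_rnn simply gathers rows of the embedding matrix by character id; the same operation in plain numpy, with a hypothetical 4 x 3 table purely for illustration:
emb = np.arange(12, dtype=np.float32).reshape(4, 3)  # hypothetical embedding table: vocab 4, hidden 3
ids = np.array([[2, 0, 1]])                          # one batch of one sequence with three character ids
print emb[ids].shape                                 # (1, 3, 3) = (batch, time, hidden), what dynamic_rnn expects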
In [9]:
def train(reload=True):
    logits, probs, _, _, _ = build_rnn()
    targets = tf.reshape(output_sequences, [-1])
    # Average cross-entropy over every position, weighting each position equally
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits], [targets], [tf.ones_like(targets, dtype=tf.float32)])
    cost = tf.reduce_mean(loss)
    learning_rate = tf.Variable(0.002, trainable=False)
    tvars = tf.trainable_variables()
    # Clip the joint gradient norm to 5 to keep the LSTM from exploding on long sequences
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))
    global_step = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
        if reload:
            module_file = tf.train.latest_checkpoint('.')
            saver.restore(sess, module_file)
            print "reload sess"
        for epoch in range(50):
            if global_step % 80 == 0:
                print "learning_rate decrease"
                sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            epoch_steps = len(zip(X_data, Y_data))
            for step, (x, y) in enumerate(zip(X_data, Y_data)):
                global_step = epoch * epoch_steps + step
                _, los = sess.run([train_op, cost], feed_dict={
                    input_sequences: x,
                    output_sequences: y,
                })
                if global_step % 1000 == 0:
                    print "epoch:%d steps:%d/%d loss:%.3f" % (epoch, step, epoch_steps, los)
            print "save model"
            # Note: training saves to the working directory with prefix "poetry", while
            # write_poem() below restores from 'model/'; keep the two locations in sync.
            saver.save(sess, "poetry", global_step=epoch)
def write_poem():
    def to_word(weights):
        # Sample one character id in proportion to the predicted probabilities
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1) * s))
        # probs has vocab_size = len(words) + 1 entries; clamp the rare draw of the
        # out-of-vocabulary slot so the index stays inside the words tuple
        return words[min(sample, len(words) - 1)]
    logits, probs, stack_cell, _initial_state, last_state = build_rnn()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
        module_file = tf.train.latest_checkpoint('model/')
        print "load:", module_file
        saver.restore(sess, module_file)
        # Start from the '[' token and keep sampling until the ']' end token is produced
        _state = sess.run(stack_cell.zero_state(1, dtype=tf.float32))
        x = np.array([[word2idfunc('[')]])
        probs_, _state = sess.run([probs, last_state],
                                  feed_dict={input_sequences: x, _initial_state: _state})
        word = to_word(probs_)
        poem = ''
        while word != ']':
            poem += word
            x = np.array([[word2idfunc(word)]])
            probs_, _state = sess.run([probs, last_state],
                                      feed_dict={input_sequences: x, _initial_state: _state})
            word = to_word(probs_)
        return poem
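The to_word helper samples a character index in proportion to the softmax probabilities by inverting the cumulative distribution. The same rule in isolation, with hypothetical weights rather than model output:
weights = np.array([0.1, 0.6, 0.3])      # hypothetical probabilities
t = np.cumsum(weights)
s = np.sum(weights)
draws = [int(np.searchsorted(t, np.random.rand(1) * s)) for _ in range(10000)]
print [draws.count(i) / 10000.0 for i in range(3)]   # roughly [0.1, 0.6, 0.3]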
In [10]:
# print(write_poem())
In [11]:
# train(False)
In [12]:
print(write_poem())
In [ ]: