In [82]:
import tensorflow as tf
import numpy as np
from collections import Counter
from tensorflow.contrib import seq2seq
In [53]:
# Sanity check: this notebook targets TF 1.x (it uses tf.contrib APIs below).
print('TensorFlow Version: {}'.format(tf.__version__))
In [2]:
# Load the raw training corpus and lower-case it so the vocabulary is
# case-insensitive. `with` guarantees the file handle is closed (the
# original opened it and left it open for the kernel's lifetime).
with open('corpus.txt', mode='r', encoding="utf8") as corpus_file:
    corpus = corpus_file.read()
corpus = corpus.lower()
print(len(corpus))
In [3]:
# Distinct characters in the corpus — a quick inventory check for any
# punctuation that still needs a token in dict_punctuation below.
vocab_char = set(corpus)
print(vocab_char)
In [4]:
# Replace punctuation with named word-tokens so split(' ') treats each
# punctuation mark as its own "word". Order matters: '--' must be replaced
# before '-' or a double dash becomes two ||Dash|| tokens, so we iterate an
# explicit ordered list instead of relying on dict iteration order (only
# guaranteed to be insertion order on Python 3.7+).
punctuation_tokens = [
    ('.', ' ||Period|| '),
    (',', ' ||Comma|| '),
    ('"', ' ||Quotation_Mark|| '),
    (';', ' ||Semicolon|| '),
    ('!', ' ||Exclamation_Mark|| '),
    ('?', ' ||Question_Mark|| '),
    ('(', ' ||Left_Parenthesis|| '),
    (')', ' ||Right_Parenthesis|| '),
    ('--', ' ||Double_Dash|| '),
    ('-', ' ||Dash|| '),
    ('_', ' ||Underscore|| '),
    ('*', ' ||Star|| '),
    ('\n', ' ||Return|| '),
    ('’', ' ||Left_Quote|| '),
    ('“', ' ||Right_Quotation|| '),
    ('”', ' ||Left_Quotation|| '),
    ('‘', ' ||Right_Quote|| '),
]
dict_punctuation = dict(punctuation_tokens)  # kept for compatibility with later lookups
for key, token in punctuation_tokens:
    corpus = corpus.replace(key, token)
word_corpus = corpus.split(' ')
print(word_corpus[1:15])
print(len(word_corpus))
In [5]:
# Build the word-level vocabulary and the two lookup tables used to move
# between words and integer ids, then encode the whole corpus as id lists.
vocab = set(word_corpus)
num_classes = len(vocab)
print(num_classes)
vocab_to_int = dict((word, idx) for idx, word in enumerate(vocab))
int_to_vocab = dict(enumerate(vocab))
# Round-trip sanity check on a single token.
print(int_to_vocab.get(vocab_to_int.get('||Period||')))
encoded = [vocab_to_int.get(word) for word in word_corpus]
print(encoded[1:10])
print(len(encoded))
In [6]:
# Sequence length: each training sample is `steps` consecutive word ids.
steps = 50
In [7]:
# Build sliding-window training pairs: each sample is `steps` consecutive
# word ids and its target is the single id that follows the window.
X = []
y = []
for i in range(0, len(encoded) - steps, 1):
    X.append(encoded[i : i + steps])
    y.append(encoded[i + steps])
X = np.reshape(X, (len(X), steps))
# Keep the ids as integers: downstream they are fed into an int32
# placeholder and tf.nn.embedding_lookup. (The original divided X by
# num_classes — a char-RNN normalization trick — which would truncate
# every id to 0 once cast to int32 by the placeholder.)
X_train = X
y_train = np.eye(num_classes)[y]  # one-hot encode the targets
print(X_train.shape)
print(y_train.shape)
In [19]:
# Hyperparameters.
dropout = 1          # NOTE(review): appears unused — keep_prob below is what the graph reads; confirm
epochs = 250
batch_size = 1024
embed_dim = 256      # width of each word-embedding vector
learning_rate = 0.2  # NOTE(review): very high for RNN training; also shadowed by the placeholder of the same name in the graph cell — confirm
rnn_size = 512       # LSTM units per layer
keep_prob = 1        # dropout keep probability (1 = no dropout)
lstm_layers = 1      # number of stacked LSTM layers
In [36]:
# Slice X_train into fixed-size mini-batches, dropping the tail remainder
# that does not fill a complete batch. (Note: y_train is not batched here.)
batch_count = X_train.shape[0] // batch_size
inputs_batches = []
for batch_idx in range(batch_count):
    start = batch_idx * batch_size
    inputs_batches.append(X_train[start : start + batch_size, ])
inputs_batches = np.array(inputs_batches)
print(inputs_batches.shape)
In [67]:
def get_inputs():
    """Create the graph's feed placeholders.

    Returns:
        (inputs, targets, learning_rate): two int32 [batch, seq]
        placeholders for input/target word ids, and a scalar float32
        learning-rate placeholder.
    """
    input_ph = tf.placeholder(tf.int32, [None, None], name = 'inputs')
    target_ph = tf.placeholder(tf.int32, [None, None], name = 'targets')
    lr_ph = tf.placeholder(tf.float32, name = 'learning_rate')
    return input_ph, target_ph, lr_ph
In [70]:
def get_embed(inputs, num_classes, embed_dim):
    """Map integer word ids to dense trainable embedding vectors.

    Args:
        inputs: int tensor of word ids, presumably [batch, seq] — TODO confirm.
        num_classes: vocabulary size (number of embedding rows).
        embed_dim: width of each embedding vector.

    Returns:
        Float tensor with a trailing embed_dim axis.
    """
    initial = tf.random_uniform((num_classes, embed_dim), -1, 1)
    embed_vec = tf.Variable(initial, name='embed_vec')
    return tf.nn.embedding_lookup(embed_vec, inputs)
In [77]:
def get_init_cell(batch_size, rnn_size):
    """Build a (possibly multi-layer) LSTM cell and its zero initial state.

    Args:
        batch_size: int or scalar tensor batch size for the zero state.
        rnn_size: number of units in each LSTM layer.

    Returns:
        (cell, cell_state): the MultiRNNCell and its zero state, exposed
        under the name 'cell_state' so it can be fetched from a loaded graph.

    Note: reads the module-level `keep_prob` and `lstm_layers` hyperparameters.
    """
    def make_cell():
        # Each layer must be a *distinct* cell object. The original used
        # [drop] * lstm_layers, which reuses one object across layers —
        # the layers then share weights/state, and newer TF versions
        # reject it outright.
        lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size, state_is_tuple=True)
        return tf.contrib.rnn.DropoutWrapper(cell=lstm, output_keep_prob=keep_prob)

    cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(lstm_layers)])
    cell_state = cell.zero_state(batch_size, tf.float32)
    cell_state = tf.identity(cell_state, name = 'cell_state')
    return cell, cell_state
In [78]:
def build_rnn(cell, embedding):
    """Run the RNN cell over an embedded input sequence.

    Args:
        cell: RNN cell (e.g. from get_init_cell).
        embedding: float tensor of embedded inputs.

    Returns:
        (outputs, final_state): per-step outputs and the final state,
        the latter named 'final_state' for graph-loading lookups.
    """
    outputs, state = tf.nn.dynamic_rnn(cell, embedding, dtype=tf.float32)
    named_state = tf.identity(state, name='final_state')
    return outputs, named_state
In [79]:
def build_nn(cell, rnn_size, input_data, num_classes):
    """Embed input ids, run the RNN, and project outputs to vocabulary logits.

    Args:
        cell: RNN cell from get_init_cell.
        rnn_size: kept for interface compatibility (unused; the cell
            already fixes the hidden size).
        input_data: int tensor of word ids.
        num_classes: vocabulary size / logit width.

    Returns:
        (logits, final_state): unnormalized logits over the vocabulary
        and the RNN's final state.
    """
    embed = get_embed(input_data, num_classes, embed_dim)
    outputs, final_state = build_rnn(cell, embed)
    # activation_fn=None is essential: fully_connected defaults to ReLU,
    # which would clamp negative logits to 0 before softmax/sequence_loss.
    logits = tf.contrib.layers.fully_connected(
        outputs, num_classes,
        activation_fn=None,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1))
    return logits, final_state
In [88]:
# Assemble the training graph: placeholders -> embedding/RNN -> logits -> loss.
train_graph = tf.Graph()
with train_graph.as_default():
    inputs, targets, learning_rate = get_inputs()  # NOTE(review): shadows the 0.2 hyperparameter
    inputs_shape = tf.shape(inputs)  # dynamic [batch, seq]; static dims are None
    cell, init_state = get_init_cell(inputs_shape[0], rnn_size)
    outputs, final_state = build_nn(cell, rnn_size, inputs, num_classes)
    probs = tf.nn.softmax(outputs, name = 'probs')
    # Use the *dynamic* shape for the loss weights: inputs.shape is
    # statically (None, None), so tf.ones([inputs.shape[0], inputs.shape[1]])
    # cannot be constructed. inputs_shape (computed above) works at run time.
    cost = seq2seq.sequence_loss(
        outputs, targets,
        tf.ones([inputs_shape[0], inputs_shape[1]], dtype=tf.float32))
In [ ]: