In [1]:
import tensorflow as tf
import numpy as np
from collections import Counter
In [2]:
import string
f = open("TRAIN_FILE.TXT", 'r')
txt1 = f.read().translate(str.maketrans("\t\r", "  "))  # replace tabs and carriage returns with spaces
txt1 = txt1.lower()
txt = txt1.split('\n')
sentence_corpora = []
sentence_labels = []
words = []
# each training record spans 4 lines: tagged sentence, relation label, comment, blank line
for i in range(0, 31984, 4):
    txt[i] = txt[i].lstrip('0123456789')  # drop the leading sentence id
    txt[i] = txt[i].replace('\"', '')
    txt[i] = txt[i].replace('.', '')
    at = str(txt[i].strip())
    for elem in at.split(" "):
        words.append(elem.replace("<e1>", "").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    sentence_corpora.append(txt[i].strip().replace("<e1>", "").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    sentence_labels.append(txt[i+1].strip().replace("(e1,e2)", "").replace("(e2,e1)", ""))
print(sentence_corpora[0:10])
print("\n")
print(sentence_labels[:10])
print("\n")
words = list(set(words))
print(words[0:10])
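For reference, the parsing above assumes the SemEval-2010 Task 8 record layout in TRAIN_FILE.TXT, which is why the loop steps through the file four lines at a time; the cell below only illustrates that assumed format with made-up data.
In [ ]:
# illustrative (made-up) record; the real TRAIN_FILE.TXT is assumed to follow
# the SemEval-2010 Task 8 layout: tagged sentence, relation, comment, blank line
example_record = (
    '1\t"the <e1>box</e1> was full of <e2>apples</e2>."\n'
    'content-container(e1,e2)\n'
    'comment:\n'
    '\n'
)
print(example_record)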
In [3]:
counts = Counter(words)  # words was de-duplicated above, so every count is 1; the sort only fixes an order
vocab = sorted(counts, key=counts.get, reverse=True)
# word ids start at 1; index 0 is reserved for padding
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
print(len(vocab_to_int))
#print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(22947)]) # Prints george
sentence_int = []
for sentence in sentence_corpora:
    sentence_int.append([vocab_to_int[word] for word in sentence.split()])
print(sentence_int[0])
print(len(sentence_int[0]))
print(len(sentence_int))
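As a quick sanity check (not part of the original notebook), inverting vocab_to_int and decoding the first encoded sentence should reproduce sentence_corpora[0].
In [ ]:
# sanity check: decode the first encoded sentence back to words
int_to_vocab = {ii: word for word, ii in vocab_to_int.items()}
print(" ".join(int_to_vocab[ii] for ii in sentence_int[0]))
print(sentence_corpora[0])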
In [4]:
#Setting Label values for Softmax Classifier
label_dict = {"cause-effect": 0,
              "instrument-agency": 1,
              "product-producer": 2,
              "content-container": 3,
              "entity-origin": 4,
              "entity-destination": 5,
              "component-whole": 6,
              "member-collection": 7,
              "message-topic": 8,
              "other": 9}
final_labels = []
for elem in sentence_labels:
    final_labels.append(label_dict[elem])
#print(len(final_labels))
#final_labels = np.array(final_labels)
#print(final_labels[:10])
In [5]:
# Pad/truncate every sentence to a fixed length (seq_len);
# shorter sentences are left-padded with 0
review_lens = Counter([len(x) for x in sentence_int])
print("Maximum sentence length: {}".format(max(review_lens)))
seq_len = 85
features = np.zeros((len(sentence_int), seq_len), dtype=int)
for i, row in enumerate(sentence_int):
    features[i, -len(row):] = np.array(row)[:seq_len]
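A small check of the padding scheme (illustrative only): a toy row ends up right-aligned with leading zeros, just like the real feature rows.
In [ ]:
# toy demonstration of the left-padding used above
toy = np.zeros((1, 6), dtype=int)
toy[0, -len([5, 8, 2]):] = np.array([5, 8, 2])[:6]
print(toy)          # [[0 0 0 5 8 2]]
print(features[0])  # first real sentence, left-padded to seq_len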
In [6]:
'''
seq_len = 85
features = np.ndarray((len(sentence_int), seq_len), dtype=int)
for i, row in enumerate(sentence_int):
    features[i, -len(row):] = np.array(row)[:seq_len]
for i in range(0, 7996):
    for j in range(0, 85):
        if features[i, j] == 0:
            features[i, j] = -1
'''
Out[6]:
In [7]:
print(np.shape(features))
features[0:5,0:85]
Out[7]:
In [8]:
# Hyperparams
lstm_size = 256
lstm_layers = 2
batch_size = 100       # sentences per mini-batch (must not exceed the validation split, or get_batches yields nothing)
learning_rate = 0.003
In [9]:
# train / validation / test split
split_frac = 0.9
split_idx = int(len(features)*split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = final_labels[:split_idx], final_labels[split_idx:]
test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
"\nValidation set: \t{}".format(val_x.shape),
"\nTest set: \t\t{}".format(test_x.shape))
In [10]:
n_words = len(vocab_to_int) + 1  # +1 because word ids start at 1 and index 0 is the padding token
#print(n_words)
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None], name='labels')  # integer class ids; one-hot encoded in the loss
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
In [11]:
embed_size = 256
with graph.as_default():
    # word-embedding lookup table, initialised uniformly in [-1, 1]
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
In [12]:
# Main network: stacked LSTM with dropout on the outputs
with graph.as_default():
    def lstm_cell():
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)
In [13]:
# RNN forward pass
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
In [14]:
with graph.as_default():
    # classify from the last LSTM output; softmax cross-entropy against the integer labels
    logits = tf.contrib.layers.fully_connected(outputs[:, -1], 10, activation_fn=None)
    cost = tf.losses.softmax_cross_entropy(onehot_labels=tf.one_hot(labels_, 10), logits=logits)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
In [25]:
# Validation accuracy: arg-max class vs. the integer labels
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
In [26]:
# Batching: yield full mini-batches, dropping any remainder
def get_batches(x, y, batch_size=100):
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]
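A quick check of the batching helper (illustrative only, assuming the split above has run): it should yield len(train_x)//batch_size full batches per epoch.
In [ ]:
# count the batches produced for one pass over the training set
n = 0
for bx, by in get_batches(train_x, train_y, batch_size):
    n += 1
print("batches per epoch:", n)             # expected: len(train_x)//batch_size
print("batch shapes:", bx.shape, len(by))  # (batch_size, seq_len) and batch_size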
In [28]:
# Training
epochs = 10
with graph.as_default():
    saver = tf.train.Saver()
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))
            if iteration % 25 == 0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1
    # the checkpoints/ directory must already exist
    saver.save(sess, "checkpoints/sentiment.ckpt")
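The split above carves out test_x/test_y but never evaluates on them; the cell below is a sketch of such a test pass, assuming the checkpoint saved above and the same graph.
In [ ]:
# sketch: evaluate the saved model on the held-out test split
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))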
In [ ]: