SemEval-2010 Task 8: Multi-Way Classification of Semantic Relations Between Pairs of Nominals

In [1]:
import tensorflow as tf
import numpy as np
from collections import Counter

Preprocessing code here

In [2]:
import string
# Open the SemEval training file for reading.
# NOTE(review): no `with` block or close() for this handle is visible in this cell - TODO confirm it is released.
f=open("TRAIN_FILE.TXT", 'r')"\t\r", "  "))
# Lower-case the text held in `txt1`.
txt1 = txt1.lower()
# Visit indices 0, 4, 8, ... below the hard-coded bound 31984:
# entry i of `txt` is used as the sentence and entry i+1 as its label.
for i in range(0, 31984, 4):
    # Add every space-separated token of `at` to `words`, with the
    # <e1>/</e1>/<e2>/</e2> entity markers removed.
    for elem in at.split(" "):
        words.append(elem.replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    # Keep the stripped sentence, also without the entity markers.
    sentence_corpora.append(txt[i].strip().replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    # Keep the stripped label with the "(e1,e2)" / "(e2,e1)" suffix removed,
    # so both argument orders of a relation collapse to one label string.
    sentence_labels.append(txt[i+1].strip().replace("(e1,e2)", "").replace("(e2,e1)", ""))


['the system as described above has its greatest application in an arrayed configuration of antenna elements', 'the child was carefully wrapped and bound into the cradle by means of a cord', 'the author of a keygen uses a disassembler to look at the raw assembly code', 'a misty ridge uprises from the surge', 'the student association is the voice of the undergraduate student population of the state university of new york at buffalohello sir', "this is the sprawling complex that is peru's largest producer of silver", 'the current view is that the chronic inflammation in the distal part of the stomach caused by helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach', 'people have been moving back into downtown', 'the lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces', 'the solute was placed inside a beaker and 5 ml of the solvent was pipetted into a 25 ml glass flask for each trial']

['component-whole', 'other', 'instrument-agency', 'other', 'member-collection', 'other', 'cause-effect', 'entity-destination', 'content-container', 'entity-destination']

['', 'pack', 'carlton', 'matrimonial', 'burrowing', 'forty-eight', 'combustion', 'pre-made', 'zanzibar', 'destrcution']

In [3]:
# Rank the distinct tokens by descending frequency. The sort is stable, so
# tokens with equal counts keep their first-seen order.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
# Ids start at 1, which leaves 0 unused by any word (the padding step fills
# unused positions with 0).
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
# Reverse lookup example (id -> word), kept for reference:
#print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(22947)]) # Prints george

# Encode every sentence as a list of word ids. The list is rebuilt from
# scratch instead of appended to, so re-running this cell cannot duplicate
# the encoded sentences.
sentence_int = [
    [vocab_to_int[word] for word in sentence.split()]
    for sentence in sentence_corpora
]

[10845, 823, 16066, 8795, 10537, 15381, 20784, 11329, 19409, 10543, 9317, 6281, 3960, 15762, 3268, 6492]

In [4]:
# Integer class id for each relation label, consumed by the softmax
# classifier. Position in the list is the id: "cause-effect" -> 0 through
# "other" -> 9.
label_dict = dict(zip(
    [
        "cause-effect",
        "instrument-agency",
        "product-producer",
        "content-container",
        "entity-origin",
        "entity-destination",
        "component-whole",
        "member-collection",
        "message-topic",
        "other",
    ],
    range(10),
))
# Iterate over the label strings gathered in `sentence_labels`
# (presumably to translate each one through `label_dict` into
# `final_labels` - TODO confirm against the loop body).
for elem in sentence_labels:
# Disabled: conversion of `final_labels` to a NumPy array.
#final_labels = np.array(final_labels)

In [5]:
# Left-pad every encoded sentence with zeros to a fixed width of `seq_len`.
# Word ids start at 1, so 0 never collides with a real token.
review_lens = Counter([len(x) for x in sentence_int])
# max() over a Counter iterates its keys, i.e. the distinct sentence lengths.
print("Maximum review length: {}".format(max(review_lens)))
seq_len = 85  # matches the maximum length printed by the recorded run
features = np.zeros((len(sentence_int), seq_len), dtype=int)
for i, row in enumerate(sentence_int):
    # Keep at most the first `seq_len` tokens of the sentence.
    kept = row[:seq_len]
    # An empty sentence stays all-zero. Without this guard the slice
    # `-0:` would select the whole row, and assigning an empty array to
    # it raises ValueError.
    if len(kept) > 0:
        features[i, -len(kept):] = np.array(kept)

Maximum review length: 85

In [6]:
# Alternative version of the padding step.
# NOTE(review): the recorded output of this cell is this same code as a plain
# string, which suggests the cell body is wrapped in quotes (disabled) -
# TODO confirm.
seq_len = 85
# np.ndarray(shape) allocates the array without initialising it, so any
# position not overwritten below holds arbitrary values.
features = np.ndarray((len(sentence_int), seq_len), dtype=int)
# Right-align each encoded sentence inside its row.
for i, row in enumerate(sentence_int):
    features[i, -len(row):] = np.array(row)[:seq_len]
# Scan every position (row and column counts are hard-coded as 7996 and 85)
# looking for zeros; per the recorded output, matches are overwritten with -1.
for i in range(0,7996):
    for j in range(0,85):
        if features[i,j]==0:

'\nseq_len = 85\nfeatures = np.ndarray((len(sentence_int), seq_len), dtype=int)\nfor i, row in enumerate(sentence_int):\n    features[i, -len(row):] = np.array(row)[:seq_len]\nfor i in range(0,7996):\n    for j in range(0,85):\n        if features[i,j]==0:\n            features[i,j]=-1\n            '

In [7]:

(7996, 85)
In [8]:
# Hyperparameters
# lstm_size / lstm_layers: presumably the width of each LSTM layer and the
# number of stacked layers - TODO confirm where the cell is constructed.
lstm_size, lstm_layers = 256, 2
# Rows per batch; also fixes the batch dimension of the RNN's zero state.
# NOTE(review): 7996 is the row count of the full feature matrix. The training
# split has 7196 rows and the validation split 400, and get_batches() only
# yields complete batches, so with this value it yields none for either split.
batch_size = 7996

In [9]:
# Train / validation / test split.
# The first `split_frac` of the rows become the training set; the remaining
# rows are halved into validation and test. Rows are taken in their existing
# order - nothing is shuffled here.
split_frac = 0.9
split_idx = int(len(features) * split_frac)
train_x, holdout_x = features[:split_idx], features[split_idx:]
train_y, holdout_y = final_labels[:split_idx], final_labels[split_idx:]

# Halve the held-out rows: first half validation, second half test.
test_idx = int(len(holdout_x) * 0.5)
val_x, test_x = holdout_x[:test_idx], holdout_x[test_idx:]
val_y, test_y = holdout_y[:test_idx], holdout_y[test_idx:]

# The three feature splits must partition the feature matrix exactly.
assert len(train_x) + len(val_x) + len(test_x) == len(features)

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(7196, 85) 
Validation set: 	(400, 85) 
Test set: 		(400, 85)

In [10]:
# Number of distinct tokens that received an integer id.
n_words = len(vocab_to_int)
# Dedicated graph that holds every node of the model.
graph = tf.Graph()
with graph.as_default():
    # One-hot encode the labels into 10 classes (on value 1, off value 0).
    # NOTE(review): this rebinds `final_labels` to a tensor, so re-running
    # this cell (or any cell that slices `final_labels` as data) behaves
    # differently afterwards - TODO confirm this is intended.
    final_labels = tf.one_hot(indices=final_labels, depth=10, on_value=1, off_value=0)
    # Word-id matrix; both dimensions are left unspecified.
    inputs_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='inputs')
    # Per-batch labels with 10 columns, one per class.
    labels_ = tf.placeholder(dtype=tf.int32, shape=[None, 10], name='labels')
    # Dropout keep probability, supplied through the feed dict at run time.
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

In [11]:
# Width of each word-embedding vector.
embed_size = 256
with graph.as_default():
    # Word ids run from 1 to n_words and id 0 is used for padding, so the
    # lookup table needs n_words + 1 rows. With only n_words rows the
    # highest word id would index past the end of the table.
    embedding = tf.Variable(tf.random_uniform((n_words + 1, embed_size), -1, 1))
    # Replace every id in `inputs_` with its embedding vector.
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [12]:
# Main Network
with graph.as_default():
    # Wrap `lstm` (presumably the base recurrent cell - TODO confirm) so that
    # dropout is applied to its outputs. The keep probability comes from the
    # `keep_prob` placeholder, so it can differ between training and evaluation.
    drop=tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    # All-zero float32 starting state for `cell`, built for `batch_size`
    # sequences; batches fed through this state are expected to have that
    # many rows.
    initial_state = cell.zero_state(batch_size, tf.float32)

In [13]:
# Forward pass: run the recurrent cell over the embedded sequences, starting
# from `initial_state`. `outputs` receives the per-time-step outputs and
# `final_state` the state after the last step.
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embed,
        initial_state=initial_state,
    )

In [14]:
with graph.as_default():
    # Linear projection of the last time step's output onto the 10 classes
    # (no activation, so these are raw logits).
    logits = tf.contrib.layers.fully_connected(outputs[:, -1], 10, activation_fn=None)
    # Softmax cross-entropy for the batch. The function's positional order is
    # (onehot_labels, logits); keyword arguments make that explicit. The
    # labels come from the per-batch `labels_` placeholder so their shape
    # follows the logits.
    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels_, logits=logits)
    # Mean squared error between the labels and the raw logits.
    cost = tf.losses.mean_squared_error(labels_, logits)
    # NOTE(review): the optimizer minimises the MSE `cost`, not the
    # cross-entropy `loss` - TODO confirm which objective is intended.
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [26]:
def get_batches(x, y, batch_size=100):
    """Yield aligned, full-size ``(x, y)`` batches.

    Only complete batches are produced: trailing items that do not fill a
    whole batch are dropped, and nothing is yielded when ``x`` holds fewer
    than ``batch_size`` items.

    Args:
        x: Sliceable sequence of inputs; its length decides the batch count.
        y: Sliceable sequence of targets, sliced with the same bounds as ``x``.
        batch_size: Number of items per batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of consecutive slices.
    """
    # Largest multiple of batch_size that fits into x.
    usable = (len(x) // batch_size) * batch_size
    x = x[:usable]
    y = y[:usable]
    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [28]:
# Number of passes over the training batches.
epochs = 10

with graph.as_default():
    # Created while `graph` is the default graph, so the saver covers this
    # graph's variables.
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    # Running batch counter across all epochs; drives the logging and
    # validation intervals below.
    iteration = 1
    for e in range(epochs):
        # RNN state carried from batch to batch within the epoch.
        state =
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            # Training feed: dropout keeps half of the outputs, and the state
            # from the previous batch seeds the RNN.
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.5,
                    initial_state: state}
            # Note: this rebinds the Python name `loss` to the fetched `cost` value.
            loss, state, _ =[cost, final_state, optimizer], feed_dict=feed)
            # Report the training loss every 5 iterations. `e` is zero-based,
            # so the first epoch prints as 0/10.
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # Evaluate on the validation split every 25 iterations.
            if iteration%25==0:
                # NOTE(review): no visible line appends `batch_acc` to this
                # list before it is averaged - TODO confirm.
                val_acc = []
                val_state =, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    # Validation feed: dropout disabled (keep_prob 1).
                    # NOTE(review): labels are reshaped with y[:, None] here but
                    # fed unchanged in the training feed above - TODO confirm
                    # which form matches the `labels_` placeholder.
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state =[accuracy, final_state], feed_dict=feed)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1, "checkpoints/sentiment.ckpt")

In [ ]: