SemEval-2010 Task 8: Multi-Way Classification of Semantic Relations Between Pairs of Nominals


In [1]:
import tensorflow as tf
import numpy as np
from collections import Counter

Preprocessing: read TRAIN_FILE.TXT, strip the <e1>/<e2> entity tags, and collect the sentences, relation labels, and vocabulary.


In [2]:
with open("TRAIN_FILE.TXT", 'r') as f:
    # Replace tabs and carriage returns with spaces, then lowercase everything
    txt1 = f.read().translate(str.maketrans("\t\r", "  ")).lower()
txt = txt1.split('\n')
sentence_corpora=[]
sentence_labels=[]
words=[]
# Each training example spans four lines: sentence, relation, comment, blank;
# 7996 examples are read here (31984 / 4)
for i in range(0, 31984, 4):
    txt[i]=txt[i].lstrip('0123456789')   # drop the leading example index
    txt[i]=txt[i].replace('\"','')
    txt[i]=txt[i].replace('.','')
    at=str(txt[i].strip())
    for elem in at.split(" "):
        words.append(elem.replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    sentence_corpora.append(txt[i].strip().replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    sentence_labels.append(txt[i+1].strip().replace("(e1,e2)", "").replace("(e2,e1)", ""))

print(sentence_corpora[0:10])
print("\n")
print(sentence_labels[:10])
print("\n")
words=list(set(words))
print(words[0:10])


['the system as described above has its greatest application in an arrayed configuration of antenna elements', 'the child was carefully wrapped and bound into the cradle by means of a cord', 'the author of a keygen uses a disassembler to look at the raw assembly code', 'a misty ridge uprises from the surge', 'the student association is the voice of the undergraduate student population of the state university of new york at buffalohello sir', "this is the sprawling complex that is peru's largest producer of silver", 'the current view is that the chronic inflammation in the distal part of the stomach caused by helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach', 'people have been moving back into downtown', 'the lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces', 'the solute was placed inside a beaker and 5 ml of the solvent was pipetted into a 25 ml glass flask for each trial']


['component-whole', 'other', 'instrument-agency', 'other', 'member-collection', 'other', 'cause-effect', 'entity-destination', 'content-container', 'entity-destination']


['', 'pack', 'carlton', 'matrimonial', 'burrowing', 'forty-eight', 'combustion', 'pre-made', 'zanzibar', 'destrcution']
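
The chained .replace calls above do the tag stripping; a regex handles all four entity tags in one pass. A minimal equivalent sketch (same TRAIN_FILE.TXT layout assumed):

import re

TAG_RE = re.compile(r"</?e[12]>")  # matches <e1>, </e1>, <e2>, </e2>

def strip_entity_tags(sentence):
    # e.g. strip_entity_tags('the <e1>system</e1> ...') -> 'the system ...'
    return TAG_RE.sub("", sentence)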

In [3]:
counts = Counter(words)   # words was deduplicated above, so every count is 1
# and the frequency sort is effectively arbitrary; ids start at 1 so that 0
# stays free as the padding value
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
print(len(vocab_to_int))
#print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(22947)]) # Prints george

sentence_int=[]
for sentence in sentence_corpora:
    sentence_int.append([vocab_to_int[word] for word in sentence.split()])
print(sentence_int[0])
print(len(sentence_int[0]))
print(len(sentence_int))


22947
[10845, 823, 16066, 8795, 10537, 15381, 20784, 11329, 19409, 10543, 9317, 6281, 3960, 15762, 3268, 6492]
16
7996
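
The commented-out reverse lookup in the cell above scans the whole dictionary on every call; inverting vocab_to_int once makes the lookup O(1). A small sketch:

# Build the inverse mapping once; int_to_vocab[22947] gives the same word
# ('george') as the list-index trick above
int_to_vocab = {ii: word for word, ii in vocab_to_int.items()}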

In [4]:
# Map each relation name to an integer class id for the softmax classifier
label_dict={"cause-effect": 0, 
            "instrument-agency": 1, 
            "product-producer": 2, 
            "content-container": 3, 
            "entity-origin": 4, 
            "entity-destination": 5, 
            "component-whole": 6,
            "member-collection": 7,
            "message-topic": 8,
            "other": 9}
final_labels = [label_dict[elem] for elem in sentence_labels]
# One-hot encode up front so label rows can be fed straight into the
# labels_ placeholder defined below
final_labels = np.eye(10, dtype=int)[final_labels]
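
For reading predictions back out later, the inverse of label_dict is useful as well; a one-line sketch:

# Decode class ids to relation names, e.g. int_to_label[0] == "cause-effect"
int_to_label = {v: k for k, v in label_dict.items()}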

In [5]:
# The longest sentence is 85 tokens (printed below);
# shorter sentences are left-padded with the reserved 0 id
sentence_lens = Counter([len(x) for x in sentence_int])
print("Maximum sentence length: {}".format(max(sentence_lens)))
seq_len = 85
features = np.zeros((len(sentence_int), seq_len), dtype=int)
for i, row in enumerate(sentence_int):
    features[i, -len(row):] = np.array(row)[:seq_len]


Maximum sentence length: 85
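
The same left-padding can also be written with np.pad, which states the "pad on the left with zeros" intent directly; an equivalent sketch:

# Left-pad (or truncate) each sentence to seq_len with the 0 padding id
padded = np.array([np.pad(row[:seq_len], (seq_len - len(row[:seq_len]), 0),
                          mode="constant") for row in sentence_int])
assert (padded == features).all()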

In [6]:
'''
Earlier padding variant, kept for reference but unused: identical to the
cell above except that it replaced the 0 padding values with -1.

seq_len = 85
features = np.ndarray((len(sentence_int), seq_len), dtype=int)
for i, row in enumerate(sentence_int):
    features[i, -len(row):] = np.array(row)[:seq_len]
for i in range(0,7996):
    for j in range(0,85):
        if features[i,j]==0:
            features[i,j]=-1
'''

In [7]:
print(np.shape(features))
features[0:5,0:85]


(7996, 85)
Out[7]:
array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 10845,   823, 16066,
         8795, 10537, 15381, 20784, 11329, 19409, 10543,  9317,  6281,
         3960, 15762,  3268,  6492],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0, 10845, 12835,
        17674,  2391,   666, 17079, 16594,  3078, 10845,  4047, 14984,
        21522, 15762, 19724,  2820],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0, 10845,  1562,
        15762, 19724, 15360, 17826, 19724,  2033,  5789,  6959,  6349,
        10845, 10687, 17230, 20620],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 19724,  8822,  8135,
         3136,  3242, 10845,  8426],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 10845, 16286,  3170, 20745, 10845,  5158, 15762, 10845,
        11461, 16286,  5041, 15762, 10845,  1202,  8327, 15762, 18869,
         7764,  6349,   127,  5068]])

In [8]:
# Hyperparams
lstm_size = 256
lstm_layers = 2
batch_size = 100  # sentences fed per training step
learning_rate = 0.003

In [9]:
#train test split
split_frac = 0.9
split_idx = int(len(features)*split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = final_labels[:split_idx], final_labels[split_idx:]
test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))


			Feature Shapes:
Train set: 		(7196, 85) 
Validation set: 	(400, 85) 
Test set: 		(400, 85)
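
The split above keeps the file order; if TRAIN_FILE.TXT grouped examples by relation, validation and test would be skewed. A sketch of shuffling features and labels with one permutation before splitting (not applied above):

# Shuffle rows with a fixed seed so the split stays reproducible
rng = np.random.RandomState(42)
perm = rng.permutation(len(features))
features_shuffled = features[perm]
labels_shuffled = final_labels[perm]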

In [10]:
n_words = len(vocab_to_int) + 1  # +1 so the 0 padding id has an embedding row
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, 10], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [11]:
embed_size = 256
with graph.as_default():
    # Embedding matrix learned from scratch; row 0 corresponds to padding
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
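
Initialising the embedding from pretrained vectors instead of uniform noise is a common variant for this task; a sketch, where the zero matrix is a stand-in for real GloVe/word2vec rows aligned with vocab_to_int (an assumption, nothing above loads such vectors):

glove_matrix = np.zeros((n_words, embed_size), dtype=np.float32)  # stand-in
with graph.as_default():
    pretrained_embedding = tf.Variable(glove_matrix, trainable=True)
    embed_pretrained = tf.nn.embedding_lookup(pretrained_embedding, inputs_)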

In [12]:
# Main Network
with graph.as_default():
    # Build a fresh cell per layer: [drop]*lstm_layers would reuse one cell
    # object, making every layer share the same weights
    def lstm_cell():
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)

In [13]:
# RNN forward pass; outputs has shape [batch, time, lstm_size]
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

In [14]:
with graph.as_default():
    # Classify from the last time step's output
    logits = tf.contrib.layers.fully_connected(outputs[:, -1], 10, activation_fn=None)
    # tf.losses.softmax_cross_entropy expects (onehot_labels, logits) in that
    # order; cross-entropy, not mean squared error, is the appropriate cost
    # for 10-way classification
    cost = tf.losses.softmax_cross_entropy(onehot_labels=labels_, logits=logits)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
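
Had the labels been kept as plain integer ids instead of one-hot rows, tf.nn.sparse_softmax_cross_entropy_with_logits would compute the same loss without the encoding step; a sketch under that assumption (labels_int_ is not used elsewhere):

with graph.as_default():
    labels_int_ = tf.placeholder(tf.int32, [None], name='labels_int')
    sparse_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_int_,
                                                       logits=logits))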

In [25]:
#Validation Accuracy
with graph.as_default():
    # A prediction is correct when the arg-max of the logits matches the
    # arg-max of the one-hot label row
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
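
Accuracy alone hides which relations get confused, and SemEval-2010 Task 8 is officially scored with macro-averaged F1 over the nine relations (direction is stripped in the preprocessing above). A sketch of a confusion matrix built from the existing nodes:

with graph.as_default():
    # 10x10 matrix: rows are true classes, columns are predicted classes
    confusion = tf.confusion_matrix(tf.argmax(labels_, 1),
                                    tf.argmax(logits, 1),
                                    num_classes=10)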

In [26]:
#Batching
def get_batches(x, y, batch_size=100):
    
    # Trim to a whole number of batches, then yield consecutive slices
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]
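
A quick sanity check of the generator: with batch_size = 100, the 7196 training rows yield 71 full batches and the 96-row remainder is dropped.

x0, y0 = next(get_batches(train_x, train_y, batch_size))
print(x0.shape, y0.shape)  # expected: (100, 85) (100, 10)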

In [28]:
#Training
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/relations.ckpt")


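After training, the saved checkpoint can be restored for a test-set pass; a sketch mirroring the validation loop above:

test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))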

In [ ]: