Importing required libraries


In [ ]:
import operator
import os
import re
import numpy
import random
from random import shuffle
from keras.preprocessing.text import Tokenizer
from create_model_fittest import build_network

random.seed(3)

Loading and pre-processing the data


In [ ]:
window_size = 30
maximum_sense = 5
project_path = "/root/shared_folder/inputs/" # "/home/ali/projects/deep_wsd/inputs/"

def pad_truncate(terms_sequence):
    """Truncate a term sequence to window_size tokens, or pad it with 'T0'."""
    terms = terms_sequence.split()
    if len(terms) > window_size:
        terms = terms[:window_size]
    else:
        terms.extend(['T0'] * (window_size - len(terms)))
    return " ".join(terms)
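
# Illustrative behaviour (hypothetical tokens): both calls below yield exactly
# window_size tokens, by padding with 'T0' or truncating, respectively:
#   len(pad_truncate("t1 t2").split()) == window_size
#   len(pad_truncate(" ".join(["t"] * 100)).split()) == window_size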

concept_dic = {}
for file in os.listdir(project_path + "/contexts/"):

    if file.endswith(".txt") and not file.startswith("_"):
        concept = file.split("_")[2]
        concept_dic[concept] = {}
        file_reader = open(project_path + "/contexts/" + file, "r")
        for instance in file_reader:
            instance = instance.rstrip()
            left, right = instance.replace(" RIGHT: ", "RIGHT:").split("RIGHT:")
            instance_no, left = left.replace("LEFT: ", "LEFT:").split("LEFT:")
            instance_no = instance_no.replace(" >> ", "")
            left = pad_truncate(left)
            right = pad_truncate(right)
            concept_dic[concept][instance_no] = [left, right]
        file_reader.close()

amb_term_dic = {}
file_reader = open(project_path + "ambiguous_terms.txt", "r")
for line in file_reader:
    elements = line.split(") ")[1].split("\t<<")[0].split("\t")
    amb_term_dic[elements[0]] = elements[1:]
file_reader.close()

amb_term_instance_dic = {}
for file in sorted(os.listdir(project_path + "/plain/")):

    concept = file.split("_")[0]
    amb_term_instance_dic[concept] = {}

    file_reader = open(project_path + "/plain/" + file, "r")
    for amb_instance in file_reader:
        head = re.search("<head.*</head>", amb_instance).group(0)
        instance_no = re.search('instance=".*?"', head).group(0)
        instance_no = instance_no.split('"')[1]
        sense = re.search('sense=".*?"', head).group(0).split('"')[1]
        candidates = re.search('candidates=".*?"', head).group(0).split('"')[1].split(",")

        amb_term_instance_dic[concept][instance_no] = []

        for candidate_concept in amb_term_dic[concept]:
            attr_list = concept_dic[concept][instance_no] + [candidate_concept]
            if candidate_concept == sense:
                amb_term_instance_dic[concept][instance_no].append(attr_list + [1.0])
            else:
                amb_term_instance_dic[concept][instance_no].append(attr_list + [0.0])
    file_reader.close()

terms_concepts_dic = {}
file_reader = open(project_path + "/contexts/_terms_CUIs.txt", "r")
for line in file_reader:
    term, concepts = line.rstrip().split(": ")
    # Pad (or cut) each term's concept list to exactly maximum_sense CUIs;
    # the chunking step later relies on a fixed number of senses per term.
    concept_list = concepts.split()[:maximum_sense]
    concept_list += ["C0000000"] * (maximum_sense - len(concept_list))
    terms_concepts_dic[term] = " ".join(concept_list)
file_reader.close()
print(len(terms_concepts_dic))

expanded_instances = []
# true_labels = []
for amb_term, instances in amb_term_instance_dic.items():
    for instance_key, inner_instances in instances.items():
        for value in inner_instances:
            # print(str(amb_term) + " " + str(instance_key) + " " + str(value))
            left_right_terms = value[0].split(" ") + value[1].split(" ")
            concepts = ""
            for term in left_right_terms:
                concepts += terms_concepts_dic[term] + " "
            # The candidate sense is a single CUI; pad it to maximum_sense tokens.
            concepts += value[2] + " C0000000" * (maximum_sense - 1)
            # print(concepts)
            expanded_instances.append(concepts)
            # true_labels.append(value[3])
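
Each expanded instance should now contain (2 * window_size + 1) * maximum_sense tokens: maximum_sense CUIs for each of the 60 context terms, plus the candidate sense padded to maximum_sense tokens. A quick sanity check (added for illustration, not part of the original pipeline):


In [ ]:
print(len(expanded_instances[0].split()))  # expected: (2 * 30 + 1) * 5 = 305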

In [ ]:
for amb_term, instances in concept_dic.items():
    print(amb_term)
    for instance, values in instances.items():
        print(instance)

Preparing data for training with Keras


In [ ]:
tokenizer = Tokenizer(lower=False, num_words=None)  # keep all tokens; CUIs are case-sensitive
tokenizer.fit_on_texts(expanded_instances)
sequences = tokenizer.texts_to_sequences(expanded_instances)

print(len(sequences))
# concept_dic is re-used here for the tokenizer's word index (token -> integer id),
# replacing the raw contexts dictionary built above.
concept_dic = tokenizer.word_index

MOST_FREQUENT_LEVEL = 10
print('Top', MOST_FREQUENT_LEVEL, 'most frequent concepts:')
for concept_id, index in sorted(concept_dic.items(), key=operator.itemgetter(1))[:MOST_FREQUENT_LEVEL]:
    print('  >>>', concept_id, '   ', index)

"""Chunking the data"""
for i in range(0, len(sequences)):
    chunks = [sequences[i][x:x + maximum_sense] for x in range(0, len(sequences[i]), maximum_sense)]
    sequences[i] = []
    for element in chunks:
        sequences[i].append(element)
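
After chunking, every sequence should hold (2 * window_size + 1) chunks of maximum_sense token ids each, one chunk per context term plus one for the candidate sense. A quick check (added for illustration):


In [ ]:
print(len(sequences[0]), len(sequences[0][0]))  # expected: 61 5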

In [ ]:
def swap_axes(sequences):
    """
    Swap axes to have a proper input for Keras,
    and typecast the whole data as a list with each entry (i.e. input column) as a numpy array.
    """
    sequences_T = list(numpy.swapaxes(sequences, 0, 1))
    for i in range(len(sequences_T)):
        sequences_T[i] = numpy.asarray(sequences_T[i])

    return sequences_T
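
For example (toy ids, added for illustration), two samples with three input columns of two ids each come back as a list of three arrays, one per input column:


In [ ]:
toy = [[[1, 2], [3, 4], [5, 6]],
       [[7, 8], [9, 10], [11, 12]]]
columns = swap_axes(toy)
print(len(columns), columns[0].shape)  # expected: 3 (2, 2)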

In [ ]:
def data_provider(dataset_dic):
    """
    Flatten a {amb_term: {instance_no: [[sequence, label], ...]}} dictionary
    into Keras-ready inputs: a list X with one numpy array per input column
    (via swap_axes) and a label vector y.
    """
    X = []
    y = []
    for amb_term, instances_no in dataset_dic.items():
        for instance_no, instance in instances_no.items():
            for inner_instance in instance:
                X.append(inner_instance[0][:])
                y.append(inner_instance[-1])
    X = swap_axes(X)
    return X, numpy.asarray(y)

Building training, validation, and test sets


In [ ]:
data_sequence_dic = {}
amb_term_instance_list_dic = {}
sequence_counter = 0
for amb_term, instances in amb_term_instance_dic.items():
    data_sequence_dic[amb_term] = {}
    amb_term_instance_list_dic[amb_term] = []
    for instance_key, inner_instances in instances.items():
        amb_term_instance_list_dic[amb_term].append(instance_key)
        data_sequence_dic[amb_term][instance_key] = []
        for value in inner_instances:
            data_sequence_dic[amb_term][instance_key].append([sequences[sequence_counter], value[-1]])
            sequence_counter += 1

training_set_dic = {}
test_set_dic = {}
validation_set_dic = {}

for amb_term, instances_no in amb_term_instance_list_dic.items():
    # Shuffle the instance keys and split them 10% / 5% / 85% into
    # test / validation / training ('indices' replaces a local variable
    # that used to shadow the random module).
    indices = list(range(len(instances_no)))
    shuffle(indices)

    test_indices = indices[0:int(0.1 * len(indices))]
    test_instances = numpy.asarray(instances_no)[test_indices]
    test_set_dic[amb_term] = {}
    for instance in test_instances:
        test_set_dic[amb_term][instance] = data_sequence_dic[amb_term][instance]

    validation_indices = indices[len(test_indices):int(0.15 * len(indices))]
    validation_instances = numpy.asarray(instances_no)[validation_indices]
    validation_set_dic[amb_term] = {}
    for instance in validation_instances:
        validation_set_dic[amb_term][instance] = data_sequence_dic[amb_term][instance]

    training_indices = indices[len(test_indices) + len(validation_indices):]
    training_instances = numpy.asarray(instances_no)[training_indices]
    training_set_dic[amb_term] = {}
    for instance in training_instances:
        training_set_dic[amb_term][instance] = data_sequence_dic[amb_term][instance]

training_X, training_y = data_provider(training_set_dic)
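
A quick look at the resulting structure (added check): training_X is a list with one numpy array per input column, and training_y holds one 0/1 label per candidate sense.


In [ ]:
print(len(training_X), training_X[0].shape, training_y.shape)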

Fitting the model


In [ ]:
def create_set_confusion_matrices(set_dic, amb_term_dic):
    """Create one zeroed dim x dim confusion matrix per ambiguous term."""
    confusion_matrices_dic = {}
    for amb_term in set_dic:
        dim = len(amb_term_dic[amb_term])
        confusion_matrices_dic[amb_term] = [[0] * dim for _ in range(dim)]
    return confusion_matrices_dic

def print_confusion_matrix(matrix):
    for row in matrix:
        print(" ".join(str(cell) for cell in row))
    
def print_confusion_matrices(confusion_matrices_dic):
    for term, matrix in sorted(confusion_matrices_dic.items()):
        print(term)
        print("===")
        print_confusion_matrix(matrix)
        print("")

In [ ]:
def cal_accuracy(matrix):
    """Accuracy = sum of the diagonal over the sum of all cells."""
    count_total = 0
    count_diagonal = 0
    for i in range(len(matrix)):
        count_diagonal += matrix[i][i]
        for j in range(len(matrix)):
            count_total += matrix[i][j]
    return float(count_diagonal) / count_total
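
# Worked example (hypothetical counts): for the 2x2 matrix [[8, 2], [1, 9]]
# the diagonal sums to 17 of 20 instances, so cal_accuracy returns 17/20 = 0.85.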

def cal_set_accuracies(set_confusion_matrices_dic):
    all_accuracies = []
    for amb_term, matrix in set_confusion_matrices_dic.items():
        all_accuracies.append(cal_accuracy(matrix))
    final_accuracy = sum(all_accuracies) / len(all_accuracies)
    return final_accuracy

def cal_set_accuracy(set_dic, confusion_matrices_dic, set_predictions, label):
    i = 0
    for amb_term, instances_no in set_dic.items():
        for instance_no, instances in instances_no.items():
            true_labels = []
            predicted_labels = []
            for instance in instances:
                true_labels.append(instance[-1])
                predicted_labels.append(float(set_predictions[i]))
                i += 1
            argmax_true_labels = numpy.array(true_labels).argmax()
            argmax_predicted_labels = numpy.array(predicted_labels).argmax()
            confusion_matrices_dic[amb_term][argmax_true_labels][argmax_predicted_labels] += 1

    # Return a float so accuracies can be compared numerically across epochs
    # (the formatted-string return value used before breaks the > comparison).
    average_accuracy = cal_set_accuracies(confusion_matrices_dic) * 100

    print(label + " accuracy: " + "%.2f" % average_accuracy + "%")

    return average_accuracy

In [ ]:
num_epochs = 1000
embedding_file = project_path + "embeddings/WSD_Embeddings_50D_0_7.emb"
model, embedding_layers = build_network(concept_dic=concept_dic, embeddings_file=embedding_file,
                                        MAX_SENSE_LENGTH=maximum_sense, CONTEXT_WINDOW_SIZE=window_size,
                                        PRE_TRAINED=True,
                                        UPDATABLE=True,
                                        optimizer='rmsprop',
                                        output_activation="linear",
                                        EMBEDDING_DIM=50,
                                        dropout_rate=0.3)
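
To inspect the resulting architecture (an added convenience; this assumes build_network returns a compiled Keras model, as its use with model.fit below implies):


In [ ]:
model.summary()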

In [ ]:
best_epoch = 0
best_validation_accuracy = 0

# The validation set does not change, so build its inputs once before the loop.
validation_X, validation_y = data_provider(validation_set_dic)

for epoch in range(num_epochs):

    print("\nEpoch", str(epoch + 1) + "/" + str(num_epochs))
    history = model.fit(training_X, training_y, batch_size=128, epochs=1)

    training_pred = model.predict(training_X)
    validation_pred = model.predict(validation_X)

    training_confusion_matrices_dic = create_set_confusion_matrices(training_set_dic, amb_term_dic)
    validation_confusion_matrices_dic = create_set_confusion_matrices(validation_set_dic, amb_term_dic)
    test_confusion_matrices_dic = create_set_confusion_matrices(test_set_dic, amb_term_dic)

    training_epoch_accuracy = cal_set_accuracy(training_set_dic, training_confusion_matrices_dic, training_pred, "Training")
    validation_epoch_accuracy = cal_set_accuracy(validation_set_dic, validation_confusion_matrices_dic, validation_pred, "Validation")

    if validation_epoch_accuracy > best_validation_accuracy:
        best_validation_accuracy = validation_epoch_accuracy
        best_epoch = epoch + 1

    print("Maximum validation accuracy is %.2f%%, observed in epoch %d." % (best_validation_accuracy, best_epoch))
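
The loop above only records which epoch was best; it does not keep that epoch's weights. A minimal sketch of checkpointing (the filename is hypothetical; in practice the save call belongs inside the loop, right after a new best validation accuracy is found):


In [ ]:
best_weights_path = "best_model.weights.h5"  # hypothetical checkpoint path
model.save_weights(best_weights_path)  # call this inside the loop on a new best epoch
model.load_weights(best_weights_path)  # restore the best weights after training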

In [ ]:
for amb_term, matrix in sorted(validation_confusion_matrices_dic.items()):
    print(amb_term)
    print("-------")
    print_confusion_matrix(matrix)
    print("-------")
    print("accuracy:", "%.2f" % (cal_accuracy(matrix) * 100) + "%")
    print("\n")
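
The test split built earlier is never scored above. It can be evaluated with the same helpers (an added sketch, reusing only functions defined in this notebook):


In [ ]:
test_X, test_y = data_provider(test_set_dic)
test_pred = model.predict(test_X)
test_confusion_matrices_dic = create_set_confusion_matrices(test_set_dic, amb_term_dic)
cal_set_accuracy(test_set_dic, test_confusion_matrices_dic, test_pred, "Test")
print_confusion_matrices(test_confusion_matrices_dic)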
