In [ ]:
import operator
import os
import re
import numpy
import random
from random import shuffle
from keras.preprocessing.text import Tokenizer
from create_model_fittest import build_network
random.seed(3)
In [ ]:
window_size = 30
maximum_sense = 5
project_path = "/root/shared_folder/inputs/"  # "/home/ali/projects/deep_wsd/inputs/"


def pad_truncate(terms_sequence):
    """Cut a token sequence down to window_size tokens, or pad it with 'T0'."""
    terms = terms_sequence.split()
    if len(terms) > window_size:
        terms = terms[0:window_size]
    else:
        for i in range(window_size - len(terms)):
            terms.append('T0')
    terms = " ".join(terms)
    return terms
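# Sanity check (illustrative): short sequences are padded with 'T0' up to
# window_size tokens, long ones are truncated down to window_size tokens.
assert len(pad_truncate("heart attack").split()) == window_size
assert len(pad_truncate("x " * (window_size + 9)).split()) == window_size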
concept_dic = {}
for file in os.listdir(project_path + "/contexts/"):
    if file.endswith(".txt") and not file.startswith("_"):
        concept = file.split("_")[2]
        concept_dic[concept] = {}
        file_reader = open(project_path + "/contexts/" + file, "r")
        # Each line holds one instance: "<instance_no> >> LEFT: ... RIGHT: ..."
        for instance in file_reader:
            instance = instance.rstrip()
            left, right = instance.replace(" RIGHT: ", "RIGHT:").split("RIGHT:")
            instance_no, left = left.replace("LEFT: ", "LEFT:").split("LEFT:")
            instance_no = instance_no.replace(" >> ", "")
            left = pad_truncate(left)
            right = pad_truncate(right)
            concept_dic[concept][instance_no] = [left, right]
        file_reader.close()
amb_term_dic = {}
file_reader = open(project_path + "ambiguous_terms.txt", "r")
for line in file_reader:
    elements = line.split(") ")[1].split("\t<<")[0].split("\t")
    amb_term_dic[elements[0]] = elements[1:]
file_reader.close()
amb_term_instance_dic = {}
for file in sorted(os.listdir(project_path + "/plain/")):
    concept = file.split("_")[0]
    amb_term_instance_dic[concept] = {}
    file_reader = open(project_path + "/plain/" + file, "r")
    for amb_instance in file_reader:
        head = re.search("<head.*</head>", amb_instance).group(0)
        instance_no = re.search('instance=".*?"', head).group(0)
        instance_no = instance_no.split('"')[1]
        sense = re.search('sense=".*?"', head).group(0).split('"')[1]
        candidates = re.search('candidates=".*?"', head).group(0).split('"')[1].split(",")
        amb_term_instance_dic[concept][instance_no] = []
        # Build one (left context, right context, candidate) triple per
        # candidate sense, labelled 1.0 for the annotated sense, 0.0 otherwise.
        for candidate_concept in amb_term_dic[concept]:
            attr_list = concept_dic[concept][instance_no] + [candidate_concept]
            if candidate_concept == sense:
                amb_term_instance_dic[concept][instance_no].append(attr_list + [1.0])
            else:
                amb_term_instance_dic[concept][instance_no].append(attr_list + [0.0])
    file_reader.close()
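# Sanity check (illustrative): each instance should carry exactly one positive
# label, assuming the annotated sense always appears among the candidate
# concepts listed in amb_term_dic.
assert all(sum(pair[-1] for pair in pairs) == 1.0
           for instances in amb_term_instance_dic.values()
           for pairs in instances.values())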
'''
file_writer = open("text.txt", "w")
for amb_term, instance_no in amb_term_instance_dic.items():
    for key, values in instance_no.items():
        for value in values:
            file_writer.write(str(amb_term) + " " + str(key) + " " + str(value[0:5]) + "\n")
            print(str(amb_term) + " " + str(key) + " " + str(value))
file_writer.close()
'''
terms_concepts_dic = {}
file_reader = open(project_path + "/contexts/_terms_CUIs.txt", "r")
for line in file_reader:
    term, concepts = line.rstrip().split(": ")
    # Pad each term's concept list to exactly maximum_sense CUIs with the
    # dummy concept C0000000.
    concepts_str = ""
    for i in range(maximum_sense - len(concepts.split())):
        concepts_str += " C0000000"
    concepts_str = concepts + concepts_str
    terms_concepts_dic[term] = concepts_str
file_reader.close()
print(len(terms_concepts_dic))
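# Sanity check (illustrative): every term should now map to exactly
# maximum_sense CUIs, assuming no term in the file lists more than
# maximum_sense senses.
assert all(len(cuis.split()) == maximum_sense for cuis in terms_concepts_dic.values())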
expanded_instances = []
# true_labels = []
for amb_term, instances in amb_term_instance_dic.items():
    for instance_key, inner_instances in instances.items():
        for value in inner_instances:
            # print(str(amb_term) + " " + str(instance_key) + " " + str(value))
            left_right_terms = value[0].split(" ") + value[1].split(" ")
            concepts = ""
            for term in left_right_terms:
                concepts += terms_concepts_dic[term] + " "
            # The candidate concept takes one slot; pad the remaining
            # maximum_sense - 1 slots with the dummy concept.
            concepts += value[2] + " C0000000" * 4
            # print(concepts)
            expanded_instances.append(concepts)
            # true_labels.append(value[3])
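# Sanity check (illustrative): each expanded instance holds one
# maximum_sense-long CUI block per context term plus one block for the
# candidate, i.e. (2 * window_size + 1) * maximum_sense tokens in total.
assert all(len(instance.split()) == (2 * window_size + 1) * maximum_sense
           for instance in expanded_instances)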
In [ ]:
for amb_term, instances in concept_dic.items():
    print(amb_term)
    for instance, values in instances.items():
        print(instance)
In [ ]:
# num_words=0 is falsy, so no vocabulary cap is applied.
tokenizer = Tokenizer(lower=False, num_words=0)
tokenizer.fit_on_texts(expanded_instances)
sequences = tokenizer.texts_to_sequences(expanded_instances)
print(len(sequences))
# From here on, concept_dic maps each concept token to its tokenizer index.
concept_dic = tokenizer.word_index
MOST_FREQUENT_LEVEL = 10
print('Top', MOST_FREQUENT_LEVEL, 'Most Frequent Concepts:')
for concept_id, index in sorted(concept_dic.items(), key=operator.itemgetter(1))[:MOST_FREQUENT_LEVEL]:
    print(' >>>', concept_id, ' ', index)

# Chunk each flat sequence into maximum_sense-long blocks, one per context
# term plus one for the candidate concept.
for i in range(0, len(sequences)):
    sequences[i] = [sequences[i][x:x + maximum_sense]
                    for x in range(0, len(sequences[i]), maximum_sense)]
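# Illustrative example of the chunking above, with maximum_sense = 5:
# [3, 1, 4, 1, 5, 9, 2, 6, 5, 3] -> [[3, 1, 4, 1, 5], [9, 2, 6, 5, 3]]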
In [ ]:
def swap_axes(sequences):
    """
    Swap axes to have a proper input for Keras: the whole dataset becomes a
    list, and each entry (i.e. input column) a numpy array.
    """
    sequences_T = list(numpy.swapaxes(sequences, 0, 1))
    for i in range(0, len(sequences_T)):
        sequences_T[i] = numpy.asarray(sequences_T[i])
    return sequences_T
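# Shape sanity check (illustrative): 3 samples x 4 input columns become a
# list of 4 arrays, each holding one column for all 3 samples.
_demo = numpy.zeros((3, 4, maximum_sense))
_columns = swap_axes(_demo)
assert len(_columns) == 4 and _columns[0].shape == (3, maximum_sense)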
In [ ]:
def data_provider(dataset_dic):
    """
    Flatten a dataset dictionary into model inputs X (a list of input-column
    arrays, via swap_axes) and a numpy array of labels y.
    """
    X = []
    y = []
    for amb_term, instances_no in dataset_dic.items():
        for instance_no, instance in instances_no.items():
            for inner_instance in instance:
                X.append(inner_instance[0][:])
                y.append(inner_instance[-1])
    # print(numpy.asarray(X).shape, len(y))
    X = swap_axes(X)
    return X, numpy.asarray(y)
In [ ]:
data_sequence_dic = {}
amb_term_instance_list_dic = {}
sequence_counter = 0
for amb_term, instances in amb_term_instance_dic.items():
    data_sequence_dic[amb_term] = {}
    amb_term_instance_list_dic[amb_term] = []
    for instance_key, inner_instances in instances.items():
        amb_term_instance_list_dic[amb_term].append(instance_key)
        data_sequence_dic[amb_term][instance_key] = []
        for value in inner_instances:
            data_sequence_dic[amb_term][instance_key].append([sequences[sequence_counter], value[-1]])
            sequence_counter += 1

# Per-term split: the first 10% of the shuffled instances go to test, the
# next 5% to validation, and the remaining 85% to training.
training_set_dic = {}
test_set_dic = {}
validation_set_dic = {}
for amb_term, instances_no in amb_term_instance_list_dic.items():
    shuffled_indices = list(range(0, len(instances_no)))  # renamed: "random" would shadow the random module
    shuffle(shuffled_indices)
    test_indices = shuffled_indices[0:int(0.1 * len(shuffled_indices))]
    test_instances = numpy.asarray(instances_no)[test_indices]
    test_set_dic[amb_term] = {}
    for instance in test_instances:
        test_set_dic[amb_term][instance] = data_sequence_dic[amb_term][instance]
    validation_indices = shuffled_indices[len(test_indices):int(0.15 * len(shuffled_indices))]
    validation_instances = numpy.asarray(instances_no)[validation_indices]
    validation_set_dic[amb_term] = {}
    for instance in validation_instances:
        validation_set_dic[amb_term][instance] = data_sequence_dic[amb_term][instance]
    training_indices = shuffled_indices[len(test_indices) + len(validation_indices):]
    training_instances = numpy.asarray(instances_no)[training_indices]
    training_set_dic[amb_term] = {}
    for instance in training_instances:
        training_set_dic[amb_term][instance] = data_sequence_dic[amb_term][instance]
training_X, training_y = data_provider(training_set_dic)
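# Sanity check (illustrative): per term, the three splits should partition the
# full instance list (roughly 85% / 5% / 10% training / validation / test).
for amb_term in amb_term_instance_list_dic:
    n_total = len(amb_term_instance_list_dic[amb_term])
    n_split = (len(training_set_dic[amb_term]) + len(validation_set_dic[amb_term])
               + len(test_set_dic[amb_term]))
    assert n_split == n_total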
In [ ]:
def create_set_confusion_matrices(set_dic, amb_term_dic):
    confusion_matrices_dic = {}
    for amb_term, instance_no in set_dic.items():
        dim = len(amb_term_dic[amb_term])
        confusion_matrix = [[0] * dim for _ in range(dim)]
        # print(numpy.asarray(confusion_matrix).shape)
        confusion_matrices_dic[amb_term] = confusion_matrix
    return confusion_matrices_dic


def print_confusion_matrix(matrix):
    for i in range(len(matrix[0])):
        for j in range(len(matrix[0])):
            print(matrix[i][j], " ", end="")
        print()


def print_confusion_matrices(confusion_matrices_dic):
    for term, matrix in sorted(confusion_matrices_dic.items()):
        print(term)
        print("===")
        print_confusion_matrix(matrix)
        print("")
In [ ]:
def cal_accuracy(matrix):
    count_total = 0
    count_diagonal = 0
    for i in range(len(matrix[0])):
        count_diagonal += matrix[i][i]
        for j in range(len(matrix[0])):
            count_total += matrix[i][j]
    accuracy = float(count_diagonal) / count_total
    return accuracy
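# Worked example (illustrative): for [[8, 2], [1, 9]] the diagonal holds
# 8 + 9 = 17 of 20 decisions, so cal_accuracy returns 0.85.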
def cal_set_accuracies(set_confusion_matrices_dic):
    all_accuracies = []
    for amb_term, matrix in set_confusion_matrices_dic.items():
        all_accuracies.append(cal_accuracy(matrix))
    final_accuracy = sum(all_accuracies) / len(all_accuracies)
    return final_accuracy


def cal_set_accuracy(set_dic, confusion_matrices_dic, set_predictions, label):
    i = 0
    for amb_term, instances_no in set_dic.items():
        for instance_no, instances in instances_no.items():
            # print(instance_no)
            true_labels = []
            predicted_labels = []
            for instance in instances:
                true_labels.append(instance[-1])
                predicted_labels.append(float(set_predictions[i]))
                i += 1
            argmax_true_labels = numpy.array(true_labels).argmax()
            argmax_predicted_labels = numpy.array(predicted_labels).argmax()
            confusion_matrices_dic[amb_term][argmax_true_labels][argmax_predicted_labels] += 1
            # break
    # Return a float (not a formatted string) so callers can compare
    # accuracies across epochs.
    average_accuracy = cal_set_accuracies(confusion_matrices_dic) * 100
    print(label + " accuracy: " + "%.2f" % average_accuracy + "%")
    return average_accuracy
In [ ]:
num_epochs = 1000
embedding_file = project_path + "embeddings/WSD_Embeddings_50D_0_7.emb"
model, embedding_layers = build_network(concept_dic=concept_dic, embeddings_file=embedding_file,
                                        MAX_SENSE_LENGTH=maximum_sense, CONTEXT_WINDOW_SIZE=window_size,
                                        PRE_TRAINED=True,
                                        UPDATABLE=True,
                                        optimizer='rmsprop',
                                        output_activation="linear",
                                        EMBEDDING_DIM=50,
                                        dropout_rate=0.3)
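# build_network (defined in create_model_fittest, not shown here) is assumed
# to return a compiled Keras model together with its embedding layers; only
# the model is used below, via fit() and predict().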
In [ ]:
best_epoch = 0
best_validation_accuracy = 0
for epoch in range(num_epochs):
    print("\nEpoch", str(epoch + 1) + "/" + str(num_epochs))
    history = model.fit(training_X, training_y, batch_size=128, epochs=1)
    validation_X, validation_y = data_provider(validation_set_dic)
    training_pred = model.predict(training_X)
    validation_pred = model.predict(validation_X)
    training_confusion_matrices_dic = create_set_confusion_matrices(training_set_dic, amb_term_dic)
    validation_confusion_matrices_dic = create_set_confusion_matrices(validation_set_dic, amb_term_dic)
    test_confusion_matrices_dic = create_set_confusion_matrices(test_set_dic, amb_term_dic)
    training_epoch_accuracy = cal_set_accuracy(training_set_dic, training_confusion_matrices_dic, training_pred, "Training")
    validation_epoch_accuracy = cal_set_accuracy(validation_set_dic, validation_confusion_matrices_dic, validation_pred, "Validation")
    if validation_epoch_accuracy > best_validation_accuracy:
        best_validation_accuracy = validation_epoch_accuracy
        best_epoch = epoch + 1
print("Maximum validation accuracy is", best_validation_accuracy, "observed in epoch", best_epoch, ".")
In [ ]:
for amb_term, instances_no in sorted(validation_confusion_matrices_dic.items()):
    print(amb_term)
    print("-------")
    print_confusion_matrix(validation_confusion_matrices_dic[amb_term])
    print("-------")
    print("accuracy:", "%.2f" % (cal_accuracy(validation_confusion_matrices_dic[amb_term]) * 100) + "%")
    print("\n")
In [ ]: