For two entities e1 and e2, predict whether the relation r holds between them, e.g., e1: Obama, e2: USA, r: born_in --> True.
For the link prediction datasets used here as well as several other models optimized for link prediction, check out https://github.com/ibalazevic/HypER and the associated paper.
A SimEc is first trained on all relations combined, then finetuned for individual relations, at which point we use early stopping based on the results on the validation dataset to avoid overfitting.
The performance might still improve with more careful hyperparameter tuning...
In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
np.random.seed(28)
import pandas as pd
import tensorflow as tf
tf.set_random_seed(28)
import keras
import keras.backend as K
from copy import deepcopy
from scipy.sparse import dok_matrix, csr_matrix
from simec import SimilarityEncoder
%load_ext autoreload
%autoreload 2
In [2]:
def load_data(dataset="WN18", split="train"):
    """
    Load one split of a link-prediction dataset into per-relation adjacency dicts.

    Reads tab-separated triples (e1, rel, e2) from
    data/link_prediction/<dataset>/<split>.txt. Entity names are mapped to
    matrix indices via the module-level `entity_ids` dict, and every entity id
    from the module-level `sorted_entity_ids` gets an (initially empty) list.
    NOTE(review): relies on those two globals being set up first (cell 3).

    Inputs:
        - dataset: dataset folder name, e.g. "WN18" or "FB15k"
        - split: "train", "valid", or "test"

    Returns:
        - rel_dict: {relation: {e1: [e2, e4, ...], e2: [e3, e4, ...]}}.
          For each relation the reverse relation ("<rel>_reverse") is stored as
          well, plus a symmetrized "all" relation collecting every edge in both
          directions (used for pretraining).
    """
    rel_dict = {"all": {e: [] for e in sorted_entity_ids}}
    df = pd.read_csv("data/link_prediction/%s/%s.txt" % (dataset, split), sep="\t", names=["e1", "rel", "e2"])
    # iterate via zip over the columns instead of df.iterrows():
    # iterrows builds a pandas Series per row and is dramatically slower
    for e1, rel, e2 in zip(df["e1"], df["rel"], df["e2"]):
        if rel not in rel_dict:
            # first time we see this relation: initialize forward + reverse dicts
            rel_dict[rel] = {e: [] for e in sorted_entity_ids}
            rel_dict[rel + "_reverse"] = {e: [] for e in sorted_entity_ids}
        # translate entity names to matrix indices once per triple
        i1, i2 = entity_ids[e1], entity_ids[e2]
        rel_dict[rel][i1].append(i2)
        rel_dict[rel + "_reverse"][i2].append(i1)
        # "all" is symmetric: record the edge in both directions
        rel_dict["all"][i1].append(i2)
        rel_dict["all"][i2].append(i1)
    return rel_dict
def relation_matrix(entity_dict, n_entities=None):
    """
    Build a relations matrix that has 1 where two entities have a relation and 0 else.

    Inputs:
        - entity_dict: dict with relations: {e1: [e2, e3, ...]} means entity 1 has
          relations to 2 and 3, i.e. something like rel_train[rel] for one relation
        - n_entities: dimension of the (square) matrix; defaults to
          len(sorted_entity_ids) (the module-level entity list) so existing
          callers are unaffected

    Returns:
        - n_entities x n_entities sparse CSR relations matrix: for each entity
          (row), all positions are 1 where this entity has a relation with
          other entities
    """
    n = len(sorted_entity_ids) if n_entities is None else n_entities
    # dok_matrix supports efficient incremental assignment; convert to CSR at the
    # end for fast row slicing / matrix ops downstream
    relmat = dok_matrix((n, n), dtype=np.int8)
    for e1, e2s in entity_dict.items():
        if e2s:  # skip empty lists (no-op assignment anyway)
            relmat[e1, e2s] = 1
    return csr_matrix(relmat)
def eval_relpred(model, target_rels, rel, verbose=0, entity_embs=None, true_rels=None):
    """
    Evaluate relation prediction with the "filtered" ranking protocol.

    Inputs:
        - model: trained model whose .predict(x, warn=False) returns, for one
          entity's input representation, an array with a score per candidate
          entity (higher = more likely the relation holds)
        - target_rels: dict with all relations, e.g. rel_val
        - rel: which relation we're currently dealing with
        - verbose: if truthy, print hits@k, median/mean rank and MRR
        - entity_embs: row-indexable entity inputs; defaults to the
          module-level `entity_embeddings` (backwards compatible)
        - true_rels: dict of ALL known true relations (train+valid+test) used to
          filter other correct answers out of the ranking; defaults to the
          module-level `rel_all` (backwards compatible)

    Returns:
        - hits: list of 10 lists; hits[i] holds 1./0. per evaluated triple
          depending on whether the correct entity ranked at or below i+1
        - ranks: actual (1-based, filtered) ranks of the target entities
    """
    # fall back to the notebook-level globals so existing callers keep working
    if entity_embs is None:
        entity_embs = entity_embeddings
    if true_rels is None:
        true_rels = rel_all
    hits = [[] for i in range(10)]
    ranks = []
    # restrict to the relation currently being evaluated (target_rels is e.g. rel_test)
    rel_targets = target_rels[rel]
    for e1 in rel_targets:
        if rel_targets[e1]:
            # predict for the single entity to limit memory consumption;
            # this gives one vector with scores
            pred = model.predict(entity_embs[e1], warn=False)[0]
            # for all target objects, remember the predicted value before masking
            target_pred = {e2: pred[e2] for e2 in rel_targets[e1]}
            # set all true relations to -1 so they don't mess up the ranking
            # (NOTE: assumes e1 is a key of true_rels[rel], which holds for
            # dicts built by load_data since every entity id gets an entry)
            pred[true_rels[rel][e1]] = -1.
            # get ranks for all object entities, one at a time
            for e2 in rel_targets[e1]:
                pred[e2] = target_pred[e2]
                sort_idx = np.argsort(pred)[::-1]
                rank = np.where(sort_idx == e2)[0][0]
                # re-mask this target so it doesn't interfere with the next one
                pred[e2] = -1.
                ranks.append(rank + 1)
                for i in range(10):
                    if rank <= i:
                        hits[i].append(1.)
                    else:
                        hits[i].append(0.)
    if verbose:
        print('Hits @10: {0}'.format(np.mean(hits[9])))
        print('Hits @3: {0}'.format(np.mean(hits[2])))
        print('Hits @1: {0}'.format(np.mean(hits[0])))
        print('Median rank: {0}'.format(np.median(ranks)))
        print('Mean rank: {0}'.format(np.mean(ranks)))
        print('Mean reciprocal rank: {0}'.format(np.mean(1./np.array(ranks))))
    return hits, ranks
In [3]:
# collect the full entity vocabulary across all three splits
dataset = "WN18"
entity_set = set()
for split in ["train", "valid", "test"]:
    split_df = pd.read_csv("data/link_prediction/%s/%s.txt" % (dataset, split), sep="\t", names=["e1", "rel", "e2"])
    entity_set.update(split_df["e1"])
    entity_set.update(split_df["e2"])
sorted_entities = sorted(entity_set)
sorted_entity_ids = list(range(len(sorted_entities)))
# map every entity name to its matrix index
entity_ids = {e: i for i, e in enumerate(sorted_entities)}
# read in relation data: {relation: {e1: [e2, e4, ...], e2: [e3, e4, ...]}}
rel_train = load_data(dataset, split="train")
rel_val = load_data(dataset, split="valid")
rel_test = load_data(dataset, split="test")
rels = sorted(rel_train)
# for the FB15k dataset, the validation and test sets are missing some relations
for rel in rels:
    rel_val.setdefault(rel, {})
    rel_test.setdefault(rel, {})
# for the evaluation we need all true relations between entities
# (train + valid + test combined, for the "filtered" ranking protocol)
rel_all = deepcopy(rel_train)
for rel in rel_all:
    for split_rels in (rel_val[rel], rel_test[rel]):
        for e1 in split_rels:
            rel_all[rel][e1].extend(split_rels[e1])
In [4]:
# get a sparse input matrix that just has ones on the diagonal
# (one-hot identity vectors: each entity is represented only by its own index)
entity_embeddings = dok_matrix((len(sorted_entity_ids), len(sorted_entity_ids)), dtype=np.int8)
for i in sorted_entity_ids:
    entity_embeddings[i, i] = 1
entity_embeddings = csr_matrix(entity_embeddings)
inputs = entity_embeddings
# get target matrix for all relations at once
rel = "all"
relmat = relation_matrix(rel_train[rel])
# pretraining of the simec to predict all relations
e_dim = 100  # embedding dimension of the similarity encoder
simec = SimilarityEncoder(inputs.shape[1], e_dim, relmat.shape[1], sparse_inputs=True, l2_reg=0., l2_reg_emb=0., l2_reg_out=0.,
                          hidden_layers=[(e_dim, 'linear')], ll_activation="sigmoid", loss="binary_crossentropy", opt=0.001)
# train the simec for a loooong time and save the weights for later
best_mrr = 0
best_epoch = 0
pretrained_weights = None
for i in range(1, 50):
    # train in chunks of 50 epochs between validation checks
    simec.fit(inputs, relmat, epochs=50, batch_size=32, verbose=0)
    # evaluate on validation data to avoid overfitting
    _, ranks_rel = eval_relpred(simec, rel_val, rel)
    mrr = np.mean(1./np.array(ranks_rel))
    if mrr > best_mrr:
        # new best validation MRR: snapshot the weights for finetuning later
        best_mrr = mrr
        best_epoch = i*50
        pretrained_weights = deepcopy(simec.model.get_weights())
    elif mrr < best_mrr and i*50 > 250:
        # early stopping: validation MRR dropped and we trained at least 250 epochs
        break
    print("MRR after %i epochs: %.7f (best: %.7f; epoch %i)" % (i*50, mrr, best_mrr, best_epoch))
print("Final MRR after %i epochs: %.7f (best: %.7f; epoch %i)" % (i*50, mrr, best_mrr, best_epoch))
# evaluate: on all relations combined (like it was trained)
print("#####################")
print("rel = all")
print("#####################")
print("on training data")
hits_rel, ranks_rel = eval_relpred(simec, rel_train, rel, 1)
print("on validation data")
hits_rel, ranks_rel = eval_relpred(simec, rel_val, rel, 1)
print("on test data")
hits_rel, ranks_rel = eval_relpred(simec, rel_test, rel, 1)
# and on the individual relations: aggregate hits/ranks across all relations
hits_train = [[] for i in range(10)]
ranks_train = []
hits_val = [[] for i in range(10)]
ranks_val = []
hits_test = [[] for i in range(10)]
ranks_test = []
for rel in rels:
    if rel == "all":
        continue
    hits_rel, ranks_rel = eval_relpred(simec, rel_train, rel, 0)
    ranks_train.extend(ranks_rel)
    for i in range(10):
        hits_train[i].extend(hits_rel[i])
    hits_rel, ranks_rel = eval_relpred(simec, rel_val, rel, 0)
    ranks_val.extend(ranks_rel)
    for i in range(10):
        hits_val[i].extend(hits_rel[i])
    hits_rel, ranks_rel = eval_relpred(simec, rel_test, rel, 0)
    ranks_test.extend(ranks_rel)
    for i in range(10):
        hits_test[i].extend(hits_rel[i])
print("#####################")
print("averaged results")
print("#####################")
print("on training data")
print('Hits @10: {0}'.format(np.mean(hits_train[9])))
print('Hits @3: {0}'.format(np.mean(hits_train[2])))
print('Hits @1: {0}'.format(np.mean(hits_train[0])))
print('Median rank: {0}'.format(np.median(ranks_train)))
print('Mean rank: {0}'.format(np.mean(ranks_train)))
print('Mean reciprocal rank: {0}'.format(np.mean(1./np.array(ranks_train))))
print("on validation data")
print('Hits @10: {0}'.format(np.mean(hits_val[9])))
print('Hits @3: {0}'.format(np.mean(hits_val[2])))
print('Hits @1: {0}'.format(np.mean(hits_val[0])))
print('Median rank: {0}'.format(np.median(ranks_val)))
print('Mean rank: {0}'.format(np.mean(ranks_val)))
print('Mean reciprocal rank: {0}'.format(np.mean(1./np.array(ranks_val))))
print("on testing data")
print('Hits @10: {0}'.format(np.mean(hits_test[9])))
print('Hits @3: {0}'.format(np.mean(hits_test[2])))
print('Hits @1: {0}'.format(np.mean(hits_test[0])))
print('Median rank: {0}'.format(np.median(ranks_test)))
print('Mean rank: {0}'.format(np.mean(ranks_test)))
print('Mean reciprocal rank: {0}'.format(np.mean(1./np.array(ranks_test))))
# free the TF graph/session before the finetuning cell builds new models
K.clear_session()
In [5]:
# simec finetuning for all other rels: start from the pretrained "all" weights
# and finetune one model per relation with early stopping on validation MRR
hits_train = [[] for i in range(10)]
ranks_train = []
hits_val = [[] for i in range(10)]
ranks_val = []
hits_test = [[] for i in range(10)]
ranks_test = []
for rel in rels:
    # skip the combined relation and relations without val/test triples
    # (early stopping / evaluation would be meaningless for those)
    if rel == "all" or not rel_val[rel] or not rel_test[rel]:
        continue
    print(rel)
    # get relation matrix e1 -> e2
    relmat = relation_matrix(rel_train[rel])
    # we only train on the entities for which we actually have relations
    e1_idx = sorted(e1 for e1 in rel_train[rel] if rel_train[rel][e1])
    relmat = relmat[e1_idx]
    inputs = entity_embeddings[e1_idx]
    simec = SimilarityEncoder(inputs.shape[1], e_dim, relmat.shape[1], sparse_inputs=True, l2_reg=0., l2_reg_emb=0., l2_reg_out=0.,
                              hidden_layers=[(e_dim, 'linear')], ll_activation="sigmoid", loss="binary_crossentropy", opt=0.001)
    # set weights with pretrained weights
    simec.model.set_weights(pretrained_weights)
    # based on the validation data we do early stopping and save the best weights
    # (the pretrained weights themselves are the initial "best" candidate)
    _, ranks_rel = eval_relpred(simec, rel_val, rel)
    best_mrr = np.mean(1./np.array(ranks_rel))
    best_epoch = 0
    best_weights = deepcopy(simec.model.get_weights())
    for i in range(1, 51):
        # finetune in chunks of 3 epochs between validation checks
        simec.fit(inputs, relmat, epochs=3, batch_size=128, verbose=0)
        # evaluate on validation data to avoid overfitting
        _, ranks_rel = eval_relpred(simec, rel_val, rel)
        mrr = np.mean(1./np.array(ranks_rel))
        if mrr > best_mrr:
            best_mrr = mrr
            best_epoch = i*3
            best_weights = deepcopy(simec.model.get_weights())
        elif mrr < best_mrr and i*3 > 25:
            # early stopping: validation MRR dropped after at least 25 epochs
            break
        print("MRR after %i epochs: %.7f (best: %.7f; epoch %i)" % (i*3, mrr, best_mrr, best_epoch))
    # restore the best weights found during finetuning before evaluating
    simec.model.set_weights(best_weights)
    # evaluate this relation on all splits and aggregate into the global lists
    hits_rel, ranks_rel = eval_relpred(simec, rel_train, rel, 0)
    ranks_train.extend(ranks_rel)
    for i in range(10):
        hits_train[i].extend(hits_rel[i])
    hits_rel, ranks_rel = eval_relpred(simec, rel_val, rel, 0)
    ranks_val.extend(ranks_rel)
    for i in range(10):
        hits_val[i].extend(hits_rel[i])
    hits_rel, ranks_rel = eval_relpred(simec, rel_test, rel, 0)
    ranks_test.extend(ranks_rel)
    for i in range(10):
        hits_test[i].extend(hits_rel[i])
    # NOTE(review): clearing the session per relation keeps the TF graph from
    # growing across the many per-relation models — confirm intended placement
    K.clear_session()
In [6]:
# print the aggregated (micro-averaged) metrics for every split
print("#####################")
print("averaged results after fine tuning")
print("#####################")
for label, split_hits, split_ranks in (("on training data", hits_train, ranks_train),
                                       ("on validation data", hits_val, ranks_val),
                                       ("on testing data", hits_test, ranks_test)):
    print(label)
    rank_arr = np.array(split_ranks)
    print('Hits @10: {0}'.format(np.mean(split_hits[9])))
    print('Hits @3: {0}'.format(np.mean(split_hits[2])))
    print('Hits @1: {0}'.format(np.mean(split_hits[0])))
    print('Median rank: {0}'.format(np.median(split_ranks)))
    print('Mean rank: {0}'.format(np.mean(split_ranks)))
    print('Mean reciprocal rank: {0}'.format(np.mean(1./rank_arr)))
#####################
averaged results after fine tuning
#####################
on training data
Hits @10: 0.9899752403984569
Hits @3: 0.9276271088846664
Hits @1: 0.7581389992514539
Median rank: 1.0
Mean rank: 33.98896182414925
Mean reciprocal rank: 0.8443426347451821
on validation data
Hits @10: 0.4060646011865524
Hits @3: 0.34080421885299933
Hits @1: 0.19676994067237968
Median rank: 136.0
Mean rank: 5930.954185893211
Mean reciprocal rank: 0.27780731983544255
on testing data
Hits @10: 0.40874282067645185
Hits @3: 0.3439693682195278
Hits @1: 0.19591576260370133
Median rank: 123.0
Mean rank: 6076.159700063816
Mean reciprocal rank: 0.27886990190953775
#####################
averaged results after fine tuning
#####################
on training data
Hits @10: 0.9272930881599927
Hits @3: 0.8392788007210328
Hits @1: 0.5860503890329751
Median rank: 1.0
Mean rank: 25.683137949552684
Mean reciprocal rank: 0.719122298742699
on validation data
Hits @10: 0.7232114212383702
Hits @3: 0.569718479307026
Hits @1: 0.35256256015399423
Median rank: 3.0
Mean rank: 272.09629651908887
Mean reciprocal rank: 0.4827135253765891
on testing data
Hits @10: 0.7241171272220522
Hits @3: 0.5680466333571259
Hits @1: 0.34524659256993306
Median rank: 3.0
Mean rank: 269.1235427075898
Mean reciprocal rank: 0.47837383495927893
#####################
averaged results after fine tuning
#####################
on training data
Hits @10: 0.6585153360928204
Hits @3: 0.5070968923986973
Hits @1: 0.3510437816191235
Median rank: 3.0
Mean rank: 87.53925351759827
Mean reciprocal rank: 0.4558151167654896
on validation data
Hits @10: 0.4437535653166001
Hits @3: 0.3148317170564746
Hits @1: 0.21003993154592127
Median rank: 17.0
Mean rank: 432.95131203650885
Mean reciprocal rank: 0.2881203144254347
on testing data
Hits @10: 0.44074218940986654
Hits @3: 0.3085121986994573
Hits @1: 0.2008507309441158
Median rank: 18.0
Mean rank: 458.222216789713
Mean reciprocal rank: 0.28085737963917984
In [ ]: