To test with word sense embeddings you can use a pretrained model (sense vectors and sense probabilities). These sense vectors were induced from Wikipedia using word2vec similarities between words in ego-networks. Sense probabilities are stored in a separate file which is located next to the file with sense vectors.
In [ ]:
import sensegram
# Load pre-trained sense vectors induced from Wikipedia (see README for
# model download information). The matching sense-probability file is
# expected to sit next to this file and is picked up automatically.
sense_vectors_fpath = "model/dewiki.txt.clusters.minsize5-1000-sum-score-20.sense_vectors"
sv = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath, binary=False)
In [ ]:
# List the induced senses of the target word together with their probabilities.
word = "Hund"  # German for "dog"
sv.get_senses(word)
In [ ]:
# For each sense of the target word, print the sense id followed by its
# nearest neighbouring senses and their similarity scores.
word = "Hund"
for sense_id, prob in sv.get_senses(word):
    print(sense_id)
    print("=" * 20)
    neighbours = sv.wv.most_similar(sense_id)
    for neighbour_id, similarity in neighbours:
        print("{} {:f}".format(neighbour_id, similarity))
    print("\n")
To use our word sense disambiguation mechanism you also need word vectors or context vectors, depending on the disambiguation strategy. Those vectors are located in the model directory and have the extension .vectors.
Our WSD mechanism is based on word similarities (sim) and requires word vectors to represent context words. In the following we provide a disambiguation example using the similarity strategy.
First, load word vectors using gensim library:
In [ ]:
from gensim.models import KeyedVectors
# Word vectors are needed to represent context words during disambiguation.
word_vectors_fpath = "model/dewiki.txt.word_vectors"
# unicode_errors="ignore" skips malformed byte sequences in the vector file.
wv = KeyedVectors.load_word2vec_format(word_vectors_fpath, binary=False, unicode_errors="ignore")
Then initialise the WSD object with sense and word vectors:
In [ ]:
from wsd import WSD
# window=5: take at most 5 words on each side of the target as context;
# method='sim': similarity-based disambiguation strategy;
# filter_ctx=3: keep only the 3 most discriminative context words.
wsd_model = WSD(sv, wv, window=5, method='sim', filter_ctx=3)
The settings have the following meaning: it will extract at most window*2 words around the target word from the sentence as context, and it will use only the three most discriminative context words for disambiguation.
Now you can disambiguate the word "Hund" in the German context sentence below using the dis_text function. As input it takes a sentence with space-separated tokens, a target word, and the start/end character indices of the target word in the given sentence.
In [ ]:
# Disambiguate the target word inside a real context sentence.
word = "Hund"
context = "Die beste Voraussetzung für die Hund-Katze-Freundschaft ist, dass keiner von beiden in der Vergangenheit unangenehme Erlebnisse mit der anderen Gattung hatte. Am einfachsten ist die ungleiche WG, wenn sich zwei Jungtiere ein Zuhause teilen. Bei erwachsenen Tieren ist es einfacher, wenn sich Miezi in Bellos Haushalt einnistet – nicht umgekehrt, da Hunde Rudeltiere sind. Damit ein Hund das Kätzchen aber auch als Rudelmitglied sieht und nicht als Futter sollten ein paar Regeln beachtet werden"
# dis_text expects the start/end character offsets of the target word in
# the context. The original hard-coded (0, 4) pointed at "Die ", not at
# "Hund" — locate the word so the offsets always match the target.
begin_index = context.find(word)
end_index = begin_index + len(word)
wsd_model.dis_text(context, word, begin_index, end_index)
In [ ]:
import sensegram
from wsd import WSD
from gensim.models import KeyedVectors

# Input data and paths (see README for model download information)
sense_vectors_fpath = "model/wiki.txt.clusters.minsize5-1000-sum-score-20.sense_vectors"
word_vectors_fpath = "model/wiki.txt.word_vectors"
context_words_max = 3  # try 1, 2, 5, 10, 15, or 20: it may improve the results
context_window_size = 5  # this parameter can also be varied during experiments
word = "python"
context = "Python is an interpreted high-level programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, notably using significant whitespace."
ignore_case = True
lang = "en"  # to filter out stopwords

# Load the models (slow)
sv = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath, binary=False)
wv = KeyedVectors.load_word2vec_format(word_vectors_fpath, binary=False, unicode_errors="ignore")

# Inspect the senses of the target word (quick)
print("Probabilities of the senses:\n{}\n\n".format(sv.get_senses(word, ignore_case=ignore_case)))
for sense_id, prob in sv.get_senses(word, ignore_case=ignore_case):
    print(sense_id)
    print("=" * 20)
    for related_sense_id, similarity in sv.wv.most_similar(sense_id):
        print("{} {:f}".format(related_sense_id, similarity))
    print("\n")

# Disambiguate the target word in the context
wsd_model = WSD(sv, wv, window=context_window_size, lang=lang,
                filter_ctx=context_words_max, ignore_case=ignore_case)
print(wsd_model.disambiguate(context, word))
In [ ]:
import sensegram
from wsd import WSD
from gensim.models import KeyedVectors

# Input data and paths
sense_vectors_fpath = "model/sdewac-v3.corpus.clusters.minsize5-1000-sum-score-20.sense_vectors"
word_vectors_fpath = "model/sdewac-v3.corpus.word_vectors"
context_words_max = 3  # try 1, 2, 5, 10, 15, or 20: it may improve the results
context_window_size = 5  # this parameter can also be varied during experiments
word = "Maus"
context = "Die Maus ist ein Eingabegerät (Befehlsgeber) bei Computern. Der allererste Prototyp wurde 1963 nach Zeichnungen von Douglas C. Engelbart gebaut; seit Mitte der 1980er Jahre bildet die Maus für fast alle Computertätigkeiten zusammen mit dem Monitor und der Tastatur eine der wichtigsten Mensch-Maschine-Schnittstellen. Die Entwicklung grafischer Benutzeroberflächen hat die Computermaus zu einem heute praktisch an jedem Desktop-PC verfügbaren Standardeingabegerät gemacht."
ignore_case = True

# Load the models (slow)
sv = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath, binary=False)
wv = KeyedVectors.load_word2vec_format(word_vectors_fpath, binary=False, unicode_errors="ignore")

# Inspect the senses of the target word (quick)
print("Probabilities of the senses:\n{}\n\n".format(sv.get_senses(word, ignore_case=ignore_case)))
for sense_id, prob in sv.get_senses(word, ignore_case=ignore_case):
    print(sense_id)
    print("=" * 20)
    for related_sense_id, similarity in sv.wv.most_similar(sense_id):
        print("{} {:f}".format(related_sense_id, similarity))
    print("\n")

# Disambiguate the target word in the context
wsd_model = WSD(sv, wv, window=context_window_size, lang="de",
                filter_ctx=context_words_max, ignore_case=ignore_case)
print(wsd_model.disambiguate(context, word))
In [ ]:
import sensegram
from wsd import WSD
from gensim.models import KeyedVectors

# Input data and paths
sense_vectors_fpath = "model/wikipedia-ru-2018.txt.clusters.minsize5-1000-sum-score-20.sense_vectors"
word_vectors_fpath = "model/wikipedia-ru-2018.txt.word_vectors"
max_context_words = 3  # try 1, 2, 5, 10, 15, or 20: it may improve the results
context_window_size = 20  # this parameter can also be varied during experiments
word = "ключ"
context = "Ключ — это секретная информация, используемая криптографическим алгоритмом при зашифровании/расшифровании сообщений, постановке и проверке цифровой подписи, вычислении кодов аутентичности (MAC). При использовании одного и того же алгоритма результат шифрования зависит от ключа. Для современных алгоритмов сильной криптографии утрата ключа приводит к практической невозможности расшифровать информацию."
ignore_case = True
lang = "ru"  # to filter out stopwords

# Load models (takes a long time). These two lines were commented out,
# which left `sv`/`wv` undefined (or bound to another language's models
# loaded by an earlier cell) — load the Russian models explicitly.
sv = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath, binary=False)
wv = KeyedVectors.load_word2vec_format(word_vectors_fpath, binary=False, unicode_errors="ignore")

# Play with the model (is quick)
print("Probabilities of the senses:\n{}\n\n".format(sv.get_senses(word, ignore_case=ignore_case)))
for sense_id, prob in sv.get_senses(word, ignore_case=ignore_case):
    print(sense_id)
    print("=" * 20)
    for rsense_id, sim in sv.wv.most_similar(sense_id):
        print("{} {:f}".format(rsense_id, sim))
    print("\n")

# Disambiguate a word in a context.
# NOTE(review): this cell passes `max_context_words=` while the other
# cells pass `filter_ctx=` — confirm which keyword the installed WSD
# version expects; only one of the two can be correct per version.
wsd_model = WSD(sv, wv, window=context_window_size, lang=lang,
                max_context_words=max_context_words, ignore_case=ignore_case)
print(wsd_model.disambiguate(context, word))
###########################
from pandas import read_csv

# Download the WSI evaluation datasets with:
#   git clone https://github.com/nlpub/russe-wsi-kit.git
wikiwiki_fpath = "../russe-wsi-kit/data/main/wiki-wiki/train.csv"
activedict_fpath = "../russe-wsi-kit/data/main/active-dict/test.csv"
btsrnc_fpath = "../russe-wsi-kit/data/main/bts-rnc/test.csv"


def evaluate(dataset_fpath):
    """Disambiguate every row of a WSI dataset and write the predictions
    to `<dataset_fpath>.pred.csv`; return the output path."""
    output_fpath = dataset_fpath + ".pred.csv"
    dataset = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
    for row_index, row in dataset.iterrows():
        predicted_sense, _ = wsd_model.disambiguate(row.context, row.word)
        dataset.loc[row_index, "predict_sense_id"] = predicted_sense
    dataset.to_csv(output_fpath, sep="\t", encoding="utf-8")
    print("Output:", output_fpath)
    return output_fpath


evaluate(wikiwiki_fpath)
evaluate(btsrnc_fpath)
evaluate(activedict_fpath)
In [ ]:
In [ ]:
# IPython magics: reload imported project modules (sensegram, wsd) on every
# cell execution so code edits take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2