In [1]:
%matplotlib inline
### Interactive medword testing tool ###
import importlib
import json
import os

import pylab

import medword_pipeline
import embedding_fasttext
import embedding_word2vec
import shared.load_config
import model_validation as mv
import preprocess as pp
In [2]:
### define filepaths and names ###
#
# Note: the trained embedding model must match the 'embedding_method'
#       entry of the configuration file ("word2vec" or "fasttext").
#
# Directory and filename of the configuration file that belongs to the
# embedding model.
config_data_dir = 'data/embeddings/emb4/'
config_filename = 'emb_model_w2v_4_configuration.json'
# Directory and filename of the trained embedding model. Both values are
# handed to embedding.load_model() and mv.validate_model() in later cells.
emb_model_dir = 'data/embeddings/emb4/'
emb_model_filename = 'emb_model_w2v_4.bin'
# Load configuration (don't change)
# The settings are accessed by key through `config.config` in later cells.
config_src = os.path.join(config_data_dir, config_filename)
config = shared.load_config.Configuration(config_src)
In [4]:
### initialize embedding object (don't change)
# Pick the embedding wrapper that matches the 'embedding_method' entry of the
# loaded configuration. Only "fasttext" and "word2vec" are supported.
if config.config['embedding_method'] == 'fasttext':
    embedding = embedding_fasttext.EmbeddingFasttext(config)
elif config.config['embedding_method'] == 'word2vec':
    embedding = embedding_word2vec.EmbeddingWord2vec(config)
else:
    # Name the real config key and the offending value in the exception
    # itself, so the traceback explains the failure without a separate print.
    raise AttributeError(
        'embedding_method (in config) must be "fasttext" or "word2vec", '
        'got {!r}'.format(config.config['embedding_method'])
    )

### Load model (don't change)
# emb_model_dir / emb_model_filename come from the file-path cell above.
embedding.load_model(emb_model_dir, emb_model_filename)
print("Model loaded.")
In [5]:
### Get model information ###
# Number of entries returned by the model's vocabulary accessor.
print("vocab size:", len(embedding.get_vocab()))
# Value reported by embedding.vec_dim() for the loaded model.
print("vector dim:", embedding.vec_dim())
In [6]:
### Play with model ###
# Define a word whose n nearest neighbours in the embedding space
# you would like to see.
word = 'arzt'
n_nearest_neighbours = 10
# Check if the word is in the model vocab.
# NOTE(review): how most_similar_n behaves for an out-of-vocabulary word is
# not visible here - confirm before relying on it.
print("word is in model vocab: ", word in embedding.get_vocab())
# Create the list of neighbours. It is left as the last bare expression so
# that the notebook displays it as the cell output.
embedding.most_similar_n(word, n_nearest_neighbours)
Out[6]:
In [7]:
### Analogy
# Signature: embedding.analogy([positives], [negatives], topn)
# Here: positives = ['koenig', 'frau'], negatives = ['mann'], topn = 10.
# The result is the last bare expression, so it is shown as the cell output.
embedding.analogy(['koenig', 'frau'], ['mann'], 10)
Out[7]:
In [8]:
# Get the vector of a single word
vec = embedding.word_vec('das')
# Print only the first ten entries to keep the output short.
print(vec[:10])
In [9]:
### Validation of the embedding model ###
# The three assignments below overwrite entries of the already loaded
# configuration in place.
# NOTE(review): presumably mv.validate_model reads them through the
# embedding's config object - confirm in model_validation.

# Where to get the validation data.
config.config['val_data_dir'] = 'data/validation_data/'
# Choose the DOESNTFIT test file from your validation folder.
# Note: - the doesntfit file has one 'question' per line
#       - a 'question' has the format
#
#             word_1 word_2 word_3 ... word_N word_not_fitting
#
#         -> word_1 to word_N belong to a category, word_not_fitting doesn't
#
# Example question:
#
#             Auto Motorrad Fahrrad Fußgänger
#
#
config.config['doesntfit_file'] = 'german_doesntfit1.txt'
# Choose the SYNONYM test file from your validation folder.
# Note: - the synonym file has one 'question' per line
#       - a 'question' has the format
#
#             word_1 word_2
#
#         -> word_1 and word_2 are synonyms
#
# Example question:
#
#             Blutgerinnsel Thrombus
#
#
config.config['synonyms_file'] = 'german_synonyms3.txt'
# Run the validation for the loaded model.
mv.validate_model(embedding, emb_model_dir, emb_model_filename)
In [13]:
### Visualization ###
# From a list of words, plot every word together with its neighbours.
# Define the words you would like to visualize.
word_list = ['arzt', 'sonne', 'der']
# Define the number of closest neighbours to display per word.
# Note: this rebinds the notebook-global n_nearest_neighbours that the
# "Play with model" cell also sets.
n_nearest_neighbours = 4
# Change the figure size if desired.
width = 10.0
height = 8.0
pylab.rcParams['figure.figsize'] = (width, height)
# Plotting
mv.visualize_words(embedding, word_list, n_nearest_neighbours)
In [10]:
### Try out the tokenizer selected by the configuration ###
# pp.get_tokenizer builds the tokenizer from the loaded config.
tokenizer = pp.get_tokenizer(config)
# Sample text mixing quotes, abbreviations, brackets, markup residue and
# hyphenated words.
example = """!! „asdf“ 'asdf' abk. hey "asdf" ??Ein Arzt bzw. eine Ärztin ist ein medizinisch\n \n
ausgebildeter Heilkundiger, beschäftigt sich mit der
Vorbeugung (Prävention), [Erkennung] (Diagnose),
Behandlung (Therapie) [...] und Nachsorge von Krankheiten
und Leiden bzw. gesundheitlichen Beeinträchtigungen,
und stellt sich">
damit in den Dienst der Gesundheit.
stae-asdf"""
# `example` is already a str, so it is passed to the tokenizer directly.
tk_example = tokenizer.tokenize(example)
print(tk_example)
In [11]:
### Development helper ###
# Re-import model_validation so that edits to the module take effect without
# restarting the kernel. importlib is imported with the other modules in the
# first cell.
importlib.reload(mv)
# Path to a human-similarity CSV. Nothing in the visible notebook reads it yet.
file_src = 'data/validation_data/human_similarity.csv'
In [ ]:
In [ ]: