Data can be found at http://ltdata1.informatik.uni-hamburg.de/joint/hyperwatset/konvens/. Download all the files and place them in the data/ directory before running the code below.
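The cell below is a minimal sketch of one way to fetch the files into data/, assuming the server allows direct downloads from that listing; the exact file names must be taken from the listing itself, so the example call names only a file already used later in this notebook.
In [ ]:
import os
import urllib.request

BASE_URL = "http://ltdata1.informatik.uni-hamburg.de/joint/hyperwatset/konvens/"

def download(filenames, data_dir="data"):
    """Download the given files from BASE_URL into data_dir, skipping existing ones."""
    os.makedirs(data_dir, exist_ok=True)
    for name in filenames:
        target = os.path.join(data_dir, name)
        if not os.path.exists(target):
            urllib.request.urlretrieve(BASE_URL + name, target)

# Example: download(["all.norm-sz500-w10-cb0-it3-min5.w2v"])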
In [ ]:
from gensim.models import KeyedVectors
import logging
from time import time
from os.path import exists

def try_print(w2v, test_word):
    """Print the nearest neighbours of a test word; warn if it is out of vocabulary."""
    try:
        for word, score in w2v.most_similar(test_word):
            print(word, score)
    except KeyError:
        print("Warning: word '{}' not found.".format(test_word))

def load_and_pickle(w2v_fpath, binary=False):
    """Load a word2vec model, normalise it, and cache it as a pickle for faster reloading."""
    tic = time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    w2v_pkl_fpath = w2v_fpath + ".pkl"
    if exists(w2v_pkl_fpath):
        w2v = KeyedVectors.load(w2v_pkl_fpath)
    else:
        w2v = KeyedVectors.load_word2vec_format(w2v_fpath, binary=binary, unicode_errors='ignore')
        w2v.init_sims(replace=True)  # L2-normalise the vectors in place
        try_print(w2v, "for")
        try_print(w2v, "для")  # Russian "for": sanity check for the Russian model
        w2v.save(w2v_pkl_fpath)
    print(time() - tic, "sec.")
    return w2v, w2v_pkl_fpath
w2v_en_original_fpath = "data/GoogleNews-vectors-negative300.txt"  # standard Google News word2vec model (text format)
w2v_ru_original_fpath = "data/all.norm-sz500-w10-cb0-it3-min5.w2v"  # Russian Distributional Thesaurus (RDT) word2vec model
w2v_en, w2v_en_fpath = load_and_pickle(w2v_en_original_fpath)
w2v_ru, w2v_ru_fpath = load_and_pickle(w2v_ru_original_fpath, binary=True)
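# Optional sanity check of the two loaded models; the query words below are
# illustrative assumptions about the vocabularies, not part of the pipeline.
print(w2v_en.most_similar(positive=["king", "woman"], negative=["man"], topn=3))
print(w2v_ru.most_similar("дом", topn=3))  # "дом" = "house"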
from glob import glob
from vector_representations.build_sense_vectors import run

# Build sense vectors for each sense inventory, using the matching word2vec model.
for lang in ["ru", "en"]:
    sensegram_fpaths = "data/{}/*-sensegram.tsv".format(lang)
    w2v_fpath = w2v_ru_original_fpath if lang == "ru" else w2v_en_original_fpath
    for inventory_fpath in glob(sensegram_fpaths):
        run(inventory_fpath, w2v_fpath)
In [ ]:
import codecs
import operator
from traceback import format_exc
from glob import glob
from vector_representations.dense_sense_vectors import DenseSenseVectors

def generate_binary_hypers(output_dir, max_synsets=1, hyper_synset_max_size=10, hc_max=0):
    """Extract binary hypernymy relations from the global dense sense vectors ``dsv``.

    ``output_dir`` is used as a path prefix for the output files."""
    output_fpath = output_dir + ".vector-link-s%d-hmx%d-hc%d.csv" % (
        max_synsets, hyper_synset_max_size, hc_max)
    bin_count = 0
    out = codecs.open(output_fpath, "w", "utf-8")
    log = codecs.open(output_fpath + ".log", "w", "utf-8")
    for i, h_id in enumerate(dsv.pcz.data):
        try:
            if i % 10000 == 0: print(i)
            if "h" in h_id:
                hypo_h_senses = dsv.pcz.data[h_id][0]["cluster"]
                tmp = sorted(dsv.pcz.data[h_id][0]["cluster"].items(), key=operator.itemgetter(1), reverse=True)
                s_id = "s" + h_id[1:]
                hypo_senses = dsv.pcz.data[s_id][0]["cluster"]

                log.write("\n{}\t{}\n".format(h_id, ", ".join(hypo_h_senses)))
                log.write("{}\n".format(", ".join(["{}:{}".format(k, v) for k, v in tmp])))
                log.write("{}\t{}\n".format(s_id, ", ".join(hypo_senses)))

                # save relations from the hierarchical context
                for hypo_sense in hypo_senses:
                    for hc_num, hyper_sense in enumerate(hypo_h_senses):
                        if hc_num == hc_max: break
                        hypo_word = hypo_sense.split("#")[0]
                        hyper_word = hyper_sense.split("#")[0]
                        if hypo_word != hyper_word:
                            out.write("{}\t{}\tfrom-original-labels\n".format(hypo_word, hyper_word))
                            bin_count += 1

                # save binary relations from a synset
                s_synsets = 0
                for rh_id, s in dsv.sense_vectors.most_similar(h_id + "#0"):
                    if "s" in rh_id:
                        hyper_senses = dsv.pcz.data[rh_id.split("#")[0]][0]["cluster"]
                        if len(hyper_senses) > hyper_synset_max_size: continue
                        rh_str = ", ".join(hyper_senses)
                        log.write("\t{}:{:.3f} {}\n".format(rh_id, s, rh_str))
                        for hypo_sense in hypo_senses:
                            for hyper_sense in hyper_senses:
                                hypo_word = hypo_sense.split("#")[0]
                                hyper_word = hyper_sense.split("#")[0]
                                if hypo_word != hyper_word:
                                    out.write("{}\t{}\tfrom-vector-linkage\n".format(hypo_word, hyper_word))
                                    bin_count += 1
                        s_synsets += 1
                        if s_synsets >= max_synsets: break
        except KeyboardInterrupt:
            break
        except Exception:
            print("Error", i, h_id)
            print(format_exc())

    out.close()
    log.close()
    print("# binary relations:", bin_count)
    print("binary relations:", output_fpath)
    print("log of binary relations:", output_fpath + ".log")
    return bin_count, output_fpath
for pcz_fpath in glob("data/ru/*tsv"):
    print(pcz_fpath)

    # Build dense sense vectors for the current sense inventory; ``dsv`` is
    # read as a global by generate_binary_hypers above.
    dsv = DenseSenseVectors(
        pcz_fpath=pcz_fpath,
        word_vectors_obj=None,
        save_pkl=True,
        sense_dim_num=1000,
        norm_type="sum",
        weight_type="score",
        max_cluster_words=20)

    # Grid search over the extraction hyperparameters.
    for max_top_synsets in [1, 2, 3]:
        for max_hyper_synset_size in [3, 5, 10, 20]:
            for hc_max in [1, 3, 5]:
                print("=" * 50)
                print("max number of synsets:", max_top_synsets)
                print("max hyper synset size:", max_hyper_synset_size)
                print("hc_max:", hc_max)
                generate_binary_hypers(pcz_fpath, max_top_synsets, max_hyper_synset_size, hc_max)
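To inspect the results, the sketch below reads back the first few extracted relations; it assumes only the three-column tab-separated format and the file naming pattern written by generate_binary_hypers above.
In [ ]:
import codecs
from glob import glob

# Peek at the first few (hyponym, hypernym, provenance) triples of each output file.
for output_fpath in glob("data/ru/*.vector-link-*.csv"):
    print(output_fpath)
    with codecs.open(output_fpath, "r", "utf-8") as f:
        for line_num, line in enumerate(f):
            if line_num >= 5: break
            hypo_word, hyper_word, provenance = line.rstrip("\n").split("\t")
            print(hypo_word, "->", hyper_word, "({})".format(provenance))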