notebook.community

Edit and run



In [ ]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2



In [ ]:

    
%%javascript
// Override the output area's scroll check with a stub that always answers
// "no" (presumably so long cell outputs are rendered in full instead of being
// placed in a scroll box -- confirm against the notebook front-end in use).
IPython.OutputArea.prototype._should_scroll = function (_lines) {
    return false;
};



In [ ]:

    
from disambiguator import WSD
from pandas import read_csv 


def evaluate(dataset_fpath, max_context_words, model=None):
    """Predict a sense for every row of a dataset and save the predictions.

    The dataset is read as a tab-separated UTF-8 file that must contain the
    columns ``context`` and ``word``. For each row the model is asked for the
    best sense id, and the answers are stored in the ``predict_sense_id``
    column (created, or replaced if it already exists). The result is written
    next to the input as ``<dataset_fpath>.filter<max_context_words>.pred.csv``.

    Args:
        dataset_fpath: Path to the tab-separated input dataset.
        max_context_words: Forwarded as the third argument of
            ``model.get_best_sense_id``; also embedded in the output file name.
        model: Object exposing ``get_best_sense_id(context, word,
            max_context_words)`` that returns a ``(sense_id, <second value>)``
            pair. When omitted, the notebook-level global ``wsd_model`` is
            used, which keeps existing two-argument calls working.

    Returns:
        The path of the written predictions file.
    """
    if model is None:
        # Backward-compatible default: fall back to the notebook global.
        model = wsd_model

    output_fpath = dataset_fpath + ".filter{}.pred.csv".format(max_context_words)
    df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")

    # Collect all predictions first and assign the column in one go. Writing
    # cell by cell with df.loc[i, ...] is slow and, when the column is empty
    # (read as float NaN) or missing, silently turns integer sense ids into
    # floats and triggers dtype-mismatch warnings for string ids.
    predictions = []
    for context, word in zip(df["context"], df["word"]):
        sense_id, _ = model.get_best_sense_id(context, word, max_context_words)
        predictions.append(sense_id)
    df["predict_sense_id"] = predictions

    # The DataFrame index is written as the first (unnamed) column, exactly as
    # before, so previously produced files keep the same layout.
    df.to_csv(output_fpath, sep="\t", encoding="utf-8")
    print("Output:", output_fpath)

    return output_fpath

# --- Experiment settings ------------------------------------------------
context_window_size = 20  # tunable; may be varied between experiments
target_word = "замок"
context = "Замок Нойшванштайн буквально романтический замок баварского короля Людвига II около городка Фюссен и замка Хоэншвангау в юго-западной Баварии, недалеко от австрийской границы. Одно из самых популярных среди туристов мест на юге Германии."
# ignore_case = True
lang = "ru"  # language passed to WSD; used to filter out stopwords
skip_unknown_words = True

# The WSI evaluation datasets can be fetched with:
#   git clone https://github.com/nlpub/russe-wsi-kit.git
wikiwiki_fpath = "../../russe-wsi-kit/data/main/wiki-wiki/train.csv"
activedict_fpath = "../../russe-wsi-kit/data/main/active-dict/test.csv"
btsrnc_fpath = "../../russe-wsi-kit/data/main/bts-rnc/test.csv"
inventory_fpath = "../model/wsi/cc.ru.300.vec.gz.top50.wsi-inventory.tsv"

# --- Model --------------------------------------------------------------
# Build the model only if this kernel session does not already hold one, so
# that re-running the cell does not construct it again.
if "wsd_model" not in globals():
    wsd_model = WSD(
        inventory_fpath,
        language=lang,
        verbose=True,
        skip_unknown_words=skip_unknown_words,
    )

# --- Sanity check -------------------------------------------------------
# Print every (sense, score) pair the model yields for the sample context.
for sense, score in wsd_model.disambiguate(context, target_word):
    print(
        "*\t",
        score,
        sense.keyword,
        ", ".join(sense.cluster),
    )

# --- Evaluation ---------------------------------------------------------
# One run per context-size setting; only the wiki-wiki dataset is enabled.
for max_context_words in (15, 20, 25):
    evaluate(wikiwiki_fpath, max_context_words)
    # Other datasets (disabled):
    # evaluate(btsrnc_fpath, max_context_words)
    # evaluate(activedict_fpath, max_context_words)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
from gensim.models.fasttext import FastText

# load_fasttext_format() reads a model in Facebook's native fastText binary
# format, so it has to be pointed at the original "cc.ru.300.bin" file.
# "cc.ru.300.bin.pkl" is the gensim-saved copy and is read with
# FastText.load() instead.
model = FastText.load_fasttext_format("cc.ru.300.bin")



In [8]:

    
ls









    



Untitled.ipynb
__pycache__/
cc.ru.300.bin
cc.ru.300.bin.pkl
cc.ru.300.bin.pkl.wv.vectors.npy
cc.ru.300.bin.pkl.wv.vectors_ngrams.npy
cc.ru.300.vec.gz
cc.ru.300.vec.gz.pkl
cc.ru.300.vec.gz.pkl.vectors.npy
disambiguator.py
egvi.ipynb
induction.py
test_disambiguator.py



In [8]:

    
from gensim.models.fasttext import FastText
# Load the gensim-saved FastText model from disk.
# NOTE(review): in the recorded run this call failed while unpickling with
# "AttributeError: Can't get attribute 'FastTextKeyedVectors'" (see the cell
# output). Presumably the pickle was written by a gensim version that is
# incompatible with the installed one -- TODO confirm and regenerate the file.
model = FastText.load("cc.ru.300.bin.pkl")
# Disabled: would write the model back to the same pickle path.
# model.save("cc.ru.300.bin.pkl")









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/gensim/models/fasttext.py in load(cls, *args, **kwargs)
    717             if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
--> 718                 model.trainables.vectors_vocab_lockf = ones(len(model.trainables.vectors), dtype=REAL)
    719             if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):

AttributeError: 'FastTextTrainables' object has no attribute 'vectors'

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
<ipython-input-8-358df5bb2088> in <module>()
      1 from gensim.models.fasttext import FastText
----> 2 model = FastText.load("cc.ru.300.bin.pkl")
      3 #model.save("cc.ru.300.bin.pkl")

~/anaconda3/lib/python3.6/site-packages/gensim/models/fasttext.py in load(cls, *args, **kwargs)
    723             logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.')
    724             from gensim.models.deprecated.fasttext import load_old_fasttext
--> 725             return load_old_fasttext(*args, **kwargs)
    726 
    727     @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")

~/anaconda3/lib/python3.6/site-packages/gensim/models/deprecated/fasttext.py in load_old_fasttext(*args, **kwargs)
     51 
     52 def load_old_fasttext(*args, **kwargs):
---> 53     old_model = FastText.load(*args, **kwargs)
     54     params = {
     55         'size': old_model.vector_size,

~/anaconda3/lib/python3.6/site-packages/gensim/models/deprecated/word2vec.py in load(cls, *args, **kwargs)
   1614     @classmethod
   1615     def load(cls, *args, **kwargs):
-> 1616         model = super(Word2Vec, cls).load(*args, **kwargs)
   1617         # update older models
   1618         if hasattr(model, 'table'):

~/anaconda3/lib/python3.6/site-packages/gensim/models/deprecated/old_saveload.py in load(cls, fname, mmap)
     85         compress, subname = SaveLoad._adapt_by_suffix(fname)
     86 
---> 87         obj = unpickle(fname)
     88         obj._load_specials(fname, mmap, compress, subname)
     89         logger.info("loaded %s", fname)

~/anaconda3/lib/python3.6/site-packages/gensim/models/deprecated/old_saveload.py in unpickle(fname)
    378             b'gensim.models.wrappers.fasttext', b'gensim.models.deprecated.fasttext_wrapper')
    379         if sys.version_info > (3, 0):
--> 380             return _pickle.loads(file_bytes, encoding='latin1')
    381         else:
    382             return _pickle.loads(file_bytes)

AttributeError: Can't get attribute 'FastTextKeyedVectors' on <module 'gensim.models.deprecated.keyedvectors' from '/Users/panchenko/anaconda3/lib/python3.6/site-packages/gensim/models/deprecated/keyedvectors.py'>



In [ ]:



In [ ]:

    
# TODO: implement the WSI evaluation by looking at the inventory file.

# TODO: iterate over the method as well: "keyword" or "mean".

for nns in (50, 100, 200):
    inventory_fpath = "/home/panchenko/sensegram/model/cc.ru.300.vec.gz.top{}.inventory.tsv".format(nns)
    print(inventory_fpath)

    # Remaining steps, not implemented yet:
    #   1. load the inventory
    #   2. load the evaluation dataset
    #   3. for each context in the evaluation dataset,
    #      find the most suitable context



In [ ]:



In [ ]:



In [ ]:



In [ ]: