notebook.community

Edit and run



In [1]:

    
import logging
from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews



In [2]:

    
# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')



In [3]:

    
# Fetch skip-gram trained on GoogleNews corpus and clean it slightly
w = fetch_SG_GoogleNews(lower=True, clean_words=True)

# Fetch analogy dataset
data = fetch_google_analogy()









    



05:53:11 INFO:loading projection weights from /home/pocha/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz
05:53:11 INFO:Loading #3000000 words with 300 dim






    



File already downloaded, skipping






    



05:55:25 INFO:Tranformed 3000000 into 2665071 words



In [4]:

    
for cat in (set(data.category)):
    print(cat)









    



gram3-comparative
gram8-plural
capital-common-countries
city-in-state
family
gram9-plural-verbs
gram2-opposite
currency
gram4-superlative
gram6-nationality-adjective
gram7-past-tense
gram5-present-participle
capital-world
gram1-adjective-to-adverb



In [5]:

    
# Pick a sample of data and calculate answers
subset = [50, 1000, 4000, 10000, 14000]
for id in subset:
    w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))
    print("Answer: " + data.y[id])
    print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))









    



Question: bangkok is to thailand as havana is to ?
Answer: cuba
Predicted: asi
Question: baku is to azerbaijan as dushanbe is to ?
Answer: tajikistan
Predicted: tajikistan
Question: rome is to italy as windhoek is to ?
Answer: namibia
Predicted: otjiwarongo
Question: comfortable is to uncomfortable as clear is to ?
Answer: unclear
Predicted: abundantly_clear
Question: slow is to slowing as describe is to ?
Answer: describing
Predicted: describing



In [ ]: