In [1]:
import logging
from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews

In [2]:
# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

In [3]:
# Fetch skip-gram trained on GoogleNews corpus and clean it slightly
w = fetch_SG_GoogleNews(lower=True, clean_words=True)

# Fetch analogy dataset
data = fetch_google_analogy()


05:53:11 INFO:loading projection weights from /home/pocha/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz
05:53:11 INFO:Loading #3000000 words with 300 dim
File already downloaded, skipping
05:55:25 INFO:Tranformed 3000000 into 2665071 words

In [4]:
for cat in (set(data.category)):
    print(cat)


gram3-comparative
gram8-plural
capital-common-countries
city-in-state
family
gram9-plural-verbs
gram2-opposite
currency
gram4-superlative
gram6-nationality-adjective
gram7-past-tense
gram5-present-participle
capital-world
gram1-adjective-to-adverb

In [5]:
# Pick a sample of data and calculate answers
subset = [50, 1000, 4000, 10000, 14000]
for id in subset:
    w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))
    print("Answer: " + data.y[id])
    print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))


Question: bangkok is to thailand as havana is to ?
Answer: cuba
Predicted: asi
Question: baku is to azerbaijan as dushanbe is to ?
Answer: tajikistan
Predicted: tajikistan
Question: rome is to italy as windhoek is to ?
Answer: namibia
Predicted: otjiwarongo
Question: comfortable is to uncomfortable as clear is to ?
Answer: unclear
Predicted: abundantly_clear
Question: slow is to slowing as describe is to ?
Answer: describing
Predicted: describing

In [ ]: