In [2]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity

In [3]:
# Configure logging: DEBUG level, messages prefixed with a time stamp and
# the level name
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)

In [4]:
# Fetch GloVe embedding (warning: it might take a few minutes)
# The fetch parameters are named so they are easy to spot and tweak.
glove_options = {"corpus": "wiki-6B", "dim": 300}
w_glove = fetch_GloVe(**glove_options)


File already downloaded, skipping
05:49:40 INFO:Tranformed 400000 into 381871 words

In [6]:
# Define tasks: map each benchmark label to the dataset returned by its
# fetch function. The fetchers are called in the listed order.
task_fetchers = (
    ("MEN", fetch_MEN),
    ("WS353", fetch_WS353),
    ("SIMLEX999", fetch_SimLex999),
)
tasks = {label: fetch() for label, fetch in task_fetchers}

In [7]:
# Print sample data: the first word pair of every task together with the
# score stored for it
sample_template = "Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}"
for name, data in iteritems(tasks):
    first_pair = data.X[0]
    first_score = data.y[0]
    print(sample_template.format(name, first_pair[0], first_pair[1], first_score))


Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58
Sample data from MEN: pair "sun" and "sunlight" is assigned score [ 10.]
Sample data from WS353: pair "love" and "sex" is assigned score 6.77

In [8]:
# Calculate results using helper function: evaluate_similarity is called once
# per task with the GloVe embedding, the task's word pairs (data.X) and the
# task's scores (data.y).
# print is called in function form with a single argument so the cell runs
# under both Python 2 and Python 3, consistent with the sample-data cell.
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_glove, data.X, data.y)))


05:51:18 WARNING:Missing 24 words. Will replace them with mean vector
Spearman correlation of scores on SIMLEX999 0.370500357109
Spearman correlation of scores on MEN 0.737464696981
Spearman correlation of scores on WS353 0.521712569525

In [ ]: