This is a text cell.
In [ ]:
# This is a code cell. It can be executed by pressing CTRL+Enter
print('Hello')
Task: modify the cell above so it greets you. In my case, the cell output should be Hi, Dima.
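For example, one possible change (with my name hard-coded; use yours instead):
In [ ]:
# One possible solution: replace the name with your own
print('Hi, Dima')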
We need a couple of things before getting started.
In [ ]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas
pandas.options.display.max_columns = 11
pandas.options.display.max_rows = 5
import matplotlib
matplotlib.rcParams['font.size'] = 15
matplotlib.rcParams['figure.figsize'] = 15, 9
matplotlib.rcParams['savefig.dpi'] = 227
In [ ]:
from random import sample
from urllib.request import urlretrieve
import pandas as pd
import seaborn as sns
import numpy as np
In [ ]:
def get_space(url, key='space'):
    """Download the co-occurrence data."""
    frame_file, _ = urlretrieve(url)
    return pd.read_hdf(frame_file, key=key)
To demonstrate the idea, we try to cluster a few words by their meaning. The words are boy, man, car, brother, uncle, son, father, dad, grandfather, cousin, parent, boss, owner, staff, adult, manager, director, person, kid, girl, woman, doll, sister, aunt, daughter, mother, mom, grandmother, idea, concept, notion, blue and pink.
Task: How would you group these words? Are there words that share the same theme?
In [ ]:
# Load the space into the memory
toy_space = get_space(
    'http://www.eecs.qmul.ac.uk/~dm303/static/eecs_open14/space_frame_eecs14.h5'
)
Let's see some of the co-occurrence statistics.
In [ ]:
# So far we are interested in just these words
interesting_words = ['idea', 'notion', 'boy', 'girl']
In [ ]:
# Query the vector space for the words of interest
toy_space.loc[interesting_words]
This tells us that idea was seen together with time 258 times in the corpus I've used.
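You can also look up a single co-occurrence count directly by indexing the frame with a row and a column label. A small sketch, assuming time is one of the context columns of the toy space, as the sentence above suggests:
In [ ]:
# Look up how many times 'idea' was seen together with 'time'
# (assumes 'time' is one of the context columns in toy_space)
toy_space.loc['idea', 'time']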
In [ ]:
# We are going to use pairwise_distances function from the sklearn package
from sklearn.metrics.pairwise import pairwise_distances
# Compute distances for the words of interest
distances = pairwise_distances(
    toy_space.loc[interesting_words].values,
    metric='cosine',
)
# Show the result
np.round(
    pd.DataFrame(distances, index=interesting_words, columns=interesting_words),
    3,
)
Task: change metric='cosine' to metric='euclidean'. How will distances change? Why is cosine distance preferred to Euclidean?
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
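One intuition for preferring cosine: it compares only the direction of two vectors and ignores their length, which for co-occurrence counts mostly reflects how frequent a word is. A minimal sketch with made-up vectors (not data from the space) illustrating the difference:
In [ ]:
# Two vectors pointing in the same direction but with very different lengths,
# e.g. a rare and a frequent word with similar co-occurrence profiles.
a = np.array([1.0, 2.0, 3.0])
b = 10 * a
vectors = np.vstack([a, b])
print(pairwise_distances(vectors, metric='cosine'))     # ~0: same direction
print(pairwise_distances(vectors, metric='euclidean'))  # large: different lengths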
A similarity of 1 means that two items are identical, while 0 means that they are completely different. It's possible to convert distances to similarities; here we use np.exp(-distances).
In [ ]:
# np.exp(-distances) is a fancy way of converting distances to similarities
pd.DataFrame(np.exp(-distances), index=interesting_words, columns=interesting_words)
We are going to use scikit-learn's Manifold learning implementation.
In [ ]:
from sklearn import manifold
from sklearn.preprocessing import MinMaxScaler
# clf will be able to "project" word vectors to 2 dimensions
clf = manifold.MDS(n_components=2, dissimilarity='precomputed')
# in X we store the projection results
# Normalize the values between 0 and 1 so it's easier to plot.
X = MinMaxScaler().fit_transform(
    clf.fit_transform(pairwise_distances(toy_space.values, metric='cosine'))
)
Now we have an embedding of the word vectors into a low-dimensional space!
In [ ]:
pd.DataFrame(X, index=toy_space.index)
In [ ]:
import pylab as pl
pl.figure()
for word, (x, y) in zip(toy_space.index, X):
    pl.text(x, y, word)
pl.tight_layout()
Task: Do the clusters you see align with your grouping of the words?
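If you prefer an automatic grouping over eyeballing the plot, one option is to cluster the 2-dimensional coordinates. This is just a sketch; k-means and the choice of 4 clusters are my assumptions, not part of the original exercise.
In [ ]:
# Cluster the 2-D projections with k-means (the number of clusters is a guess)
from sklearn.cluster import KMeans

labels = KMeans(n_clusters=4).fit_predict(X)
# Group the words by the cluster they were assigned to
pd.Series(labels, index=toy_space.index).sort_values()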
In [ ]:
space = get_space(
    'http://www.eecs.qmul.ac.uk/~dm303/static/data/bigo_matrix.h5.gz'
)
Just an example to see what we've got there.
In [ ]:
space.loc[
    ['John', 'Mary', 'girl', 'boy'],
    ['tree', 'car', 'face', 'England', 'France']
]
In [ ]:
def plot(space, words, file_name=None):
    """Plot the `words` from the given `space`."""
    cooc = space.loc[words]
    missing_words = list(cooc[cooc.isnull().all(axis=1)].index)
    assert not missing_words, '{0} are not in the space'.format(missing_words)
    distances = pairwise_distances(cooc, metric='cosine')
    clf = manifold.MDS(n_components=2, dissimilarity='precomputed', n_jobs=2)
    X = MinMaxScaler().fit_transform(
        clf.fit_transform(distances)
    )
    for word, (x, y) in zip(words, X):
        pl.text(x, y, word)
    pl.tight_layout()
    if file_name is not None:
        pl.savefig(file_name)
In [ ]:
matplotlib.rcParams['font.size'] = 20
plot(
    space,
    (
        'red orange pink green blue white yellow black '
        'mother father son daughter aunt uncle '
        'concept research theory '
        'car bus tube road bicycle train '
        'karate fight fencing '
        'apple company fruit train set '
        ''.split()
    )
)
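The plot function also accepts a file_name argument that saves the figure to disk. A small usage sketch (the word list and file name here are just examples):
In [ ]:
# Save a plot of a few words to a PNG file (file name is arbitrary)
pl.figure()
plot(space, ['red', 'green', 'blue', 'mother', 'father'], file_name='space_plot.png')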
In [ ]: