In [1]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re
In [2]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bhtsne import tsne
In [3]:
# Load the pretrained gensim Word2Vec model (trained on the entire GoT text) from the
# relative path trained/thrones2vec.w2v; the cells below all query this object.
# NOTE(review): gensim model files are presumably pickle-based — only load files you trust.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))
In [4]:
# Embedding matrix for the whole vocabulary. best_avgs() maps a row number back to its
# word through thrones2vec.index2word, so row i is treated as the vector of index2word[i].
# NOTE(review): `syn0` looks like the attribute name of older gensim releases — confirm
# against the gensim version this notebook is pinned to.
all_word_vecs = thrones2vec.syn0
In [9]:
def best_avgs(words, all_vecs, k=10, model=None):
    """Return the ``k`` vocabulary words closest to the mean vector of ``words``.

    The embeddings of ``words`` are averaged, the cosine similarity between that
    average and every row of ``all_vecs`` is computed, and the best matches are
    returned with the input words themselves filtered out.

    Parameters
    ----------
    words : iterable of str
        Words to average; each must be known to ``model``.
    all_vecs : array-like, shape (n_vocab, dim)
        Matrix to search. Row ``i`` must be the vector of ``model.index2word[i]``.
    k : int, default 10
        Maximum number of neighbours to return.
    model : optional
        Object providing ``model[word]`` and ``model.index2word``. Defaults to
        the notebook-level ``thrones2vec`` model.

    Returns
    -------
    list of (str, float)
        ``(word, cosine_similarity)`` pairs, most similar first. The list is
        shorter than ``k`` when the vocabulary does not hold enough other words.

    Raises
    ------
    ValueError
        If ``words`` is empty (there is nothing to average).
    """
    if model is None:
        model = thrones2vec

    words = list(words)
    if not words:
        raise ValueError("words must contain at least one word")
    if k <= 0:
        return []

    all_vecs = np.asarray(all_vecs)

    # Mean embedding of the query words.
    avg = np.mean([model[word] for word in words], axis=0)

    # Cosine similarity of the mean against every row. Rows (or a mean) with
    # zero norm get similarity 0 instead of NaN, because NaN would be ranked
    # as the largest value by argpartition.
    dots = all_vecs.dot(avg)
    denom = np.linalg.norm(all_vecs, axis=1) * np.linalg.norm(avg)
    similarity = np.divide(
        dots, denom, out=np.zeros(dots.shape, dtype=float), where=denom > 0
    )

    # The query words are usually among their own nearest neighbours, so fetch
    # k + len(words) candidates and drop the queries afterwards. The count is
    # clamped to the vocabulary size so argpartition never gets an invalid kth.
    n_vocab = similarity.shape[0]
    n_closest = min(k + len(words), n_vocab)
    if n_closest == n_vocab:
        ind = np.arange(n_vocab)
    else:
        ind = np.argpartition(similarity, -n_closest)[-n_closest:]

    excluded = set(words)
    candidates = [
        (model.index2word[i], similarity[i])
        for i in ind
        if model.index2word[i] not in excluded
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:k]
In [8]:
children = ["Arya","Robb","Sansa","Bran","Jon"]
best_avgs(children, all_word_vecs, 10)
Out[8]:
And the top two best averages? Their parents: Ned and Catelyn. Math is Beautiful :)
In [10]:
families = ["Lannister","Stark"]
best_avgs(families, all_word_vecs, 10)
Out[10]:
Spoilers
In [32]:
families = ["Tully","Stark"]
best_avgs(families, all_word_vecs, 10)
Out[32]:
In [11]:
families = ["Lannister","Baratheon"]
best_avgs(families, all_word_vecs, 10)
Out[11]:
Who's the usurper? A usurper is a person who takes a position of power or importance illegally or by force.
In [14]:
thrones2vec.most_similar("usurper")
Out[14]:
Here we obtain words that are used in the same contexts as "usurper", or that have some similarity of usage with it. So the model is able to capture this kind of relationship as well.
In [38]:
thrones2vec.most_similar("Tyrion")
Out[38]:
In [43]:
thrones2vec.most_similar("Dothraki")
Out[43]:
In [20]:
def nearest_similarity_cosmul(start1, end1, end2, model=None):
    """Solve the analogy "start1 is to end1 as ? is to end2" and print it.

    Queries ``most_similar_cosmul`` with ``end2`` and ``start1`` as positive
    terms and ``end1`` as the negative term, then takes the top-ranked word.

    Parameters
    ----------
    start1, end1 : str
        The known pair of the analogy.
    end2 : str
        The known half of the second pair.
    model : optional
        Object providing ``most_similar_cosmul(positive=..., negative=...)``.
        Defaults to the notebook-level ``thrones2vec`` model.

    Returns
    -------
    str
        The best-scoring word, which is also printed inside a sentence.
    """
    if model is None:
        model = thrones2vec

    similarities = model.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
    )
    # Each entry is indexed as (word, score); keep the word of the best one.
    start2 = similarities[0][0]

    # Explicit keyword arguments instead of **locals(): the message no longer
    # depends on which local variable names happen to exist.
    print(
        "{start1} is related to {end1}, as {start2} is related to {end2}".format(
            start1=start1, end1=end1, start2=start2, end2=end2
        )
    )
    return start2
In [30]:
nearest_similarity_cosmul("woman","man","king")
Out[30]:
In [49]:
nearest_similarity_cosmul("Jaime","Lannister","Stark")
Out[49]:
In [46]:
thrones2vec.most_similar("Jaime")
Out[46]:
In [70]:
# Analogy query: 'Ghost' and 'Arya' are the positive terms and 'Jon' the negative one,
# i.e. "Jon is to Ghost as Arya is to ?".
thrones2vec.most_similar(positive=['Ghost', 'Arya'], negative=['Jon'])
Out[70]:
In [6]:
# Project every word vector with bhtsne's tsne(); the `points` cell below reads columns 0
# and 1 of Y as the x/y position of each word. The matrix is cast to float64 for the call.
# NOTE(review): no random seed is passed here, so the layout — and with it the hard-coded
# bounds given to plot_region() further down — may change between runs; confirm against
# the bhtsne API whether a seed argument is available.
Y = tsne(all_word_vecs.astype('float64'))
In [112]:
# One row per vocabulary word: the word followed by the first two t-SNE
# coordinates stored at that word's vocabulary index in Y.
points = pd.DataFrame(
    [
        (word,) + tuple(Y[thrones2vec.vocab[word].index][:2])
        for word in thrones2vec.vocab
    ],
    columns=["word", "x", "y"],
)
In [113]:
points.head(10)
Out[113]:
In [114]:
# Switch seaborn to its "poster" plotting context for the figures drawn below.
sns.set_context("poster")
In [115]:
%pylab inline
In [116]:
points.plot.scatter("x", "y", s=10, figsize=(20, 12))
Out[116]:
In [117]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words whose point lies inside a rectangle.

    Reads the notebook-level ``points`` frame (columns ``word``, ``x``, ``y``).

    Parameters
    ----------
    x_bounds : sequence
        ``(low, high)`` limits on ``x``; both ends are inclusive.
    y_bounds : sequence
        ``(low, high)`` limits on ``y``; both ends are inclusive.

    Prints the selected words as one comma-separated line. Returns nothing.
    """
    inside_x = points.x.between(x_bounds[0], x_bounds[1])
    inside_y = points.y.between(y_bounds[0], y_bounds[1])
    region = points[inside_x & inside_y]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))

    labels = []
    for row in region.itertuples(index=False):
        labels.append(row.word)
        # Nudge the label slightly off the marker so both stay readable.
        ax.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)

    print(", ".join(labels))
In [118]:
plot_region(x_bounds=(-8.0,-6.0), y_bounds=(-29.0, -26.0))
In [119]:
points.loc[points["word"]=="Jaime",:]
Out[119]:
In [120]:
plot_region(x_bounds=(28,34), y_bounds=(-5.0,-2.0))
In [87]:
def coords(word):
    """Return the ``(x, y)`` position of ``word`` from the ``points`` frame.

    Takes the first row of ``points`` whose ``word`` column equals ``word``
    and returns its second and third values. Raises ``IndexError`` when no
    row matches.
    """
    matches = points[points["word"] == word]
    first_row = matches.values[0]
    return first_row[1], first_row[2]
In [80]:
coords("Jon")
Out[80]:
In [82]:
def plot_close_to(word, radius=1.0):
    """Plot the labelled neighbourhood of ``word`` in the 2-D map.

    Looks up the word's position with ``coords`` and draws, via
    ``plot_region``, the square that extends ``radius`` in every direction
    around it.

    Parameters
    ----------
    word : str
        Word to centre the plot on.
    radius : float, default 1.0
        Half-width of the plotted square, in map units. The default matches
        the previously hard-coded window.
    """
    x, y = coords(word)
    plot_region(
        x_bounds=(x - radius, x + radius),
        y_bounds=(y - radius, y + radius),
    )
In [121]:
plot_close_to("apples")
In [106]:
plot_close_to("Winterfell")
In [108]:
plot_close_to("Payne")
In [122]:
# Print the map position of each of the four words used in the king/queen analogy.
for i in ("king", "queen", "man", "woman"):
    print(coords(i))
In [124]:
plot_close_to("Needle")
In [ ]: