Wayne Nixalo - 12 June 2017
Practical Deep Learning I
Lesson 5: NLP
wordvectors.ipynb code along
In [1]:
%matplotlib inline
import os, sys
sys.path.insert(1, os.path.join('utils'))
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
In [5]:
# `path` is where the raw glove.<name>.txt files are read from; `res_path` is
# where the pre-processed arrays and pickles derived from them are written.
path = 'data/glove.6B/'
res_path = path + 'results/'
# os.makedirs also creates missing parents (e.g. 'data/'), which os.mkdir
# refuses to do. res_path lives under path, so one call covers both folders.
if not os.path.exists(res_path): os.makedirs(res_path)
This section shows how we processed the original GloVe text files. However, there's no need for you to do this, since we provide the pre-processed GloVe data. (Note: that link is broken, so I'm instead downloading glove.6B.zip from the GloVe project page, nlp.stanford.edu/projects/glove/, and processing it here.)
In [6]:
def get_glove(name):
    """Pre-process one raw GloVe text file and cache the pieces under `res_path`.

    Reads `path + 'glove.' + name + '.txt'`. Every line is split on whitespace:
    the first field is the token, the remaining fields are parsed as float32
    vector components. Three files are then written:

    * res_path + name + '.dat'        -- the stacked vectors, via `save_array`
    * res_path + name + '_words.pkl'  -- list of tokens in file order
    * res_path + name + '_idx.pkl'    -- dict mapping token -> row index
                                         (a repeated token keeps its last row)

    name: the part of the file name between 'glove.' and '.txt', e.g. '6B.50d'.
    Returns None; the results exist only on disk.
    """
    with open(path + 'glove.' + name + '.txt', 'r') as f:
        lines = [line.split() for line in f]
    words = [d[0] for d in lines]
    # np.stack needs a real sequence: passing a generator is deprecated since
    # NumPy 1.16 and rejected by later releases.
    vecs = np.stack([np.array(d[1:], dtype=np.float32) for d in lines])
    wordidx = {o: i for i, o in enumerate(words)}
    save_array(res_path + name + '.dat', vecs)
    # Context managers make sure the pickle files are flushed and closed.
    with open(res_path + name + '_words.pkl', 'wb') as f:
        pickle.dump(words, f)
    with open(res_path + name + '_idx.pkl', 'wb') as f:
        pickle.dump(wordidx, f)
In [7]:
# Pre-process the '6B.50d' file, which is the one this notebook loads below.
# Uncomment a line to pre-process another file as well; each call expects the
# matching glove.<name>.txt to be present under `path`.
get_glove('6B.50d')
# get_glove('6B.100d')
# get_glove('6B.200d')
# get_glove('6B.300d')
In [8]:
def load_glove(loc):
    """Load one set of pre-processed GloVe files.

    loc: path prefix shared by the three files, e.g. res_path + '6B.50d'.

    Returns a 3-tuple, in this order:
      * whatever `load_array` returns for loc + '.dat'
      * the object unpickled from loc + '_words.pkl'
      * the object unpickled from loc + '_idx.pkl'
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling --
    # only point this at files you generated yourself.
    vecs = load_array(loc + '.dat')
    # `with` closes each file handle instead of leaking it until GC.
    with open(loc + '_words.pkl', 'rb') as f:
        words = pickle.load(f)
    with open(loc + '_idx.pkl', 'rb') as f:
        wordidx = pickle.load(f)
    return (vecs, words, wordidx)
In [9]:
vecs, words, wordidx = load_glove(res_path + '6B.50d')
# The index used to be bound only under the misspelled name `wirdidx`; keep
# that name as an alias so cells that still reference it continue to run.
wirdidx = wordidx
vecs.shape
Out[9]:
Here are the first 25 "words" in GloVe.
In [10]:
# Preview: the first 25 entries of `words`, joined with single spaces.
' '.join(words[:25])
Out[10]:
This is how you can look up a word vector.
In [11]:
def w2v(w):
    """Return the entry of `vecs` for the word `w`.

    The position is looked up in `wirdidx` -- a misspelling of 'wordidx' that
    is kept here because it is the name the index was bound to when loaded.
    Presumably `wirdidx` is a plain dict, in which case an unknown word raises
    KeyError.
    """
    row = wirdidx[w]
    return vecs[row]
In [12]:
# Example lookup: display what `w2v` returns for the token 'of'.
w2v('of')
Out[12]:
Just for fun, let's take a look at a 2D projection of the first 350 words, using t-SNE. (The embedding is fit on the first 500 vectors; only the first 350 are plotted.)
In [13]:
# Python 2 only: make 'utf8' the interpreter-wide default encoding.
# `sys.setdefaultencoding` is deleted at interpreter start-up, so `sys` has to
# be reloaded to get it back. On Python 3 neither `reload` nor
# `setdefaultencoding` exists and the default is already UTF-8, so skip it.
if sys.version_info[0] < 3:
    # reload(sys) rebinds the standard streams to the process defaults, which
    # can disconnect a notebook's cell output -- save and restore them.
    _std_streams = (sys.stdin, sys.stdout, sys.stderr)
    reload(sys)
    sys.setdefaultencoding('utf8')
    sys.stdin, sys.stdout, sys.stderr = _std_streams
In [14]:
# Project the first 500 vectors down to two components; random_state is fixed
# so the TSNE call is seeded the same way on every run.
tsne = TSNE(n_components=2, random_state=0)
Y = tsne.fit_transform(vecs[:500])

# Only rows [start, end) of the projection are drawn.
start = 0
end = 350
dat = Y[start:end]

plt.figure(figsize=(15, 15))
# The two columns of `dat` are the x and y coordinates of each point.
plt.scatter(*dat.T)
# Write each word at its point, in a random colour; scaling the RGB triple by
# 0.7 keeps every channel below 0.7.
for label, (x, y) in zip(words[start:end], dat):
    plt.text(x, y, label, fontsize=14, color=np.random.rand(3) * 0.7)
plt.show()
In [ ]: