In [2]:
import numpy as np

# One-hot encodings: each word is a vector with a single 1 at its vocabulary index.
a = np.array([0,0,0,0,0,0,0,0,0,1,0])  # we might represent 'hotel' as this
b = np.array([0,0,0,0,0,0,0,1,0,0,0])  # we might represent 'motel' as this

# The dot product of two distinct one-hot vectors is always 0, so this
# representation encodes no similarity between words.
# Bug fix: parenthesized print() works in both Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError in Python 3.
print(np.dot(a, b))
The problem here is that by representing words with a one-hot encoding, we lose a lot of information about each word's context and its relationships with other words. Instead, we can get a lot of value by representing a word by means of its neighbors.
The cat purrs when she eats. The cat meows when she is hungry.
Options for the co-occurrence context: the full document vs. a fixed-size window.
In [30]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
''' Example Corpus:
1. I like deep learning.
2. I like NLP.
3. I enjoy flying.
'''
terms_only = ["I", "like", "enjoy", "deep",
"learning", "NLP", "flying", "."]
# for a window of 1, let's look at the prev one and next one word insert how
# many times you see that word surrounding the current i,j word in the matrix X:
X = np.array([[0,2,1,0,0,0,0,0],
[2,0,0,1,0,1,0,0],
[1,0,0,0,0,0,1,0],
[0,1,0,0,1,0,0,0],
[0,0,0,1,0,0,0,1],
[0,1,0,0,0,0,0,1],
[0,0,1,0,0,0,0,1],
[0,0,0,0,1,1,1,0]])
U, s, Vh = np.linalg.svd(X, full_matrices=False)
print U
for i in xrange(len(words)):
print U[i,0], U[i,0], words[i]
plt.text(U[i,0], U[i,1], words[i])
v = [-1,0.1,-1,1]
plt.axis(v)
Out[30]:
In [ ]: