Matrix Factorization with Gradient Descent
$X$ : Observed Matrix
$M$ : Mask Matrix (1 if observed, 0 otherwise)
\begin{eqnarray} E(W, H) = \frac{1}{2} \operatorname{Tr}\left[(X - WH)^\top (X - WH)\right] \end{eqnarray}
With missing values, only the observed entries contribute to the error:
\begin{eqnarray} E(W, H) = \frac{1}{2} \operatorname{Tr}\left[(M\odot (X - WH))^\top (M\odot (X - WH))\right] \end{eqnarray}
Partial derivatives follow from the chain rule; since $M \odot M = M$, the extra mask factor is absorbed into the masked residual:
\begin{eqnarray} \frac{\partial E(W, H)}{\partial W} = -(M\odot(X - WH))H^\top \end{eqnarray}
\begin{eqnarray} \frac{\partial E(W, H)}{\partial H} = -W^\top(M\odot(X - WH)) \end{eqnarray}
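As a sanity check, the analytic gradients can be compared against finite differences on a tiny random problem. This is a minimal sketch; `masked_error` and `num_grad` are helper names introduced here for illustration, and all variables are local to this check:
In [ ]:
import numpy as np

def masked_error(X, Mask, W, H):
    # Masked squared error, matching the objective above
    R = Mask*(X - W.dot(H))
    return 0.5*np.sum(R**2)

def num_grad(f, A, eps=1e-6):
    # Central differences, perturbing one entry of A at a time
    G = np.zeros_like(A)
    for idx in np.ndindex(A.shape):
        A[idx] += eps; fp = f()
        A[idx] -= 2*eps; fm = f()
        A[idx] += eps
        G[idx] = (fp - fm)/(2*eps)
    return G

Xc = np.random.randn(4, 5)
Maskc = (np.random.rand(4, 5) < 0.6)
Wc = np.random.randn(4, 2)
Hc = np.random.randn(2, 5)
dW = -(Maskc*(Xc - Wc.dot(Hc))).dot(Hc.T)
dH = -Wc.T.dot(Maskc*(Xc - Wc.dot(Hc)))
print(np.allclose(dW, num_grad(lambda: masked_error(Xc, Maskc, Wc, Hc), Wc)))
print(np.allclose(dH, num_grad(lambda: masked_error(Xc, Maskc, Wc, Hc), Hc)))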
In [38]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

M = 5   # number of rows
N = 6   # number of columns
K = 2   # latent rank (K < min(M, N))

# Generate a rank-K ground truth and add a little observation noise
W_true = np.random.randn(M, K)
H_true = np.random.randn(K, N)
X = W_true.dot(H_true)
X = X + 0.05*np.random.randn(M, N)

# Observe each entry independently with probability p_on;
# Mask plays the role of the mask matrix $M$ in the equations above
p_on = 0.6
Mask = (np.random.rand(M, N) < p_on)

# Random initialization of the factors
W = np.random.randn(M, K)
H = np.random.randn(K, N)

EPOCH = 2000
eta = 0.05  # learning rate
for i in range(EPOCH):
    # Alternate gradient steps on W and H, using only the observed entries
    dW = -(Mask*(X - W.dot(H))).dot(H.T)
    W = W - eta*dW
    dH = -W.T.dot(Mask*(X - W.dot(H)))
    H = H - eta*dH
    if i % 100 == 0:
        print(0.5*np.sum((Mask*(X - W.dot(H)))**2))
plt.imshow(Mask, interpolation='nearest',cmap=plt.cm.gray_r)
plt.title('Mask')
plt.show()
MX = X.copy()
MX[Mask == 0] = np.nan   # NaN entries render as blanks, showing which data is missing
plt.imshow(MX, interpolation='nearest')
plt.title('Observed Data')
plt.show()
plt.imshow(W.dot(H), interpolation='nearest')
plt.title('Approximation')
plt.show()
plt.imshow(X, interpolation='nearest')
plt.title('True')
plt.show()
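Because the ground-truth $X$ is available in this synthetic experiment, the reconstruction can also be scored on the entries that were never observed, which is the quantity matrix completion actually cares about. A minimal sketch, reusing X, Mask, W, and H from the cell above:
In [ ]:
# RMSE on observed vs. held-out (unobserved) entries;
# Mask is boolean, so it can index the residual directly
E_all = X - W.dot(H)
rmse_obs = np.sqrt(np.mean(E_all[Mask]**2))
rmse_held = np.sqrt(np.mean(E_all[~Mask]**2))
print('observed RMSE: %.4f' % rmse_obs)
print('held-out RMSE: %.4f' % rmse_held)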
In [66]:
import scipy.sparse as sp

# Export the observed entries as (row, col, value) triplets
m = sp.coo_matrix(Mask)
I, J = m.nonzero()
for i, j in zip(I, J):
    print('[%d,%d,%2.3f],' % (i, j, X[i, j]))
print('---')

# ...and the unobserved entries, for reference
m2 = sp.coo_matrix(1 - Mask)
I, J = m2.nonzero()
for i, j in zip(I, J):
    print('[%d,%d, %2.2f],' % (i, j, X[i, j]))
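The (row, column, value) triplet form is also the natural input for a stochastic variant of the same gradient descent: instead of forming the full masked residual, each observed entry updates one row of $W$ and one column of $H$. A minimal sketch under the same setup (it reuses X, Mask, M, N, K, and sp from above; W_sgd, H_sgd, the epoch count, and the step size are illustrative choices):
In [ ]:
I, J = sp.coo_matrix(Mask).nonzero()
W_sgd = np.random.randn(M, K)
H_sgd = np.random.randn(K, N)
eta = 0.05
for epoch in range(500):
    for i, j in zip(I, J):
        e = X[i, j] - W_sgd[i, :].dot(H_sgd[:, j])  # residual of one observed entry
        W_sgd[i, :] += eta*e*H_sgd[:, j]            # gradient step on row i of W
        H_sgd[:, j] += eta*e*W_sgd[i, :]            # gradient step on column j of H
print(0.5*np.sum((Mask*(X - W_sgd.dot(H_sgd)))**2))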