In [1]:
from random import random
import math
import numpy as np
import copy
In [2]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles (u.item is latin-1 encoded)
    movies = {}
    rev_movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        idx, title = line.split('|')[0:2]
        movies[idx] = title
        rev_movies[title] = idx
    # Load ratings
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs, rev_movies
In [3]:
data,movies = loadMovieLens("data/ml-100k")
Content example
In [4]:
data['3']
Out[4]:
In [5]:
def getRawArray(data):
    # flatten the nested dict into an array of (user, movie, rating) rows
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u, i, data[u][i]])
    return np.array(d)
In [6]:
# splitting while avoiding reducing the dataset too much:
# every user keeps ratings in both train and test
def split_train_test(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [7]:
def split_train_test_by_movies(data, percent_test):
    # same split, but keyed by movie instead of by user
    test = {}
    train = {}
    for u in data.keys():
        for movie in data[u]:
            if random() < percent_test:
                test.setdefault(movie, {})
                test[movie][u] = data[u][movie]
            else:
                train.setdefault(movie, {})
                train[movie][u] = data[u][movie]
    return train, test
In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
Split by movie, used for convenience by the average-by-movie baseline.
In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
Cleaning: drop test keys (users or movies) that never appear in train.
In [10]:
def deleteUnseenInTest(train, test):
    # iterate over a copy of the keys: we mutate test while looping
    for k in list(test.keys()):
        if k not in train:
            test.pop(k, None)
In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
Arrays used for fast evaluation.
In [12]:
evalArrayAll = getRawArray(data)
evalArrayTest = getRawArray(test)
In [13]:
evalArrayTest[:10,:10]
Out[13]:
In [14]:
class baselineMeanUser:
    def __init__(self):
        self.users = {}
    def fit(self, train):
        # average rating given by each user
        for user in train.keys():
            note = 0.0
            for movie in train[user].keys():
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note
    def predict(self, users):
        return [self.users[u] for u in users]
In [15]:
baseline_mu = baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(evalArrayTest[:, 0])
mse = ((np.array(pred) - np.array(evalArrayTest[:, 2], float)) ** 2).mean()
print("Mean Squared Error %0.6f" % mse)
In [16]:
class baselineMeanMovie:
    def __init__(self):
        self.movies = {}
    def fit(self, train):
        # average rating received by each movie
        for movie in train.keys():
            note = 0.0
            for user in train[movie].keys():
                note += train[movie][user]
            note = note / len(train[movie])
            self.movies[movie] = note
    def predict(self, movies):
        res = []
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                res.append(3)  # default rating for movies unseen in train
        return res
In [17]:
baseline_mm = baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(evalArrayTest[:, 1])
mse = ((np.array(pred) - np.array(evalArrayTest[:, 2], float)) ** 2).mean()
print("Mean Squared Error %0.6f" % mse)
A raw dense matrix is used for convenience and clarity.
Structures like scipy sparse matrices or Python dictionaries could be used for a speedup.
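For instance, a minimal sketch (mine, not used in the rest of this notebook) of the same ratings matrix as a scipy sparse structure:
from scipy.sparse import csr_matrix

rows, cols, vals = [], [], []
for u in data:
    for m in data[u]:
        rows.append(int(u) - 1)          # 0-based user index
        cols.append(int(movies[m]) - 1)  # 0-based movie index
        vals.append(data[u][m])
sparseMatrix = csr_matrix((vals, (rows, cols)), shape=(len(data), 1682))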
Complete dataset
In [18]:
rawMatrix = np.zeros((len(data.keys()), 1682))
for u in data:
    for m in data[u]:
        rawMatrix[int(u) - 1][int(movies[m]) - 1] = data[u][m]
In [19]:
print(np.shape(rawMatrix))
rawMatrix[:10,:10]
Out[19]:
Train and test datasets
In [20]:
rawMatrixTrain = np.zeros((len(data.keys()), 1682))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u) - 1][int(movies[m]) - 1] = train[u][m]
rawMatrixTest = np.zeros((len(data.keys()), 1682))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u) - 1][int(movies[m]) - 1] = test[u][m]
Fast implementation using numpy's matrix operations. Unfortunately, I could not avoid the overfitting that starts after a few iterations.
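For reference, the updates below are, as far as I can tell, the standard Lee-Seung multiplicative rules restricted to the observed entries (mask = np.sign(X), * elementwise):

A *= np.dot(mask * X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + eps)
Y *= np.dot(A.T, mask * X) / (np.dot(A.T, mask * np.dot(A, Y)) + eps)

Both factors stay nonnegative as long as they start nonnegative, which is why A and Y are initialized uniformly in [0, 1).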
In [21]:
#from scipy import linalg
def nmf(X, latent_features, max_iter=100, eps=1e-5, printevery=100):
    print("NMF with %d latent features, %d iterations." % (latent_features, max_iter))
    # mask used to ignore null elements (coded by zero)
    mask = np.sign(X)
    # randomly initialized factor matrices
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvements:
    #Y = linalg.lstsq(A, X)[0]  # initializing that way as recommended in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too low values
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)   # globals, used for the prints below
    masktrain = np.sign(rawMatrixTrain)
    for i in range(1, max_iter + 1):
        # multiplicative updates restricted to observed entries
        top = np.dot(masked_X, Y.T)
        bottom = np.dot(mask * np.dot(A, Y), Y.T) + eps
        A *= top / bottom
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom
        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err train %.05f" % (
                i, (q * q).sum() / masktest.sum(),
                (q_train * q_train).sum() / masktrain.sum()))
    return A, Y
In [22]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=5,printevery=1)
resMatrix = A.dot(Y)
We see that it quickly gets better than the baseline.
However, as shown below, it starts to overfit after that:
In [25]:
%%time
A,Y = nmf(rawMatrixTrain,10,eps = 1e-5,max_iter=500,printevery=100)
resMatrix = A.dot(Y)
This is due to the high sparsity of the matrix.
We can of course reduce the size of the factor matrices to avoid overfitting, but that will limit further improvements.
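One cheap mitigation (a sketch of mine, not part of the original runs) is early stopping: monitor the held-out error that nmf already prints and keep the best factors seen. This relies on the same rawMatrixTest global as nmf; the names nmf_early_stop, checkevery and patience are mine.
def nmf_early_stop(X, latent_features, max_iter=500, eps=1e-5,
                   checkevery=10, patience=3):
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)  # same global as in nmf above
    best_err, best, bad = np.inf, None, 0
    for i in range(1, max_iter + 1):
        A *= np.dot(masked_X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + eps)
        Y *= np.dot(A.T, masked_X) / (np.dot(A.T, mask * np.dot(A, Y)) + eps)
        if i % checkevery == 0:
            q = masktest * np.dot(A, Y) - rawMatrixTest
            err = (q * q).sum() / masktest.sum()
            if err < best_err:
                best_err, best, bad = err, (A.copy(), Y.copy()), 0
            else:
                bad += 1  # stop after `patience` checks without improvement
                if bad >= patience:
                    break
    return best if best is not None else (A, Y)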
In [24]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=20)
resMatrix = A.dot(Y)
Despite good results in a few seconds on this dataset, this can only get us so far.
We then have to add regularization to the cost function.
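A sketch of one common way to do it (my assumption of what adding regularization would look like here, not this notebook's final code): for the penalized cost ||mask * (X - AY)||^2 + lamb * (||A||^2 + ||Y||^2), the gradient of the penalty simply adds lamb * A (resp. lamb * Y) to each denominator of the multiplicative updates. The names nmf_l2 and lamb are mine.
def nmf_l2(X, latent_features, lamb=0.1, max_iter=100, eps=1e-5):
    # same masked multiplicative updates as nmf, with an L2 term
    # folded into the denominators
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    masked_X = mask * X
    for i in range(max_iter):
        A *= np.dot(masked_X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + lamb * A + eps)
        Y *= np.dot(A.T, masked_X) / (np.dot(A.T, mask * np.dot(A, Y)) + lamb * Y + eps)
    return A, Y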
In [51]:
class evalMF:
    def __init__(self, resMatrix, dicU, dicI):
        self.resMatrix = resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
    def predict(self, user, movie):
        # matrix rows/columns are 0-based while MovieLens ids are 1-based
        return self.resMatrix[int(user) - 1][int(self.dicI[movie]) - 1]
In [37]:
mf= evalMF(resMatrix,data,movies)
In [52]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in evalArrayTest]).mean()
Out[52]:
We usually see a significant difference between users, so we need to take user bias into account.
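A common way to handle this (a sketch under my own naming, demean_by_user; the experiment further down instead adds diff_u to the observed entries) is to subtract each user's mean rating before factorizing and add it back at prediction time. Note that the centered matrix has negative entries, so it pairs with the unconstrained SGD factorization later in this notebook rather than with the nonnegative multiplicative updates.
def demean_by_user(X):
    mask = np.sign(X)
    counts = mask.sum(1)
    counts[counts == 0] = 1               # guard against empty rows
    user_mean = X.sum(1) / counts
    # subtract the mean only on observed cells, keep zeros elsewhere
    Xc = X - mask * user_mean[:, None]
    return Xc, user_mean

# predictions are then np.dot(A, Y)[u, m] + user_mean[u]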
In [53]:
summ = 0
for i in data["1"]:
    summ += (float(data["1"][i]) - mf.predict("1", i)) ** 2
summ / len(data["1"])
Out[53]:
In [54]:
summ = 0
for i in data["3"]:
    summ += (float(data["3"][i]) - mf.predict("3", i)) ** 2
summ / len(data["3"])
Out[54]:
In [470]:
# second attempt: same NMF, with a crude user-bias correction applied
# to the observed entries before factorizing
def nmf(X, latent_features, max_iter=100, eps=1e-5, printevery=100):
    print("NMF with %d latent features, %d iterations." % (latent_features, max_iter))
    #lamb = 0.2
    # per-user average over observed entries, and its deviation from
    # the global mean
    avg_u = X.sum(1) / (X != 0).sum(1)
    #avg_m = X.sum(0) / (X != 0).sum(0)
    #diff_m = avg_m - avg_m.mean()
    diff_u = avg_u - avg_u.mean()
    print(avg_u.mean())
    for idxi, i in enumerate(X):
        for idxj, j in enumerate(i):
            if X[idxi, idxj] != 0:
                X[idxi, idxj] += diff_u[idxi]
    # mask used to ignore null elements (coded by zero)
    mask = np.sign(X)
    # randomly initialized factor matrices
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvements:
    #Y = linalg.lstsq(A, X)[0]  # initializing that way as recommended in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too low values
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)   # globals, used for the prints below
    masktrain = np.sign(rawMatrixTrain)
    prev_Y = Y  # kept for the disabled thresholding experiment below
    for i in range(1, max_iter + 1):
        top = np.dot(masked_X, Y.T)
        bottom = np.dot(mask * np.dot(A, Y), Y.T) + eps
        A *= top / bottom
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        tb = top / bottom
        no = np.linalg.norm(tb)
        Y *= tb
        #Y *= (0.9 * tb) + (0.1 * (tb + (1 / no)))
        """
        if i % 10 == 0:
            diff = np.abs(Y - prev_Y)
            diff = diff - 0.1
            Y = np.sign(diff) * Y
            prev_Y = Y
        """
        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err train %.05f" % (
                i, (q * q).sum() / masktest.sum(),
                (q_train * q_train).sum() / masktrain.sum()))
    return A, Y
In [450]:
mat = np.array([[1, 2, 3],
                [4, 5, 6]])
print(np.array([1, 5, 10]) - mat)
print((np.array([1, 5]) - mat.T).T)
In [454]:
X = copy.deepcopy(rawMatrix)
#avg_m = X.sum(0) / (X != 0).sum(0)
avg_u = X.sum(1) / (X != 0).sum(1)
#diff_m = avg_m - avg_m.mean()
diff_u = avg_u - avg_u.mean()
print(avg_u.mean())
for idxi, i in enumerate(X):
    for idxj, j in enumerate(i):
        if X[idxi, idxj] != 0:
            X[idxi, idxj] += diff_u[idxi]
In [469]:
%%time
X = copy.deepcopy(rawMatrixTrain)
A,Y = nmf(X,1,eps = 1e-5,max_iter=100,printevery=10)
resMatrix = A.dot(Y)
In [355]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=10)
resMatrix = A.dot(Y)
In [321]:
with np.errstate(all='ignore'):
    avg_m = rawMatrix.sum(0) / (rawMatrix != 0).sum(0)
    avg_u = rawMatrix.sum(1) / (rawMatrix != 0).sum(1)
In [144]:
np.shape(t)
Out[144]:
In [124]:
tt = t[0]
In [125]:
tt
Out[125]:
In [226]:
resMatrix = A.dot(Y)
In [227]:
a = np.array((1, 2, 4))
b = np.array((1, 3, 6))
(a - b).dot(a - b)
masqueTest = np.sign(rawMatrixTest)
masqueTest[:10, :10]
A = masqueTest * rawMatrix
In [228]:
aa = masqueTest*resMatrix
In [229]:
# clip predicted ratings above the 5-star maximum
for idxi, i in enumerate(aa):
    for idxj, j in enumerate(i):
        if j > 5:
            aa[idxi][idxj] = 5
In [235]:
q = masqueTest*resMatrix - rawMatrixTest
In [236]:
(q*q).sum()/ masqueTest.sum()
Out[236]:
In [33]:
masqueTest=np.sign(rawMatrixTest)
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
Out[33]:
In [59]:
mf = evalMF(resMatrix,data,movies)
In [69]:
print(data["1"]["Akira (1988)"])
print(mf.predict("1", "Akira (1988)"))
print(data["1"]["All Dogs Go to Heaven 2 (1996)"])
print(mf.predict("1", "All Dogs Go to Heaven 2 (1996)"))
In [61]:
print(train["1"]["All Dogs Go to Heaven 2 (1996)"])
print(test["1"]["Akira (1988)"])
In [80]:
len(rawMatrixTest)
Out[80]:
In [78]:
t = []
c = 10
for idxi, i in enumerate(rawMatrixTest):
    for idxj, j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append((resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj])) ** 2)
            if c > 0:
                print(rawMatrixTest[idxi][idxj], resMatrix[idxi][idxj])
                c -= 1
np.array(t).mean()
Out[78]:
In [87]:
t = []
c = 10
for idxi, i in enumerate(resMatrix):
    for idxj, j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append((resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj])) ** 2)
            if c > 0:
                print(rawMatrix[idxi][idxj], resMatrix[idxi][idxj])
                c -= 1
np.array(t).mean()
Out[87]:
In [108]:
t = []
c = 3
for idxi, i in enumerate(rawMatrixTrain):
    for idxj, j in enumerate(i):
        if rawMatrixTrain[idxi][idxj] != 0:
            t.append((float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj]) ** 2)
            if c > 0:
                print(rawMatrixTrain[idxi][idxj], resMatrix[idxi][idxj])
                c -= 1
np.array(t).mean()
Out[108]:
In [80]:
np.array([(float(ra[2]) - mf.predict(ra[0], ra[1])) ** 2 for ra in evalArrayTest]).mean()
In [60]:
R = [
    [5, 3, 5, 3],
    [4, 0, 0, 1],
    [1, 5, 1, 5],
    [1, 0, 1, 4],
    [0, 4, 5, 4],
]
R = np.array(R)
K = 10
np.random.rand
Out[60]:
In [87]:
%%time
nP, nQ = matrix_factorization(R, K, steps=1000)
In [88]:
nR = np.dot(nP, nQ.T)
In [89]:
((nR-R)**2).sum()/np.sign(R).sum()
Out[89]:
In [59]:
R = rawMatrixTrain
In [74]:
from scipy import linalg

def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02, decay=0.95):
    # eps doubles as the SGD learning rate; beta is the L2 penalty
    N, M = np.shape(R)
    P = np.random.rand(N, K)
    #P = np.maximum(P, eps)
    #Q = np.random.rand(M, K).T
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)
    masktest = np.sign(rawMatrixTest)   # globals, used for the prints below
    masktrain = np.sign(rawMatrixTrain)
    for step in range(1, steps + 1):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i, :], Q[:, j])
                    P[i] = P[i] + eps * (2 * eij * Q.T[j] - beta * P[i])
                    Q.T[j] = Q.T[j] + eps * (2 * eij * P[i] - beta * Q.T[j])
        """
        disabled alternative: masked multiplicative updates
        top = np.dot(masked_R, Q.T)
        bottom = np.dot(mask * np.dot(P, Q), Q.T) + eps
        P *= top / bottom
        P = np.maximum(P, eps)
        top = np.dot(P.T, masked_R)
        bottom = np.dot(P.T, mask * np.dot(P, Q)) + eps
        Q *= top / bottom
        Q = np.maximum(Q, eps)
        """
        if step % 5 == 0:
            eps = eps * decay  # decay the learning rate every 5 steps
        if step % 10 == 0 or step == 1 or step == steps:
            X_est = np.dot(P, Q)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err on train %.05f" % (
                step, (q * q).sum() / masktest.sum(),
                (q_train * q_train).sum() / masktrain.sum()))
    return P, Q.T
In [67]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=20,eps=1e-3)
In [70]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=100,eps=1e-5)
In [72]:
nR = np.dot(nP, nQ.T)
((nR-R)**2).sum()/np.sign(R).sum()
Out[72]:
In [79]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=50,eps=1e-2)
In [ ]:
%%time
# scratch: a single SGD pass, relying on R, K, eps, beta and linalg
# from earlier cells
N, M = np.shape(R)
P = np.random.rand(N, K)
#P = np.maximum(P, eps)
#Q = np.random.rand(M, K).T
Q = linalg.lstsq(P, R)[0]
Q = np.maximum(Q, eps)
for i in range(len(R)):
    for j in range(len(R[i])):
        if R[i][j] > 0:
            eij = R[i][j] - np.dot(P[i, :], Q[:, j])
            for k in range(K):
                P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])
In [161]:
# GR is defined in the next cell
for _ in range(1, 5):
    nP, nQ = matrix_factorization(R, K, steps=1000, eps=1e-3)
    nR = np.dot(nP, nQ.T)
    print(((nR - GR) ** 2).sum() / np.sign(GR).sum())
In [141]:
GR = [
    [0, 0, 0, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 0],
    [0, 4, 0, 0],
    [4, 0, 0, 0],
]
In [139]:
R
Out[139]:
In [149]:
nR
Out[149]:
In [55]:
R[1,1]
Out[55]:
In [98]:
from scipy import linalg
In [100]:
rows, columns = R.shape
A = np.random.rand(rows, 2)
In [ ]:
# Reference snippet (apparently the blog post mentioned earlier) that the
# nmf function above was adapted from, wrapped in a function here so the
# cell is self-contained; the name nmf_reference and its parameters are mine.
def nmf_reference(X, latent_features, max_iter=1000, eps=1e-5,
                  error_limit=1e-6, fit_error_limit=1e-6):
    mask = np.sign(X)
    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)
    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)
    masked_X = mask * X
    X_est_prev = np.dot(A, Y)
    for i in range(1, max_iter + 1):
        # updates
        top = np.dot(masked_X, Y.T)
        bottom = np.dot(mask * np.dot(A, Y), Y.T) + eps
        A *= top / bottom
        A = np.maximum(A, eps)
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # evaluation
        if i % 200 == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est
            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print('Iteration {}: fit residual {}, total residual {}'.format(
                i, np.round(fit_residual, 4), np.round(curRes, 4)))
            if curRes < error_limit or fit_residual < fit_error_limit:
                break
    return A, Y
In [190]:
%%time
R = rawMatrixTrain
nP, nQ = matrix_factorization(R, 10, steps=40,eps=1e-3)
nR = np.dot(nP, nQ.T)
In [191]:
masqueTest = np.sign(rawMatrixTest)
aa = masqueTest * rawMatrix
"""
for idxi, i in enumerate(aa):
    for idxj, j in enumerate(i):
        if j > 5:
            aa[idxi][idxj] = 5
"""
q = masqueTest * nR - rawMatrixTest
(q * q).sum() / masqueTest.sum()
Out[191]:
In [30]:
nR[:5,:5]
Out[30]:
In [31]:
rawMatrix[:5,:5]
Out[31]:
In [32]:
mf = evalMF(nR, data, movies)
mf.predict("1", "Akira (1988)")
In [47]:
np.array([(float(ra[2]) - mf.predict(ra[0], ra[1])) ** 2 for ra in evalArrayTest]).mean()
Out[47]:
In [ ]: