In [66]:
from random import random
import math
import numpy as np
import copy
In [67]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles (u.item is ISO-8859-1 encoded in the ml-100k release)
    movies = {}
    rev_movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        idx, title = line.split('|')[0:2]
        movies[idx] = title
        rev_movies[title] = idx
    # Load ratings
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs, rev_movies
In [68]:
data,movies = loadMovieLens("data/ml-100k")
Content example
In [69]:
data['3']
Out[69]:
In [70]:
def getRawArray(data):
    # flatten the nested dict into an array of (user, movie, rating) rows
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u, i, data[u][i]])
    return np.array(d)
In [71]:
# split per user, to avoid reducing any single user's ratings too much
def split_train_test(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [72]:
def split_train_test_by_movies(data, percent_test):
    # same split, but keyed by movie instead of by user
    test = {}
    train = {}
    for u in data.keys():
        for movie in data[u]:
            if random() < percent_test:
                test.setdefault(movie, {})
                test[movie][u] = data[u][movie]
            else:
                train.setdefault(movie, {})
                train[movie][u] = data[u][movie]
    return train, test
In [73]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
This split is used for convenience by the average-by-movie baseline.
In [74]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
Cleaning
In [75]:
def deleteUnseenInTest(train, test):
    # drop test keys that never appear in train
    # (iterate over a copy: popping while iterating a dict raises in Python 3)
    for k in list(test.keys()):
        if k not in train:
            test.pop(k, None)
In [76]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
Arrays used for fast evaluation
In [77]:
evalArrayAll = getRawArray(data)
evalArrayTest = getRawArray(test)
In [78]:
evalArrayTest[:10,:10]
Out[78]:
In [79]:
class baselineMeanUser:
    def __init__(self):
        self.users = {}

    def fit(self, train):
        # average rating per user
        for user in train.keys():
            note = 0.0
            for movie in train[user].keys():
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note

    def predict(self, users):
        return [self.users[u] for u in users]
In [80]:
baseline_mu = baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(evalArrayTest[:, 0])
mse = ((np.array(pred) - np.array(evalArrayTest[:, 2], float)) ** 2).mean()
print("Mean squared error %0.6f" % mse)
In [81]:
class baselineMeanMovie:
    def __init__(self):
        self.movies = {}

    def fit(self, train):
        # average rating per movie
        for movie in train.keys():
            note = 0.0
            for user in train[movie].keys():
                note += train[movie][user]
            note = note / len(train[movie])
            self.movies[movie] = note

    def predict(self, movies):
        # default to 3 (mid-scale) for movies unseen at train time
        return [self.movies.get(m, 3) for m in movies]
In [82]:
baseline_mm = baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(evalArrayTest[:, 1])
mse = ((np.array(pred) - np.array(evalArrayTest[:, 2], float)) ** 2).mean()
print("Mean squared error %0.6f" % mse)
A raw dense matrix is used for convenience and clarity.
Structures like a scipy sparse matrix or Python dictionaries could be used for speedup, as sketched just below.
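For instance, here is a minimal sketch of packing the same ratings into a scipy CSR matrix; it assumes scipy is available (it is not imported elsewhere in this notebook) and reuses the data and movies dictionaries from above:

from scipy.sparse import csr_matrix

rows, cols, vals = [], [], []
for u in data:
    for m in data[u]:
        rows.append(int(u) - 1)          # user ids are 1-based strings
        cols.append(int(movies[m]) - 1)  # movies maps a title to its 1-based id
        vals.append(data[u][m])
sparseRatings = csr_matrix((vals, (rows, cols)), shape=(len(data), 1682))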
Complete dataset
In [83]:
rawMatrix = np.zeros((len(data), 1682))
for u in data:
    for m in data[u]:
        rawMatrix[int(u) - 1][int(movies[m]) - 1] = data[u][m]
In [84]:
print(np.shape(rawMatrix))
rawMatrix[:10,:10]
Out[84]:
Train and test dataset
In [85]:
rawMatrixTrain = np.zeros((len(data), 1682))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u) - 1][int(movies[m]) - 1] = train[u][m]
rawMatrixTest = np.zeros((len(data), 1682))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u) - 1][int(movies[m]) - 1] = test[u][m]
In [38]:
#from scipy import linalg
def nmf(X, latent_features, max_iter=100, eps=1e-5, printevery=100):
    print("NMF with %d latent features, %d iterations." % (latent_features, max_iter))
    # mask used to ignore missing ratings (coded as zero)
    mask = np.sign(X)
    # randomly initialized factor matrices
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    # Not used, as I couldn't find significant improvements:
    #Y = linalg.lstsq(A, X)[0]  # initialization recommended in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too-low values
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)    # used for prints
    masktrain = np.sign(rawMatrixTrain)  # used for prints
    for i in range(1, max_iter + 1):
        # multiplicative update for A
        top = np.dot(masked_X, Y.T)
        bottom = np.dot(mask * np.dot(A, Y), Y.T) + eps
        A *= top / bottom
        # multiplicative update for Y
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom
        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err train %.05f" % (
                i, (q * q).sum() / masktest.sum(),
                (q_train * q_train).sum() / masktrain.sum()))
    return A, Y
In [86]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=5,printevery=1)
resMatrix = A.dot(Y)
We see that it quickly gets better than the baseline.
However, we see below that it overfits after that:
In [25]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=500,printevery=100)
resMatrix = A.dot(Y)
This is due to the high sparsity of the matrix.
We can of course reduce the number of latent features to avoid overfitting, but that will limit further improvements.
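As a quick sanity check (not part of the original run), the fill rate of the rating matrix can be measured directly; 100,000 ratings over 943 x 1682 cells is roughly 6% density:

print("density: %.4f" % np.sign(rawMatrix).mean())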
In [87]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=20)
resMatrix = A.dot(Y)
Despite good results in a few seconds on this dataset, this can only get us so far.
We then have to add regularization to the cost function.
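One common option, sketched here under the assumption of an L2 penalty lamb * (||A||^2 + ||Y||^2) added to the masked reconstruction error (lamb is a hypothetical hyperparameter, not one tuned in this notebook), is to fold the penalty into the denominators of the multiplicative updates:

def nmf_step_l2(A, Y, masked_X, mask, lamb=0.1, eps=1e-9):
    # one multiplicative update pair; the lamb*A and lamb*Y terms shrink
    # both factors toward zero, the L2-regularized variant of the
    # updates used in nmf() above
    A = A * (np.dot(masked_X, Y.T) /
             (np.dot(mask * np.dot(A, Y), Y.T) + lamb * A + eps))
    Y = Y * (np.dot(A.T, masked_X) /
             (np.dot(A.T, mask * np.dot(A, Y)) + lamb * Y + eps))
    return A, Y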
In [61]:
## This class is used to make predictions
class evalMF:
    def __init__(self, resMatrix, dicU, dicI):
        self.resMatrix = resMatrix
        self.dicU = dicU
        self.dicI = dicI

    def fit(self):
        pass

    def predict(self, user, movie):
        return self.resMatrix[int(user) - 1][int(self.dicI[movie]) - 1]
In [45]:
mf = evalMF(resMatrix,data,movies)
In [62]:
# np.array([(float(ra[2]) - mf.predict(ra[0], ra[1])) ** 2 for ra in evalArrayTest]).mean()
# faster evaluation
masqueTest = np.sign(rawMatrixTest)
q = masqueTest * resMatrix - rawMatrixTest
(q * q).sum() / masqueTest.sum()
Out[62]:
In [63]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["I.Q. (1994)"]
print mf.predict("1","I.Q. (1994)")
We usually see a significant difference between users, so we need to take the bias into account; the two per-user checks below illustrate this, and a simple bias-corrected baseline is sketched after them.
In [33]:
summ = 0
for i in data["1"]:
    summ += (float(data["1"][i]) - mf.predict("1", i)) ** 2
summ / len(data["1"])
Out[33]:
In [34]:
summ = 0
for i in data["3"]:
    summ += (float(data["3"][i]) - mf.predict("3", i)) ** 2
summ / len(data["3"])
Out[34]:
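As an illustration of taking the bias into account, here is a minimal sketch (not the notebook's model) of the standard bias baseline r_hat(u, i) = mu + b_u + b_i, estimated naively from the train split; predict_bias is a hypothetical helper:

mu = np.mean([r for u in train for r in train[u].values()])  # global mean
b_user = {u: np.mean(list(train[u].values())) - mu for u in train}
# per-movie offsets, accumulated from the user-indexed train dict
movie_sum, movie_cnt = {}, {}
for u in train:
    for m in train[u]:
        movie_sum[m] = movie_sum.get(m, 0.0) + train[u][m] - mu - b_user[u]
        movie_cnt[m] = movie_cnt.get(m, 0) + 1
b_movie = {m: movie_sum[m] / movie_cnt[m] for m in movie_sum}

def predict_bias(u, m):
    # fall back to the global mean when a user or movie is unseen
    return mu + b_user.get(u, 0.0) + b_movie.get(m, 0.0)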
Various attempts to incorporate the bias and the L1 regularization can be found below.
However, we have not been very successful with them yet...
A simpler yet slower model can be found at the bottom of the page, in which the bias and L1 regularization can be added easily.
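For that SGD model, one plausible way to add both (a hedged sketch, not code that was validated here) is a per-rating step with bias terms and an L1 subgradient on the latent factors; sgd_step and its hyperparameters are hypothetical:

def sgd_step(P, Q, bu, bi, mu, i, j, r, lr=0.005, reg=0.02, l1=0.001):
    # prediction with user/movie biases, then one gradient step;
    # np.sign(...) is the subgradient of the L1 penalty on the factors
    e = r - (mu + bu[i] + bi[j] + np.dot(P[i], Q[:, j]))
    bu[i] += lr * (e - reg * bu[i])
    bi[j] += lr * (e - reg * bi[j])
    grad_p = e * Q[:, j] - l1 * np.sign(P[i])
    Q[:, j] += lr * (e * P[i] - l1 * np.sign(Q[:, j]))
    P[i] += lr * grad_p
    return e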
In [96]:
#self.lamb*np.linalg.norm(self.theta)
#from scipy import linalg
def nmf(X, latent_features, max_iter=100, eps=1e-5, printevery=100):
    print("NMF with %d latent features, %d iterations." % (latent_features, max_iter))
    #lamb = 0.2
    ## User and item bias (attempt, kept commented out)
    #X = copy.deepcopy(rawMatrix)
    #with np.errstate(all='ignore'):
    #    avg_m = X.sum(0) / (X != 0).sum(0)
    #    avg_u = X.sum(1) / (X != 0).sum(1)
    #diff_m = avg_m - avg_m.mean()
    #diff_u = avg_u - avg_u.mean()
    #X = X - diff_m
    #for idxi, i in enumerate(X):
    #    for idxj, j in enumerate(i):
    #        if X[idxi, idxj] != 0:
    #            X[idxi, idxj] += diff_u[idxi]
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    # Not used, as I couldn't find significant improvements:
    #Y = linalg.lstsq(A, X)[0]  # initialization recommended in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too-low values
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)    # used for prints
    masktrain = np.sign(rawMatrixTrain)  # used for prints
    prev_A = A
    prev_Y = Y
    for i in range(1, max_iter + 1):
        # multiplicative update for A
        top = np.dot(masked_X, Y.T)
        esti = np.dot(mask * np.dot(A, Y), Y.T)
        #esti = esti - diff_u
        A *= top / (esti + eps)
        # multiplicative update for Y
        top = np.dot(A.T, masked_X)
        esti = np.dot(A.T, mask * np.dot(A, Y))
        #esti = esti - diff_m
        tb = top / (esti + eps)
        no = np.linalg.norm(tb)
        #Y *= tb
        Y *= (0.9 * tb) + (0.1 * (tb + (1 / no)))  # experimental rescaling of the update
        """
        ## Regularization (attempt, disabled)
        if i % 10 == 0:
            diff = np.abs(Y - prev_Y) - 0.001
            Y = np.sign(diff) * Y
            prev_Y = Y
        """
        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print(np.linalg.norm(tb))
            print("Iteration %d, Err %.05f, Err train %.05f" % (
                i, (q * q).sum() / masktest.sum(),
                (q_train * q_train).sum() / masktrain.sum()))
    return A, Y
In [99]:
%%time
X = copy.deepcopy(rawMatrixTrain)
A,Y = nmf(X,1,eps = 1e-5,max_iter=100,printevery=10)
resMatrix = A.dot(Y)
In [321]:
with np.errstate(all='ignore'):
    avg_m = rawMatrix.sum(0) / (rawMatrix != 0).sum(0)
    avg_u = rawMatrix.sum(1) / (rawMatrix != 0).sum(1)
In [33]:
masqueTest = np.sign(rawMatrixTest)
q = masqueTest * resMatrix - rawMatrixTest
(q * q).sum() / masqueTest.sum()
Out[33]:
In [59]:
mf = evalMF(resMatrix,data,movies)
In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
In [64]:
len(rawMatrixTest)
Out[64]:
In [65]:
t = []
c = 10
for idxi, i in enumerate(rawMatrixTest):
    for idxj, j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append((resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj])) ** 2)
            if c > 0:
                print(rawMatrixTest[idxi][idxj], resMatrix[idxi][idxj])
                c -= 1
np.array(t).mean()
Out[65]:
In [59]:
R = rawMatrixTrain
In [74]:
def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02, decay=0.95):
    # SGD on the observed entries only; eps doubles as the learning rate
    N, M = np.shape(R)
    P = np.random.rand(N, K)
    #P = np.maximum(P, eps)
    #Q = np.random.rand(M, K).T
    Q = np.linalg.lstsq(P, R, rcond=None)[0]
    Q = np.maximum(Q, eps)
    masktest = np.sign(rawMatrixTest)
    masktrain = np.sign(rawMatrixTrain)
    for step in range(1, steps + 1):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i, :], Q[:, j])
                    P[i] = P[i] + eps * (2 * eij * Q[:, j] - beta * P[i])
                    Q[:, j] = Q[:, j] + eps * (2 * eij * P[i] - beta * Q[:, j])
        if step % 5:
            eps = eps * decay  # learning-rate decay
        if step % 10 == 0 or step == 1 or step == steps:
            X_est = np.dot(P, Q)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err on train %.05f" % (
                step, (q * q).sum() / masktest.sum(),
                (q_train * q_train).sum() / masktrain.sum()))
    return P, Q.T
In [70]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=100,eps=1e-5)
In [72]:
nR = np.dot(nP, nQ.T)
((nR-R)**2).sum()/np.sign(R).sum()
Out[72]: