In [1]:
from random import random
import math
import numpy as np
import copy
In [2]:
def loadMovieLens(path='./data/movielens'):
    """Load the MovieLens 100k data set from `path`.

    Reads `u.item` for the id->title mapping and `u.data` for ratings.

    Returns:
        prefs: {user_id(str): {movie_title(str): rating(float)}}
        rev_movies: {movie_title(str): movie_id(str)}
    """
    # Movie titles: u.item is pipe-separated, first two fields are id|title.
    movies = {}
    rev_movies = {}
    for line in open(path + '/u.item'):
        movie_id, title = line.split('|')[:2]
        movies[movie_id] = title
        rev_movies[title] = movie_id
    # Ratings: u.data is tab-separated user \t movie \t rating \t timestamp.
    prefs = {}
    for line in open(path + '/u.data'):
        user, movie_id, rating, _ts = line.split('\t')
        prefs.setdefault(user, {})[movies[movie_id]] = float(rating)
    return prefs, rev_movies
In [3]:
data,movies = loadMovieLens("data/ml-100k")
In [4]:
data['3']
Out[4]:
In [5]:
def getRawArray(data):
    """Flatten a {user: {movie: rating}} mapping into an (n, 3) array.

    Each row is [user, movie, rating]. Note numpy coerces every entry to a
    string because users and movies are string keys.
    """
    triples = [[user, movie, rating]
               for user, ratings in data.items()
               for movie, rating in ratings.items()]
    return np.array(triples)
In [6]:
# Split ratings into train/test per user, without shrinking the dataset too much
def split_train_test(data, percent_test):
    """Randomly split each user's ratings into train and test sets.

    Each individual rating goes to `test` with probability `percent_test`,
    otherwise to `train`. Every user appears as a key in BOTH result dicts
    (possibly mapped to an empty dict).

    Args:
        data: {user: {movie: rating}}
        percent_test: probability in [0, 1] that a rating lands in test.

    Returns:
        (train, test) — both shaped like `data`.
    """
    # Fix: removed a dead `movie = {}` local that was immediately shadowed
    # by the inner loop variable.
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [7]:
def split_train_test_by_movies(data, percent_test):
    """Randomly split ratings into train/test sets keyed by MOVIE.

    Unlike split_train_test, the result dicts map movie -> {user: rating},
    and a movie is simply absent from a side that drew no ratings for it.

    Args:
        data: {user: {movie: rating}}
        percent_test: probability in [0, 1] that a rating lands in test.

    Returns:
        (train, test) — both {movie: {user: rating}}.
    """
    # Fixes: removed a dead `movie = {}` local shadowed by the loop
    # variable, and replaced the try/except-KeyError insert dance with
    # dict.setdefault.
    test = {}
    train = {}
    for u in data.keys():
        for movie in data[u]:
            if random() < percent_test:
                test.setdefault(movie, {})[u] = data[u][movie]
            else:
                train.setdefault(movie, {})[u] = data[u][movie]
    return train, test
In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
In [10]:
def deleteUnseenInTest(train, test):
    """Remove from `test` (in place) every key that is absent from `train`.

    Fix: the original mutated `test` while iterating `test.keys()`, which
    raises RuntimeError on Python 3 (the keys view is live); we iterate a
    snapshot instead. The try/except existence probe is replaced by a
    direct membership test.

    Args:
        train: dict whose keys define what is "seen".
        test: dict to prune in place.
    """
    for k in list(test.keys()):
        if k not in train:
            test.pop(k, None)
In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)
In [13]:
class baselineMeanUser:
    """Baseline recommender: predict each user's mean training rating.

    Fix: `predict` previously raised KeyError for users unseen during
    fit; it now falls back to a neutral rating of 3, consistent with
    baselineMeanMovie.predict.
    """

    def __init__(self):
        # user id -> mean rating over that user's training movies
        self.users = {}

    def fit(self, train):
        """Compute the mean rating of every user in `train`.

        Args:
            train: {user: {movie: rating}}
        """
        for user, ratings in train.items():
            self.users[user] = sum(ratings.values()) / len(ratings)

    def predict(self, users):
        """Return the fitted mean for each user id; 3 for unknown users."""
        return [self.users.get(u, 3) for u in users]
In [14]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
(np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())
In [15]:
class baselineMeanMovie:
    """Baseline recommender: predict each movie's mean training rating."""

    def __init__(self):
        # movie title -> mean rating over that movie's training users
        self.movies = {}

    def fit(self, train):
        """Compute the mean rating of every movie in `train`.

        Args:
            train: {movie: {user: rating}}
        """
        for movie, ratings in train.items():
            self.movies[movie] = sum(ratings.values()) / len(ratings)

    def predict(self, movies):
        """Return the fitted mean per movie; unseen movies get a neutral 3.

        Fix: the original used a bare `except:` to implement the fallback,
        which also swallows unrelated errors (e.g. KeyboardInterrupt);
        dict.get expresses the same default safely.
        """
        return [self.movies.get(m, 3) for m in movies]
In [16]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
(np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())
In [17]:
m_test['Adventures of Pinocchio, The (1996)']
Out[17]:
In [18]:
rawArray[:5]
Out[18]:
In [19]:
len(m_train['Birdcage, The (1996)'])
Out[19]:
In [27]:
class matrixFactorisation():
def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
self.k = k
self.lambd = lambd
self.eps = eps
self.maxIter = maxIter
self.alternate = alternate
def fit(self, dataUsers, dataItems, couples):
self.p = {}
self.q = {}
self.couples = couples
self.loss = []
optimP = True
optimQ = (self.alternate == 0)
for i in xrange(self.maxIter):
loss = 0
for j in xrange(len(couples)):
r = np.random.randint(len(couples))
user = couples[r][0]
item = couples[r][1]
if not user in self.p:
self.p[user] = np.random.rand(1,self.k)
if not item in self.q:
self.q[item] = np.random.rand(self.k,1)
tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
if (optimP):
self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
if (optimQ):
self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
loss = loss + tmp*tmp #Sans régularisation
self.loss.append(loss)
if (self.alternate != 0):
if (i % self.alternate == 0):
oprimP = False if optimQ else True
print i, loss / len(couples)
else:
if (i % 100 == 0):
print i, loss / len(couples)
def predict(self, couplesTest):
pred = np.zeros(len(couplesTest))
for ind,c in enumerate(couplesTest):
pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
return pred
In [176]:
model3 = matrixFactorisation(10, alternate=0)
model3.fit(trainUsers, trainItems, trainCouples)
In [22]:
dm = np.dok_matrix(train)
In [28]:
print(len(movies))
print(len(data.keys()))
In [21]:
movies["Adventures of Pinocchio, The (1996)"]
Out[21]:
In [22]:
rawMatrix = np.zeros((len(data.keys())+1,1682+1))
for u in data:
for m in data[u]:
rawMatrix[int(u)][int(movies[m])] = data[u][m]
In [23]:
np.shape(rawMatrix)
Out[23]:
In [24]:
train["1"]["101 Dalmatians (1996)"]
Out[24]:
In [25]:
rawMatrixTrain = np.zeros((len(data.keys())+1,1682+1))
for u in train:
for m in train[u]:
rawMatrixTrain[int(u)][int(movies[m])] = train[u][m]
rawMatrixTest = np.zeros((len(data.keys())+1,1682+1))
for u in test:
for m in test[u]:
rawMatrixTest[int(u)][int(movies[m])] = test[u][m]
In [26]:
rawMatrixTrain[:10,:10]
Out[26]:
In [27]:
rawMatrixTest[:10,:10]
Out[27]:
In [28]:
np.shape(rawMatrixTest)
Out[28]:
In [73]:
import numpy as np
from scipy import linalg
from numpy import dot
def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6, eps = 1e-5):
"""
Decompose X to A*Y
"""
eps = 1e-5
print 'Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter)
#X = X.toarray() # I am passing in a scipy sparse matrix
# mask
mask = np.sign(X)
# initial matrices. A is random [0,1] and Y is A\X.
rows, columns = X.shape
A = np.random.rand(rows, latent_features)
A = np.maximum(A, eps)
Y = linalg.lstsq(A, X)[0]
Y = np.maximum(Y, eps)
masked_X = mask * X
X_est_prev = dot(A, Y)
for i in range(1, max_iter + 1):
# updates
top = dot(masked_X, Y.T)
bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
A *= top / bottom
A = np.maximum(A, eps)
# print 'A', np.round(A, 2)
top = dot(A.T, masked_X)
bottom = dot(A.T, mask * dot(A, Y)) + eps
Y *= top / bottom
Y = np.maximum(Y, eps)
# print 'Y', np.round(Y, 2)
# evaluation
if i % 200 == 0 or i == 1 or i == max_iter:
print 'Iteration {}:'.format(i),
X_est = dot(A, Y)
err = mask * (X_est_prev - X_est)
fit_residual = np.sqrt(np.sum(err ** 2))
X_est_prev = X_est
curRes = linalg.norm(mask * (X - X_est), ord='fro')
print 'fit residual', np.round(fit_residual, 4),
print 'total residual', np.round(curRes, 4)
if curRes < error_limit or fit_residual < fit_error_limit:
break
return A, Y
In [170]:
cpr = copy.deepcopy(rawMatrixTrain)
In [118]:
cpr[:10,:10]
Out[118]:
In [119]:
t1 = np.array([[7,1],
[1,1]])
t2 = np.array([[1,2],
[3,4]])
In [120]:
t1.dot(t2)
Out[120]:
In [121]:
(t1*t2).sum()
Out[121]:
In [122]:
cpr[1,1]=0
In [171]:
%%time
A,Y = nmf(cpr,100,max_iter=4000)
In [131]:
resMatrix = A.dot(Y)
In [132]:
resMatrix[1,1]
Out[132]:
In [33]:
class evalMF:
    """Thin read-out wrapper over a reconstructed user x movie rating matrix.

    Gives matrix-factorisation results the same fit/predict interface as
    the baseline recommenders.
    """

    def __init__(self, resMatrix, dicU, dicI):
        # Reconstructed ratings, indexed [user_id][movie_id].
        self.resMatrix = resMatrix
        # Kept for interface symmetry; predict only uses dicI.
        self.dicU = dicU
        # Movie title -> movie id (int-convertible).
        self.dicI = dicI

    def fit(self):
        # Nothing to train; present for API symmetry.
        pass

    def predict(self, user, movie):
        """Look up the predicted rating for (user id, movie title)."""
        row = int(user)
        col = int(self.dicI[movie])
        return self.resMatrix[row][col]
In [134]:
mf= evalMF(resMatrix,data,movies)
In [139]:
data["200"]
Out[139]:
In [142]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
print "***"
print data["18"]["Don Juan DeMarco (1995)"]
print mf.predict("1","Don Juan DeMarco (1995)")
print data["18"]["Winnie the Pooh and the Blustery Day (1968)"]
print mf.predict("1","Winnie the Pooh and the Blustery Day (1968)")
print "***"
print data["200"]["Assassins (1995)"]
print mf.predict("1","Assassins (1995)")
print data["200"]["Casablanca (1942)"]
print mf.predict("1","Casablanca (1942)")
In [143]:
summ=0
for i in data["1"]:
summ+=(float(data["1"][i]) - mf.predict("1",i))**2
summ/len(data["1"])
Out[143]:
In [144]:
summ=0
for i in data["3"]:
summ+=(float(data["3"][i]) - mf.predict("3",i))**2
summ/len(data["3"])
Out[144]:
In [175]:
tot=[]
ttt=[]
for j in test:
summ=0
for i in test[j]:
summ+=(float(test[j][i]) - mf.predict(j,i))**2
#print j, ">>", summ/len(data[j])
ttt.append(len(test[j]))
tot.append(summ)
#import pdb
#pdb.set_trace()
In [176]:
t = np.array(tot)
tt = np.array(ttt)
In [177]:
t.mean()/tt.mean()
Out[177]:
In [162]:
tt.std()
Out[162]:
In [169]:
t.sum()/tt.sum()
Out[169]:
In [161]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()
Out[161]:
In [225]:
%%time
A,Y = nmf(rawMatrixTrain,500,max_iter=300)
In [226]:
resMatrix = A.dot(Y)
In [227]:
a=np.array((1,2,4))
b=np.array((1,3,6))
(a-b).dot(a-b)
masqueTest=np.sign(rawMatrixTest)
masqueTest[:10,:10]
A=masqueTest*rawMatrix
In [228]:
aa = masqueTest*resMatrix
In [229]:
for idxi,i in enumerate(aa):
for idxj,j in enumerate(i):
if j>5:
aa[idxi][idxj]=5
In [235]:
q = masqueTest*resMatrix - rawMatrixTest
In [236]:
(q*q).sum()/ masqueTest.sum()
Out[236]:
In [ ]:
masqueTest=np.sign(rawMatrixTest)
aa=masqueTest*rawMatrix
for idxi,i in enumerate(aa):
for idxj,j in enumerate(i):
if j>5:
aa[idxi][idxj]=5
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
In [232]:
aa[:10,:10]
Out[232]:
In [111]:
rawMatrix[:10,:10]
Out[111]:
In [65]:
resMatrix[:10,:10]
Out[65]:
In [59]:
mf = evalMF(resMatrix,data,movies)
In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
In [61]:
print train["1"]["All Dogs Go to Heaven 2 (1996)"]
print test["1"]["Akira (1988)"]
In [80]:
len(rawMatrixTest)
Out[80]:
In [78]:
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
for idxj,j in enumerate(i):
if rawMatrixTest[idxi][idxj] != 0:
t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
if c>0:
print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
c-=1
np.array(t).mean()
Out[78]:
In [87]:
t = []
c = 10
for idxi,i in enumerate(resMatrix):
for idxj,j in enumerate(i):
if rawMatrixTest[idxi][idxj] != 0:
t.append( (resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj]))**2 )
if c>0:
print(rawMatrix[idxi][idxj],resMatrix[idxi][idxj])
c-=1
np.array(t).mean()
Out[87]:
In [108]:
t = []
c = 3
for idxi,i in enumerate(rawMatrixTrain):
for idxj,j in enumerate(i):
if rawMatrixTrain[idxi][idxj] != 0:
t.append( (float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj])**2 )
if c>0:
print(rawMatrixTrain[idxi][idxj],resMatrix[idxi][idxj])
c-=1
np.array(t).mean()
Out[108]:
In [80]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()
In [124]:
# Toy example from the classic MF tutorial: factorise a small ratings
# matrix (0 = unobserved) and reconstruct it.
R = [
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
]
R = numpy.array(R)
N = len(R)
M = len(R[0])
K = 2
# BUG FIX: matrix_factorization is defined as (R, K, steps, alpha, beta)
# and builds P/Q internally; the old call matrix_factorization(R, P, Q, K)
# passed the arrays positionally into K and steps.
nP, nQ = matrix_factorization(R, K)
nR = numpy.dot(nP, nQ.T)
In [125]:
nR
Out[125]:
In [65]:
import numpy
def matrix_factorization(R, K, steps=100, alpha=0.0002, beta=0.02):
N = len(R)
M = len(R[0])
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
Q = Q.T
for step in xrange(steps):
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
for k in xrange(K):
P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
#eR = numpy.dot(P,Q)
#e = 0
#for i in xrange(len(R)):
# for j in xrange(len(R[i])):
# if R[i][j] > 0:
# e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
# for k in xrange(K):
# e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
#if e < 0.001:
# break
return P, Q.T
In [44]:
N = len(R)
M = len(R[0])
K = 10
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
Q = Q.T
In [55]:
R[1,1]
Out[55]:
In [51]:
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
for k in xrange(K):
P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
print(i,j)
break
break
In [ ]:
for step in xrange(steps):
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
for k in xrange(K):
P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
In [60]:
%%time
R = rawMatrixTrain
nP, nQ = matrix_factorization(R, 10, steps=120)
nR = numpy.dot(nP, nQ.T)
In [61]:
masqueTest=np.sign(rawMatrixTest)
aa=masqueTest*rawMatrix
q = masqueTest*nR - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
Out[61]:
In [72]:
%%time
R = rawMatrixTrain
nP, nQ = matrix_factorization(R, 10, alpha=1e-5,steps=40)
nR = numpy.dot(nP, nQ.T)
In [73]:
masqueTest=np.sign(rawMatrixTest)
aa=masqueTest*rawMatrix
q = masqueTest*nR - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
Out[73]:
In [30]:
nR[:5,:5]
Out[30]:
In [31]:
rawMatrix[:5,:5]
Out[31]:
In [32]:
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")
In [47]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()
Out[47]:
In [ ]: