In [1]:
from random import random
import math
import numpy as np
import copy
In [2]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles
    movies={}
    rev_movies={}
    for idx,line in enumerate(open(path+'/u.item')):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx
    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
    return prefs,rev_movies
In [3]:
# note: the second return value maps movie title -> movie id (rev_movies)
data,movies = loadMovieLens("data/ml-100k")
In [4]:
data['3']
Out[4]:
In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)
In [6]:
# split each user's ratings at random, so no user disappears from the training set
def split_train_test(data,percent_test):
    test={}
    train={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            if random()<percent_test:
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test
In [7]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    for u in data.keys():
        for movie in data[u]:
            if random()<percent_test:
                test.setdefault(movie,{})
                test[movie][u]=data[u][movie]
            else:
                train.setdefault(movie,{})
                train[movie][u]=data[u][movie]
    return train, test
In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
In [10]:
def deleteUnseenInTest(train,test):
    # drop test keys that never occur in train (only relevant for the per-movie split)
    for k in list(test.keys()):
        if k not in train:
            test.pop(k,None)
In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)
In [13]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        # mean rating given by each user
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
    def predict(self,users):
        return [self.users[u] for u in users]
In [14]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
(np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())
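This scores the user-mean baseline on every rating in the dataset. For a like-for-like comparison with the per-movie baseline below, the same predictor can also be scored on the held-out ratings only; a minimal sketch, assuming rawArrayTest built above:

pred_test = baseline_mu.predict(rawArrayTest[:,0])
print("Test MSE %0.6f" % (
    (np.array(pred_test) - np.array(rawArrayTest[:,2], float)) ** 2).mean())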
In [15]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        # mean rating received by each movie
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                # movie unseen in training: fall back to the mid-scale rating
                res.append(3)
        return res
In [16]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
(np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())
In [17]:
m_test['Adventures of Pinocchio, The (1996)']
Out[17]:
In [18]:
rawArray[:5]
Out[18]:
In [19]:
len(m_train['Birdcage, The (1996)'])
Out[19]:
In [20]:
print(len(movies))
print(len(data.keys()))
In [21]:
movies["Adventures of Pinocchio, The (1996)"]
Out[21]:
In [22]:
# dense (users x movies) rating matrix; row 0 and column 0 stay unused since ids start at 1
rawMatrix = np.zeros((len(data.keys())+1,1682+1))
for u in data:
    for m in data[u]:
        rawMatrix[int(u)][int(movies[m])] = data[u][m]
In [23]:
np.shape(rawMatrix)
Out[23]:
In [24]:
train["1"]["101 Dalmatians (1996)"]
Out[24]:
In [25]:
rawMatrixTrain = np.zeros((len(data.keys())+1,1682+1))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u)][int(movies[m])] = train[u][m]
rawMatrixTest = np.zeros((len(data.keys())+1,1682+1))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u)][int(movies[m])] = test[u][m]
In [26]:
rawMatrixTrain[:10,:10]
Out[26]:
In [27]:
rawMatrixTest[:10,:10]
Out[27]:
In [28]:
np.shape(rawMatrixTest)
Out[28]:
In [30]:
import numpy as np
from scipy import linalg
from numpy import dot

def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6, eps=1e-5):
    """
    Decompose X into A*Y, ignoring unobserved entries (zeros) through a mask.
    """
    print 'Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter)
    # mask of observed entries
    mask = np.sign(X)
    # initial matrices: A is random in [0,1] and Y is the least-squares solution of A*Y = X
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)
    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)
    masked_X = mask * X
    X_est_prev = dot(A, Y)
    for i in range(1, max_iter + 1):
        # multiplicative updates, restricted to observed entries
        top = dot(masked_X, Y.T)
        bottom = dot(mask * dot(A, Y), Y.T) + eps
        A *= top / bottom
        A = np.maximum(A, eps)
        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # evaluation every 200 iterations (and on the first and last one)
        if i % 200 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est
            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break
    return A, Y
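Before factorising the full rating matrix, the function can be sanity-checked on a tiny hand-made matrix; a minimal sketch (the toy matrix below is purely illustrative and not part of the MovieLens data):

toy = np.array([[5., 3., 0., 1.],
                [4., 0., 0., 1.],
                [1., 1., 0., 5.],
                [0., 1., 5., 4.]])
W, H = nmf(toy, 2, max_iter=500)
print(np.round(W.dot(H), 2))  # zeros in toy are masked out, so only observed cells are fitted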
In [170]:
cpr = copy.deepcopy(rawMatrixTrain)
In [118]:
cpr[:10,:10]
Out[118]:
In [119]:
t1 = np.array([[7,1],
[1,1]])
t2 = np.array([[1,2],
[3,4]])
In [120]:
t1.dot(t2)
Out[120]:
In [121]:
(t1*t2).sum()
Out[121]:
In [122]:
cpr[1,1]=0
In [171]:
%%time
A,Y = nmf(cpr,100,max_iter=4000)
In [131]:
resMatrix = A.dot(Y)
In [132]:
resMatrix[1,1]
Out[132]:
In [29]:
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
    def predict(self,user,movie):
        return self.resMatrix[int(user)][int(self.dicI[movie])]
In [134]:
mf= evalMF(resMatrix,data,movies)
In [139]:
data["200"]
Out[139]:
In [142]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
print "***"
print data["18"]["Don Juan DeMarco (1995)"]
print mf.predict("1","Don Juan DeMarco (1995)")
print data["18"]["Winnie the Pooh and the Blustery Day (1968)"]
print mf.predict("1","Winnie the Pooh and the Blustery Day (1968)")
print "***"
print data["200"]["Assassins (1995)"]
print mf.predict("1","Assassins (1995)")
print data["200"]["Casablanca (1942)"]
print mf.predict("1","Casablanca (1942)")
In [143]:
summ=0
for i in data["1"]:
    summ+=(float(data["1"][i]) - mf.predict("1",i))**2
summ/len(data["1"])
Out[143]:
In [144]:
summ=0
for i in data["3"]:
    summ+=(float(data["3"][i]) - mf.predict("3",i))**2
summ/len(data["3"])
Out[144]:
In [175]:
tot=[]
ttt=[]
for j in test:
    summ=0
    for i in test[j]:
        summ+=(float(test[j][i]) - mf.predict(j,i))**2
    ttt.append(len(test[j]))
    tot.append(summ)
In [176]:
t = np.array(tot)
tt = np.array(ttt)
In [177]:
t.mean()/tt.mean()
Out[177]:
In [162]:
tt.std()
Out[162]:
In [169]:
t.sum()/tt.sum()
Out[169]:
In [161]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()
Out[161]:
In [225]:
%%time
A,Y = nmf(rawMatrixTrain,500,max_iter=300)
In [226]:
resMatrix = A.dot(Y)
In [227]:
a=np.array((1,2,4))
b=np.array((1,3,6))
(a-b).dot(a-b)
masqueTest=np.sign(rawMatrixTest)
masqueTest[:10,:10]
A=masqueTest*rawMatrix
In [228]:
aa = masqueTest*resMatrix
In [229]:
# clip predictions above the 5-star maximum
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
In [235]:
q = masqueTest*resMatrix - rawMatrixTest
In [236]:
(q*q).sum()/ masqueTest.sum()
Out[236]:
In [ ]:
masqueTest=np.sign(rawMatrixTest)
aa=masqueTest*rawMatrix
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
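This masked squared-error pattern recurs in the rest of the notebook, so it can be wrapped in a small helper; a sketch (masked_mse is a hypothetical name, not used elsewhere):

def masked_mse(pred, truth):
    # MSE computed only over the cells that are actually rated in `truth`
    mask = np.sign(truth)
    diff = mask * pred - truth
    return (diff * diff).sum() / mask.sum()

# equivalent to the expression above:
# masked_mse(resMatrix, rawMatrixTest)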
In [232]:
aa[:10,:10]
Out[232]:
In [111]:
rawMatrix[:10,:10]
Out[111]:
In [65]:
resMatrix[:10,:10]
Out[65]:
In [59]:
mf = evalMF(resMatrix,data,movies)
In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
In [61]:
print train["1"]["All Dogs Go to Heaven 2 (1996)"]
print test["1"]["Akira (1988)"]
In [80]:
len(rawMatrixTest)
Out[80]:
In [78]:
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()
Out[78]:
In [87]:
t = []
c = 10
for idxi,i in enumerate(resMatrix):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrix[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()
Out[87]:
In [108]:
t = []
c = 3
for idxi,i in enumerate(rawMatrixTrain):
    for idxj,j in enumerate(i):
        if rawMatrixTrain[idxi][idxj] != 0:
            t.append( (float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj])**2 )
            if c>0:
                print(rawMatrixTrain[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()
Out[108]:
In [80]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()
In [60]:
R = [
[5,3,5,3],
[4,0,0,1],
[1,5,1,5],
[1,0,1,4],
[0,4,5,4],
]
R = np.array(R)
K = 10
np.random.rand
Out[60]:
In [87]:
%%time
nP, nQ = matrix_factorization(R, K, steps=1000)
In [88]:
nR = np.dot(nP, nQ.T)
In [89]:
((nR-R)**2).sum()/np.sign(R).sum()
Out[89]:
In [110]:
np.shape(R)
Out[110]:
In [189]:
def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02):
    # SGD matrix factorisation; eps doubles as the learning rate, beta is the L2 penalty
    N,M = np.shape(R)
    P = np.random.rand(N,K)
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:  # update on observed ratings only
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])
    return P, Q.T
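The inner loop is the standard regularised SGD step for matrix factorisation; in the notation of the code (with eps playing the role of the learning rate $\eta$ and beta of the penalty $\beta$):

$$
e_{ij} = r_{ij} - \mathbf{p}_i^\top \mathbf{q}_j,\qquad
p_{ik} \leftarrow p_{ik} + \eta\,(2\,e_{ij}\,q_{kj} - \beta\,p_{ik}),\qquad
q_{kj} \leftarrow q_{kj} + \eta\,(2\,e_{ij}\,p_{ik} - \beta\,q_{kj})
$$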
In [145]:
%%time
nP, nQ = matrix_factorization(R, K, steps=4000,eps=1e-5)
In [146]:
nR = np.dot(nP, nQ.T)
((nR-R)**2).sum()/np.sign(R).sum()
Out[146]:
In [ ]:
%%time
# one SGD pass over R, timed (same update as inside matrix_factorization)
N,M = np.shape(R)
P = np.random.rand(N,K)
Q = linalg.lstsq(P, R)[0]
Q = np.maximum(Q, eps)
for i in xrange(len(R)):
    for j in xrange(len(R[i])):
        if R[i][j] > 0:
            eij = R[i][j] - np.dot(P[i,:],Q[:,j])
            for k in xrange(K):
                P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])
In [161]:
for _ in range(1,5):
    nP, nQ = matrix_factorization(R, K, steps=1000,eps=1e-3)
    nR = np.dot(nP, nQ.T)
    print ((nR-GR)**2).sum()/np.sign(GR).sum()
In [141]:
GR = [
[0,0,0,0],
[0,1,1,0],
[0,0,0,0],
[0,4,0,0],
[4,0,0,0],
]
In [139]:
R
Out[139]:
In [149]:
nR
Out[149]:
In [55]:
R[1,1]
Out[55]:
In [98]:
from scipy import linalg
In [100]:
rows, columns = R.shape
A = np.random.rand(rows, 2)
In [190]:
%%time
R = rawMatrixTrain
nP, nQ = matrix_factorization(R, 10, steps=40,eps=1e-3)
nR = np.dot(nP, nQ.T)
In [191]:
masqueTest=np.sign(rawMatrixTest)
aa=masqueTest*rawMatrix
"""
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
"""
q = masqueTest*nR - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
Out[191]:
In [30]:
nR[:5,:5]
Out[30]:
In [31]:
rawMatrix[:5,:5]
Out[31]:
In [32]:
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")
In [47]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()
Out[47]:
In [ ]: