In [1]:
from random import random
import math
In [2]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles (u.item is Latin-1 encoded, so decode it explicitly)
    movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        id, title = line.split('|')[0:2]
        movies[id] = title
    # Load ratings: prefs[user][movie title] = rating
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs
In [3]:
data = loadMovieLens("data/ml-100k")
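As a quick sanity check (assuming the full 100k dataset was read), MovieLens 100k holds 943 users and 100,000 ratings, so the loaded dict should match:
print(len(data))                              # expected: 943 users
print(sum(len(r) for r in data.values()))     # expected: 100000 ratings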
In [4]:
data['3']
Out[4]:
In [5]:
def split_train_test(data, percent_test):
    # Route each rating independently to test with probability percent_test
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [6]:
percent_test = 0.2
train, test = split_train_test(data, percent_test)
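Since the split is random per rating, the test share should land near 20% (an illustrative check, not from the original notebook):
n_train = sum(len(r) for r in train.values())
n_test = sum(len(r) for r in test.values())
print(n_train, n_test, n_test / (n_train + n_test))   # ratio should be close to 0.2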
In [7]:
def get_moove(data):
    # Collect the set of movies appearing in a prefs dict (keys only)
    moove = {}
    for u in data:
        for m in data[u]:
            moove[m] = 0
    return moove

def get_youser(data):
    # Collect the set of users appearing in a prefs dict (keys only)
    youser = {}
    for u in data:
        youser[u] = 0
    return youser

def clean(d1, d2):
    # Keys present in one dict but not the other (symmetric difference)
    to_erase = {}
    for i in d1:
        if i not in d2:
            to_erase[i] = 0
    for i in d2:
        if i not in d1:
            to_erase[i] = 0
    return to_erase

def _remove_users(test, rem):
    for i in rem:
        try:
            del test[i]
        except KeyError:
            pass

def _remove_movies(test, rem):
    for i in test:
        for j in rem:
            try:
                del test[i][j]
            except KeyError:
                pass
In [8]:
mooveToRemoove = clean(get_moove(train),get_moove(test))
youserToRemoove = clean(get_youser(train),get_youser(test))
_remove_users(test,youserToRemoove)
_remove_movies(test,mooveToRemoove)
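The random split can place a movie only in test, where the baselines below would have nothing to train on; clean collects the symmetric difference and the _remove_* helpers drop those entries from test. A small check that the cleanup leaves test fully covered by train (a sketch reusing the helpers above):
assert set(test) <= set(train)                         # every test user also in train
assert set(get_moove(test)) <= set(get_moove(train))   # every test movie also in train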
In [13]:
class BaselineMeanUser:
    # Predict every rating as the user's mean training rating
    def __init__(self):
        self.users = {}
        self.movies = {}

    def fit(self, train):
        for user in train:
            note = 0
            for movie in train[user]:
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note

    def predict(self, user, movie):
        return self.users[user]

    def score(self, X):
        # Mean squared error over all (user, movie) ratings in X
        nb_ratings = 0
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user, movie) - X[user][movie]) ** 2
                nb_ratings += 1
        return score / nb_ratings

class BaselineMeanMovie:
    # Predict every rating as the movie's mean training rating
    def __init__(self):
        self.users = {}
        self.movies = {}

    def fit(self, train):
        movies = get_moove(train)
        for movie in movies:
            note = 0
            cpt = 0
            for user in train:
                try:
                    note += train[user][movie]
                    cpt += 1
                except KeyError:
                    pass
            note = note / cpt
            self.movies[movie] = note

    def predict(self, user, movie):
        return self.movies[movie]

    def score(self, X):
        # Mean squared error over all (user, movie) ratings in X
        nb_ratings = 0
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user, movie) - X[user][movie]) ** 2
                nb_ratings += 1
        return score / nb_ratings
In [14]:
baseline_mu = BaselineMeanUser()
baseline_mm = BaselineMeanMovie()
In [15]:
baseline_mu.fit(train)
baseline_mm.fit(train)
In [16]:
print("score baseline mean user ",baseline_mu.score(test))
print("score baseline mean movie ",baseline_mm.score(test))
In [18]:
# NMF by alternating non-negative least squares using projected gradients
# Author: Chih-Jen Lin, National Taiwan University
# Python/numpy translation: Anthony Di Franco
from numpy import *
from numpy.linalg import norm
from time import time
from sys import stdout

def nmf(V, Winit, Hinit, tol, timelimit, maxiter):
    """
    (W,H) = nmf(V,Winit,Hinit,tol,timelimit,maxiter)
    W,H: output solution
    Winit,Hinit: initial solution
    tol: tolerance for a relative stopping condition
    timelimit, maxiter: limit of time and iterations
    """
    W = Winit; H = Hinit; initt = time()
    gradW = dot(W, dot(H, H.T)) - dot(V, H.T)
    gradH = dot(dot(W.T, W), H) - dot(W.T, V)
    initgrad = norm(r_[gradW, gradH.T])
    print('Init gradient norm %f' % initgrad)
    tolW = max(0.001, tol) * initgrad
    tolH = tolW
    for iter in range(1, maxiter + 1):
        # stopping condition
        projnorm = norm(r_[gradW[logical_or(gradW < 0, W > 0)],
                           gradH[logical_or(gradH < 0, H > 0)]])
        if projnorm < tol * initgrad or time() - initt > timelimit:
            break
        (W, gradW, iterW) = nlssubprob(V.T, H.T, W.T, tolW, 1000)
        W = W.T
        gradW = gradW.T
        if iterW == 1: tolW = 0.1 * tolW
        (H, gradH, iterH) = nlssubprob(V, W, H, tolH, 1000)
        if iterH == 1: tolH = 0.1 * tolH
        if iter % 10 == 0: stdout.write('.')
    print('\nIter = %d Final proj-grad norm %f' % (iter, projnorm))
    return (W, H)
def nlssubprob(V, W, Hinit, tol, maxiter):
    """
    H, grad: output solution and gradient
    iter: #iterations used
    V, W: constant matrices
    Hinit: initial solution
    tol: stopping tolerance
    maxiter: limit of iterations
    """
    H = Hinit
    WtV = dot(W.T, V)
    WtW = dot(W.T, W)
    alpha = 1; beta = 0.1
    # range(1, maxiter + 1) matches the MATLAB "for iter = 1:maxiter",
    # so the max-iteration warning below can actually fire
    for iter in range(1, maxiter + 1):
        grad = dot(WtW, H) - WtV
        projgrad = norm(grad[logical_or(grad < 0, H > 0)])
        if projgrad < tol:
            break
        # search step size
        for inner_iter in range(1, 20):
            Hn = H - alpha * grad
            Hn = where(Hn > 0, Hn, 0)
            d = Hn - H
            gradd = sum(grad * d)
            dQd = sum(dot(WtW, d) * d)
            suff_decr = 0.99 * gradd + 0.5 * dQd < 0
            if inner_iter == 1:
                decr_alpha = not suff_decr; Hp = H
            if decr_alpha:
                if suff_decr:
                    H = Hn; break
                else:
                    alpha = alpha * beta
            else:
                if not suff_decr or (Hp == Hn).all():
                    H = Hp; break
                else:
                    alpha = alpha / beta; Hp = Hn
    if iter == maxiter:
        print('Max iter in nlssubprob')
    return (H, grad, iter)
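The toy example below runs nmf on a 2x2 matrix. To apply it to the MovieLens split above, the train dict would first have to be packed into a dense user-by-movie matrix. A minimal bridge, not from the original notebook: missing ratings are stored as 0, which this unweighted NMF treats as true zeros rather than unknowns, so the fit is only a rough approximation; the factor count k and the nmf arguments are arbitrary choices.
from numpy.random import rand

users = list(train)
movies = list(get_moove(train))
uidx = {u: i for i, u in enumerate(users)}
midx = {m: j for j, m in enumerate(movies)}
V = zeros((len(users), len(movies)))        # 0 = missing rating (approximation)
for u in train:
    for m in train[u]:
        V[uidx[u], midx[m]] = train[u][m]
k = 20                                      # number of latent factors (arbitrary)
W0 = rand(len(users), k)                    # random non-negative initialization
H0 = rand(k, len(movies))
# W, H = nmf(V, W0, H0, 0.001, 60, 50)      # tol, time limit in seconds, max iterations
# predicted rating for user u, movie m: dot(W, H)[uidx[u], midx[m]]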
In [33]:
from numpy import *
from nmf import *   # assumes the nmf code above was also saved as nmf.py
w1 = array([[1,2,3],[4,5,6]])
h1 = array([[1,2],[3,4],[5,6]])
w2 = array([[1,1,3],[4,5,6]])
h2 = array([[1,1],[3,4],[5,6]])
w3 = array([[2,2,2],[2,2,2]])
h3 = array([[2,2],[2,2],[2,2]])
# v: the ratings matrix
# v = dot(w1, h1)
v = array([[4, 0], [4, 4]])
(wo, ho) = nmf(v, w3, h3, 0.001, 10, 10)
print(wo)
print(ho)
In [34]:
print(v)
print(dot(wo, ho))
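Since w3 and h3 give an inner dimension of 3 for a 2x2 non-negative target, an exact non-negative factorization of v exists, so the residual should be small if the solver converged (an added check, with norm imported explicitly):
from numpy.linalg import norm
print(norm(v - dot(wo, ho)))   # Frobenius norm of the residual; expect a value near 0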