In [1]:
from random import random
import math
import numpy as np
import copy
In [2]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles (u.item in the MovieLens 100k release is latin-1 encoded)
    movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        id, title = line.split('|')[0:2]
        movies[id] = title
    # Load ratings
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs
In [3]:
data = loadMovieLens("data/ml-100k")
In [4]:
data['3']
Out[4]:
In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u, i, data[u][i]])
    return np.array(d)
In [6]:
# Split each user's ratings rating-by-rating, so no user is dropped and the dataset is not reduced too much
def split_train_test(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [7]:
# Same split, but keyed by movie: train[movie][user] / test[movie][user]
def split_train_test_by_movies(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        for movie in data[u]:
            if random() < percent_test:
                test.setdefault(movie, {})[u] = data[u][movie]
            else:
                train.setdefault(movie, {})[u] = data[u][movie]
    return train, test
In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
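A quick sanity check (not in the original notebook) that roughly 20% of the ratings end up in the test split, assuming the split functions above:

n_train = sum(len(v) for v in train.values())
n_test = sum(len(v) for v in test.values())
print("user split: %d train / %d test ratings (%.1f%% test)" % (
    n_train, n_test, 100.0 * n_test / (n_train + n_test)))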
In [10]:
def deleteUnseenInTest(train, test):
    # Drop test keys that never appear in train
    # (iterate over a copy of the keys, since we mutate test while looping)
    for k in list(test.keys()):
        if k not in train:
            test.pop(k, None)
In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)
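Note: get_moove, used by the next cell, is never defined in this export. A minimal sketch of what it presumably does (collect the set of movie titles appearing in a user-keyed preference dict):

def get_moove(prefs):
    # All movies rated by at least one user in a user -> {movie: rating} dict
    return {movie for user in prefs for movie in prefs[user]}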
In [13]:
class baselineMeanMovie:
    def __init__(self):
        self.users = {}
        self.movies = {}
    def fit(self, train):
        movies = get_moove(train)
        for movie in movies:
            note = 0.0
            cpt = 0
            for user in train:
                try:
                    note += train[user][movie]
                    cpt += 1
                except KeyError:
                    pass
            note = note / cpt
            self.movies[movie] = note
    def predict(self, user, movie):
        return self.movies[movie]
    def score(self, X):
        nb_movies = len(get_moove(X))
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user, movie) - X[user][movie]) ** 2
        return score / nb_movies
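This first version of baselineMeanMovie is defined but never exercised below (the movie-keyed variant further down is used instead). A minimal usage sketch, assuming the get_moove sketch above, evaluated on the training split only since predict raises KeyError for movies unseen at fit time:

baseline_mm_v1 = baselineMeanMovie()
baseline_mm_v1.fit(train)
print("sum of squared errors per movie on train: %0.6f" % baseline_mm_v1.score(train))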
In [14]:
class baselineMeanUser:
    def __init__(self):
        self.users = {}
    def fit(self, train):
        for user in train.keys():
            note = 0.0
            for movie in train[user].keys():
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note
    def predict(self, users):
        return [self.users[u] for u in users]
In [15]:
baseline_mu = baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:, 0])
print("Mean Error %0.6f" % (
    (np.array(pred) - np.array(rawArray[:, 2], float)) ** 2).mean())
In [16]:
class baselineMeanMovie:
    def __init__(self):
        self.movies = {}
    def fit(self, train):
        for movie in train.keys():
            note = 0.0
            for user in train[movie].keys():
                note += train[movie][user]
            note = note / len(train[movie])
            self.movies[movie] = note
    def predict(self, movies):
        res = []
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                # Movie unseen at training time: fall back to a neutral rating of 3
                res.append(3)
        return res
In [17]:
baseline_mm = baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:, 1])
print("Mean Error %0.6f" % (
    (np.array(pred) - np.array(rawArrayTest[:, 2], float)) ** 2).mean())
In [18]:
m_test['Adventures of Pinocchio, The (1996)']
Out[18]:
In [19]:
rawArray[:5]
Out[19]:
In [20]:
len(m_train['Birdcage, The (1996)'])
Out[20]:
In [27]:
# Matrix factorisation by SGD -- way too slow, not ready yet
# With massive help from Remi Cadene
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
        self.k = k                  # number of latent factors
        self.lambd = lambd          # L2 regularisation strength
        self.eps = eps              # learning rate
        self.maxIter = maxIter      # number of epochs
        self.alternate = alternate  # if > 0, switch between optimising p and q every `alternate` epochs
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}   # user -> (1, k) latent vector
        self.q = {}   # item -> (k, 1) latent vector
        self.couples = couples
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in range(self.maxIter):
            loss = 0
            for j in range(len(couples)):
                # Sample one (user, item) rating at random and take an SGD step
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1, self.k) * 4
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k, 1) * 4
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                if optimP:
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                if optimQ:
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp * tmp  # loss without the regularisation term
            self.loss.append(loss)
            if self.alternate != 0:
                if i % self.alternate == 0:
                    # Switch which factor set is optimised for the next block of epochs
                    optimP, optimQ = not optimP, not optimQ
                print(i, loss / len(couples))
            else:
                if i % 100 == 0:
                    print(i, loss / len(couples))
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind, c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred
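The cells that built trainUsers, trainItems, trainCouples and testCouples are missing from this export. A plausible reconstruction from the splits defined above (an assumption, not the original code); test couples are restricted to movies seen at training time so predict does not hit a missing latent factor:

trainUsers, testUsers = train, test        # user  -> {movie: rating}
trainItems = m_train                       # movie -> {user: rating} (passed to fit but unused there)
trainCouples = getRawArray(trainUsers)     # rows of (user, movie, rating)
train_movies = {m for u in train for m in train[u]}
testCouples = np.array([row for row in getRawArray(testUsers)
                        if row[1] in train_movies])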
In [31]:
model3 = matrixFactorisation(10, alternate=100)
model3.fit(trainUsers, trainItems, trainCouples)
In [36]:
pred = model3.predict(testCouples)
print "Erreur de test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()