In [1]:
import numpy as np
import sklearn
from sklearn.decomposition import NMF # to add features based on the latent representation
from sklearn.decomposition import ProjectedGradientNMF
from scipy.sparse import csr_matrix
In [10]:
# Toy 3x4 ratings matrix: all fives except a single missing (0) entry.
mat = np.full((3, 4), 5)
mat[1, 1] = 0
In [11]:
# Wrap the dense toy matrix in scipy CSR sparse format (NMF accepts sparse input).
m = csr_matrix(mat)
In [12]:
# Inspect the sparse matrix (repr shows shape, dtype and stored-element count).
m
Out[12]:
In [13]:
# Dense original, for comparison with the sparse version above.
mat
Out[13]:
In [18]:
#%%time
# Rank-`n_components` NMF via coordinate descent; W.dot(H) is the low-rank
# reconstruction of m ("filling in" the missing rating).
# The original call also passed nls_max_iter / sparseness / beta / eta /
# alpha, constructor arguments removed from scikit-learn's NMF (the
# pre-0.19 projected-gradient API); they now raise TypeError, so only the
# currently supported parameters are kept.
nmf = NMF(n_components=None, init=None, solver='cd', tol=0.0001,
          max_iter=2000, random_state=None, l1_ratio=0, verbose=0,
          shuffle=False)
W = nmf.fit_transform(m)   # W: (n_samples, k) latent representation
H = nmf.components_        # H: (k, n_features)
W.dot(H)                   # reconstructed ratings matrix
Out[18]:
In [19]:
from scipy import linalg
In [26]:
def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02, decay=0.95):
    """Factorize R (N x M) into P (N x K) and Q.T (M x K) by SGD over the
    observed entries only (zeros in R are treated as missing ratings).

    Parameters
    ----------
    R : array-like of shape (N, M)
        Rating matrix; entries <= 0 are skipped as unobserved.
    K : int
        Latent dimension.
    steps : int
        Number of full passes over the observed entries.
    eps : float
        Serves double duty: SGD learning rate AND the floor applied to the
        least-squares initialization of Q.
    beta : float
        L2 regularization strength.
    decay : float
        Unused; kept so the public signature stays backward-compatible.

    Returns
    -------
    (P, Q.T) : P of shape (N, K), Q.T of shape (M, K), so that
        P.dot(Q) approximates R on the observed entries.
    """
    N, M = np.shape(R)
    # Random init for P; Q solved by least squares so P.dot(Q) ~ R, then
    # floored at eps to keep its entries strictly positive.
    P = np.random.rand(N, K)
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)
    # range() instead of Python-2-only xrange() keeps the code runnable on
    # both interpreter versions.
    for step in range(1, steps + 1):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:  # observed rating only
                    eij = R[i][j] - np.dot(P[i, :], Q[:, j])
                    P[i] = P[i] + eps * (2 * eij * Q[:, j] - beta * P[i])
                    # NOTE: intentionally uses the freshly updated P[i],
                    # matching the original update order.
                    Q[:, j] = Q[:, j] + eps * (2 * eij * P[i] - beta * Q[:, j])
    return P, Q.T
In [27]:
# New toy ratings matrix (0 marks a missing rating); shadows the earlier `mat`.
mat = np.array([[1,1,5],
                [5,5,1],
                [0,5,1]])
In [28]:
# Factorize the toy matrix with K=5 latent factors and display the
# reconstruction; the missing (0) entry gets an imputed value.
K = 5
nP, nQ = matrix_factorization(mat, K, steps=1000,eps=1e-5)
nP.dot(nQ.T)
Out[28]:
u.data: Toutes les données de jugement au format "user id | lm id | rating | timestamp", avec les champs séparés par des tabulations
u.links: Les liens entre utilisateurs du réseau. Chaque ligne concerne les liens sortants d'un utilisateur. Une ligne commence par le numéro d'un utilisateur, suivi par tous ses successeurs dans le réseau, séparés par des tabulations.
In [177]:
r_max = 5  # maximum raw rating in the dataset

def f(x):
    """Rescale a raw rating from [1, r_max] to [0, 1].

    float() guards against silent integer truncation under Python 2 when
    x is an int (e.g. f(2) was 0 instead of 0.25).
    """
    return float(x - 1) / (r_max - 1)
def readData(name):
    """Read a ratings file from recodata/ into a string ndarray whose rows
    are [user_id, item_id, rating], with ratings rescaled to [0, 1] via f().

    NOTE(review): `i[:-2]` drops the last TWO characters of every line,
    which assumes CRLF line endings — confirm against the actual files.
    NOTE(review): `a` has a string dtype, so the rescaled float ratings are
    stored back as their string representations; callers convert with
    float() later.
    """
    a = np.array([i[:-2].split('\t')[:3] for i in open("recodata/"+name)])
    # rescale the ratings to the [0, 1] range
    a[:,2] = [f(float(i)) for i in a[:,2]]
    return a
def readLinks(name):
    """Read a social-links file from recodata/ and return an ndarray whose
    rows are [source_user, target_user, 1], one row per outgoing edge.

    Each input line is: a user id followed by all of its successors,
    tab-separated (the last two characters of the line are dropped).
    """
    edges = []
    for line in open("recodata/" + name):
        fields = line[:-2].split('\t')
        edges.extend([fields[0], target, 1] for target in fields[1:])
    return np.array(edges)
In [210]:
# Load the full ratings file, the five train/test cross-validation folds,
# and the social link graph from the recodata/ directory.
u_data = readData("u.data")
u1_train = readData("u1.train")
u1_test = readData("u1.test")
u2_train = readData("u2.train")
u2_test = readData("u2.test")
u3_train = readData("u3.train")
u3_test = readData("u3.test")
u4_train = readData("u4.train")
u4_test = readData("u4.test")
u5_train = readData("u5.train")
u5_test = readData("u5.test")
u_links = readLinks("u.links")
In [169]:
# Sanity check: show the first record of each loaded array.
for sample in (u_data, u1_train, u1_test, u5_train, u5_test, u_links):
    print(sample[0])
In [140]:
# Source-user column of the link array (exploratory; `tmp` is not reused).
tmp = [link[0] for link in u_links]
In [216]:
# User-id and movie-id columns of the full ratings array.
users, movies = u_data[:, 0], u_data[:, 1]
In [217]:
# Distinct user ids and movie ids, using dicts as sets (the 0 values are
# never read — only the keys matter).
dicu = dict.fromkeys(users, 0)
dicm = dict.fromkeys(movies, 0)
In [218]:
# Number of distinct users observed in u.data.
len(dicu.keys())
Out[218]:
In [144]:
# Number of distinct movies observed in u.data.
len(dicm.keys())
Out[144]:
In [145]:
# Dense (n_users x n_movies) ratings matrix, indexed by id-1 — this assumes
# user and movie ids run 1..N without gaps.
rawMatrix = np.zeros((len(dicu), len(dicm)))
for row in u_data:
    rawMatrix[int(row[0]) - 1, int(row[1]) - 1] = row[2]
In [146]:
# Peek at the top-left 7x7 corner of the ratings matrix.
rawMatrix[:7,:7]
Out[146]:
In [234]:
class soRec():
    """SoRec-style social recommender.

    Jointly factorizes the user/movie ratings as logistic(u . v) and the
    user/user links as logistic(u . z) by stochastic gradient descent,
    sharing the user factors `u` between the two objectives. Ratings and
    link scores are expected on the [0, 1] scale produced by f().

    Fixes over the original draft: the model now uses the standard
    increasing logistic 1/(1+e^-x) — the original used the decreasing
    1/(1+e^x) while applying updates with the sign that only descends the
    loss for the increasing form — and predict() maps scores through the
    same logistic used during training. `xrange` and
    `np.random.choice(dict.keys())` (both Python-3-breaking) are replaced.
    """

    def __init__(self, k, eps=1e-5, lC=0.2, lU=0.2, lV=0.2, lZ=0.2, steps=10):
        self.k = k          # latent dimension
        self.eps = eps      # SGD learning rate
        self.lC = lC        # weight of the link objective (currently unused)
        self.lU = lU        # L2 penalty on user factors u
        self.lV = lV        # L2 penalty on movie factors v
        self.lZ = lZ        # L2 penalty on link factors z
        self.steps = steps  # number of SGD epochs

    @staticmethod
    def _logistic(x):
        """Increasing logistic 1 / (1 + e^-x), mapping R onto (0, 1)."""
        return 1.0 / (1.0 + np.exp(-x))

    def fit(self, users_movies, users_link):
        """Train on rating triples [user, movie, rating] and link triples
        [source, target, score]; builds the factor dicts self.u/v/z."""
        self.u = {}  # user id  -> (1, k) row vector
        self.v = {}  # movie id -> (k, 1) column vector
        self.z = {}  # linked-user id -> (k, 1) column vector
        n_ratings = len(users_movies)
        n_links = len(users_link)
        for step in range(self.steps):
            lossUV = 0.0
            lossUZ = 0.0
            lossReg = 0.0
            for _ in range(n_ratings):
                # --- rating objective: one random (user, movie, rating) ---
                user, movie, rating = users_movies[np.random.randint(n_ratings)][:3]
                rating = float(rating)
                if user not in self.u:
                    self.u[user] = np.random.rand(1, self.k)
                if movie not in self.v:
                    self.v[movie] = np.random.rand(self.k, 1)
                u_vec, v_vec = self.u[user], self.v[movie]
                pred = self._logistic(u_vec.dot(v_vec)[0][0])
                err = pred - rating
                grad = pred * (1.0 - pred)  # derivative of the logistic
                # Simultaneous update computed from the pre-update vectors.
                self.u[user] = u_vec - self.eps * err * grad * v_vec.transpose()
                self.v[movie] = v_vec - self.eps * err * grad * u_vec.transpose()
                lossUV += err * err / 2.0
                # --- link objective: one random (source, target, score) ---
                # Sampling one link per rating; could instead visit every
                # outgoing link of the sampled user.
                src, dst, score = users_link[np.random.randint(n_links)][:3]
                score = float(score)
                if src not in self.u:
                    self.u[src] = np.random.rand(1, self.k)
                if dst not in self.z:
                    self.z[dst] = np.random.rand(self.k, 1)
                u_src, z_dst = self.u[src], self.z[dst]
                pred = self._logistic(u_src.dot(z_dst)[0][0])
                err = pred - score
                grad = pred * (1.0 - pred)
                self.u[src] = u_src - self.eps * err * grad * z_dst.transpose()
                self.z[dst] = z_dst - self.eps * err * grad * u_src.transpose()
                lossUZ += err * err / 2.0
                # --- stochastic L2 shrinkage of one random factor per dict ---
                # list(): np.random.choice rejects dict views on Python 3.
                ru = np.random.choice(list(self.u))
                rv = np.random.choice(list(self.v))
                rz = np.random.choice(list(self.z))
                self.u[ru] = self.u[ru] * (1 - self.lU * self.eps)
                self.v[rv] = self.v[rv] * (1 - self.lV * self.eps)
                self.z[rz] = self.z[rz] * (1 - self.lZ * self.eps)
                lossReg += (np.sqrt((self.u[ru] ** 2).sum())
                            + np.sqrt((self.v[rv] ** 2).sum())
                            + np.sqrt((self.z[rz] ** 2).sum()))
            # Progress report once per epoch.
            print("Step %d, loss: %.04f\nlossUV:%0.4f, lossUZ:%0.4f, lossTot:%0.4f"
                  % (step, (lossUV + lossUZ + lossReg) / n_ratings, lossUV, lossUZ, lossReg))

    def predict(self, users_movies):
        """Return a [0, 1] predicted rating for each (user, movie) pair;
        pairs with an unseen user or movie fall back to the midpoint 0.5."""
        pred = []
        for pair in users_movies:
            try:
                score = self.u[pair[0]].dot(self.v[pair[1]])[0][0]
                pred.append(self._logistic(score))  # same link fn as training
            except KeyError:
                pred.append(0.5)  # cold start
        return pred
In [235]:
#%%time
model = soRec(5, eps=1e-1, steps=10)
model.fit(u_data, u_links)
In [236]:
# MSE of the model trained on the full data, against test fold 1.
# Single-argument print() replaces the Python-2-only print statement,
# matching the print() calls used elsewhere in this notebook and keeping
# identical output on both interpreter versions.
mse = ((model.predict(u1_test) - np.array(np.array(u1_test)[:, 2], float)) ** 2).mean()
print("Erreur: %s" % mse)
In [225]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u1_train, u_links)
print "Erreur:", ((model.predict(u1_test) - np.array(np.array(u1_test)[:,2], float)) ** 2).mean()
In [228]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u2_train, u_links)
print "Erreur:", ((model.predict(u2_test) - np.array(np.array(u2_test)[:,2], float)) ** 2).mean()
In [229]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u3_train, u_links)
print "Erreur:", ((model.predict(u3_test) - np.array(np.array(u3_test)[:,2], float)) ** 2).mean()
In [230]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u4_train, u_links)
print "Erreur:", ((model.predict(u4_test) - np.array(np.array(u4_test)[:,2], float)) ** 2).mean()
In [231]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u5_train, u_links)
print "Erreur:", ((model.predict(u5_test) - np.array(np.array(u5_test)[:,2], float)) ** 2).mean()
In [ ]:
In [ ]:
In [ ]:
In [ ]: