In [1]:
import numpy as np

import sklearn
from sklearn.decomposition import NMF        # to add features based on the latent representation
from sklearn.decomposition import ProjectedGradientNMF
from scipy.sparse import csr_matrix

1- Lecture des données


In [10]:
mat = np.array([[5,5,5,5],
              [5,0,5,5],
              [5,5,5,5]])

In [11]:
m = csr_matrix(mat)

In [12]:
m


Out[12]:
<3x4 sparse matrix of type '<type 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [13]:
mat


Out[13]:
array([[5, 5, 5, 5],
       [5, 0, 5, 5],
       [5, 5, 5, 5]])

In [18]:
#%%time
nmf = NMF(n_components=None, init=None, solver='cd', tol=0.0001, max_iter=2000, random_state=None,
          alpha=0, l1_ratio=0, verbose=0, shuffle=False, nls_max_iter=2000, sparseness=None, beta=1, eta=0.1)

W = nmf.fit_transform(m)
H = nmf.components_
W.dot(H)


Out[18]:
array([[  4.99997771e+00,   5.00041069e+00,   5.00000602e+00,
          5.00000434e+00],
       [  5.00000477e+00,   2.28547159e-03,   4.99999871e+00,
          4.99999907e+00],
       [  5.00000687e+00,   4.99988914e+00,   4.99999931e+00,
          4.99999976e+00]])

In [19]:
from scipy import linalg

In [26]:
def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02, decay=0.95):
    N,M = np.shape(R)
    P = np.random.rand(N,K)
    #P = np.maximum(P, eps)
    
    #Q = np.random.rand(M,K).T
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)

    #masked_X = mask * X
    #X_est_prev = dot(A, Y)
    
    #mask = np.sign(R)
    #masked_R = mask * R
    
    
    
    for step in xrange(1,steps+1):
        #"""
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    P[i] = P[i] + eps * (2 * eij * Q.T[j] - beta * P[i])
                    #Q[i] = P[i] + eps * (2 * eij * Q.T[j] - beta * P[i])
                    Q.T[j] = Q.T[j] + eps * (2 * eij * P[i] - beta * Q.T[j])
                    #for k in xrange(K):
                    #    P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                        #Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])
        
        
    return P, Q.T

In [27]:
mat = np.array([[1,1,5],
                [5,5,1],
                [0,5,1]])

In [28]:
K = 5
nP, nQ = matrix_factorization(mat, K, steps=1000,eps=1e-5)
nP.dot(nQ.T)


Out[28]:
array([[ 2.52061432,  0.27650964,  4.50333508],
       [ 5.65921603,  4.45196109,  2.28235774],
       [ 5.62792907,  4.98694106,  2.88892641]])

  • u.data: Toutes les données de jugement au format "user id | lm id | rating | timestamp", avec les champs séparés par des tabulations

  • u.links: Les liens entre utilisateur du réseau. Chaque ligne concerne liens sortants d'un utilisateur. Une ligne commence par le numéro d'un utilisateur, suivi par tous ses successeurs dans le réseau, séparés par des tabulation.


In [177]:
r_max = 5
def f(x):
    return (x-1)/(r_max-1)

def readData(name):
    a = np.array([i[:-2].split('\t')[:3] for i in open("recodata/"+name)])
    # mettre les notes a l'echelle
    a[:,2] = [f(float(i)) for i in a[:,2]]
    return a

def readLinks(name):
    ret = []
    for i in open("recodata/"+name):
        t = i[:-2].split('\t')
        a = t[0]
        for j in t[1:]:
            ret.append([a,j,1])
    return np.array(ret)

In [210]:
u_data = readData("u.data")
u1_train = readData("u1.train")
u1_test = readData("u1.test")
u2_train = readData("u2.train")
u2_test = readData("u2.test")
u3_train = readData("u3.train")
u3_test = readData("u3.test")
u4_train = readData("u4.train")
u4_test = readData("u4.test")
u5_train = readData("u5.train")
u5_test = readData("u5.test")

u_links = readLinks("u.links")

In [169]:
print(u_data[0])
print(u1_train[0])
print(u1_test[0])
print(u5_train[0])
print(u5_test[0])
print(u_links[0])


['196' '242' '0.5']
['1' '1' '1.0']
['1' '6' '1.0']
['1' '1' '1.0']
['1' '3' '0.75']
['1' '520' '1']

In [140]:
tmp=[]
for i in u_links:
    tmp.append(i[0])

In [216]:
users=u_data[:,0]
movies=u_data[:,1]

In [217]:
dicu={}
for i in users:
    dicu[i]=0
dicm={}
for i in movies:
    dicm[i]=0

In [218]:
len(dicu.keys())


Out[218]:
943

In [144]:
len(dicm.keys())


Out[144]:
1682

In [145]:
rawMatrix = np.zeros((len(dicu.keys()),len(dicm.keys())))
for l in u_data:
    rawMatrix[int(l[0])-1][int(l[1])-1] = l[2]

In [146]:
rawMatrix[:7,:7]


Out[146]:
array([[ 1.  ,  0.5 ,  0.75,  0.5 ,  0.5 ,  1.  ,  0.75],
       [ 0.75,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.75,  0.5 ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.75,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.25],
       [ 0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ]])

2- Modèle SoRec


In [234]:
class soRec():
    def __init__(self, k, eps=1e-5, lC=0.2, lU=0.2, lV=0.2, lZ=0.2, steps=10):
        self.k = k
        self.eps = eps
        self.lC = lC
        self.lU = lU
        self.lV = lV
        self.lZ = lZ
        self.steps = steps
        
    def fit(self, users_movies, users_link):
        self.u = {}
        self.v = {}
        self.z = {}
        
        #Choix du paramètre a optimisé en cas d'optimisation alternée
        for i in xrange(self.steps):
            lossUV = 0
            lossUZ = 0
            lossTot = 0
            
            for j in xrange(len(users_movies)):
                # Factorisation de matrices users/films
                r = np.random.randint(len(users_movies)) 
                user = users_movies[r][0]
                movie = users_movies[r][1]
                rating = float(users_movies[r][2])
                try:
                    self.u[user]
                except KeyError:
                    self.u[user] = np.random.rand(1,self.k)
                try:
                    self.v[movie]
                except KeyError:
                    self.v[movie] = np.random.rand(self.k,1)

                e_UV = np.exp(self.u[user].dot(self.v[movie])[0][0])
                
                log_UV = (1.0/(1 + e_UV))
                tmp = log_UV - rating
                g = e_UV/((1 + e_UV))**2

                self.u[user] = self.u[user] - self.eps * tmp * g * self.v[movie].transpose()
                self.v[movie] = self.v[movie] - self.eps * tmp * g * self.u[user].transpose()
                lossUV = lossUV + tmp*tmp/2. 
                
                # Factorisation de matrices liens users
                r = np.random.randint(len(users_link)) # on pourrais faire sur tout les liens par user
                userSource = users_link[r][0]
                userTarget = users_link[r][1]
                linkScore  = float(users_link[r][2])
                try:
                    self.u[userSource]
                except KeyError:
                    self.u[userSource] = np.random.rand(1,self.k)
                try:
                    self.z[userTarget]
                except KeyError:
                    self.z[userTarget] = np.random.rand(self.k,1)
                    
                e_UZ = np.exp(self.u[userSource].dot(self.z[userTarget])[0][0])
                log_UZ = (1.0/(1 + e_UZ))
                tmp = log_UZ - linkScore
                g = e_UZ/((1 + e_UZ))**2

                self.u[userSource] = self.u[userSource] - self.eps * tmp * g * self.z[userTarget].transpose()
                self.z[userTarget] = self.z[userTarget] - self.eps * tmp * g * self.u[userSource].transpose()
                lossUZ = lossUZ + tmp*tmp/2. 
                
                ru = np.random.choice(self.u.keys());
                rv = np.random.choice(self.v.keys());
                rz = np.random.choice(self.z.keys());
                self.u[ru] = self.u[ru] * (1 - self.lU * self.eps)
                self.v[rv] = self.v[rv] * (1 - self.lV * self.eps)
                self.z[rz] = self.z[rz] * (1 - self.lZ * self.eps)
                lossTot = lossTot + np.sqrt((self.u[ru]**2).sum()) + np.sqrt((self.v[rv]**2).sum()) + np.sqrt((self.z[rz]**2).sum())

            if (i % 1 == 0):
                print("Step %d, loss: %.04f\nlossUV:%0.4f, lossUZ:%0.4f, lossTot:%0.4f"
                      %(i, (lossUV + lossUZ + lossTot) / len(users_movies),lossUV ,lossUZ ,lossTot ))

                
    def predict(self, users_movies):
        pred = []
        for c in users_movies:
            try:
                pred.append(self.u[c[0]].dot(self.v[c[1]])[0][0])
            except KeyError:
                pred.append(0.5)
        return pred

In [235]:
#%%time
model = soRec(5, eps=1e-1, steps=10)
model.fit(u_data, u_links)


Step 0, loss: 3.2291
lossUV:12282.9179, lossUZ:31916.4666, lossTot:278712.2773
Step 1, loss: 2.4679
lossUV:11674.0919, lossUZ:30673.9898, lossTot:204441.3488
Step 2, loss: 2.2478
lossUV:11236.4273, lossUZ:30290.6454, lossTot:183254.5028
Step 3, loss: 2.1812
lossUV:11313.2055, lossUZ:30017.3439, lossTot:176792.1501
Step 4, loss: 2.1589
lossUV:11074.6221, lossUZ:29909.8537, lossTot:174904.7560
Step 5, loss: 2.1468
lossUV:11044.7450, lossUZ:29920.9334, lossTot:173713.5403
Step 6, loss: 2.1448
lossUV:10970.8721, lossUZ:29916.6042, lossTot:173588.3108
Step 7, loss: 2.1385
lossUV:11015.5169, lossUZ:29906.6026, lossTot:172924.9612
Step 8, loss: 2.1484
lossUV:11072.8976, lossUZ:29942.7332, lossTot:173819.6622
Step 9, loss: 2.1447
lossUV:11056.5700, lossUZ:29921.0767, lossTot:173496.7543

In [236]:
print "Erreur:", ((model.predict(u1_test) - np.array(np.array(u1_test)[:,2], float)) ** 2).mean()


Erreur: 0.624126429568

In [225]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u1_train, u_links)
print "Erreur:", ((model.predict(u1_test) - np.array(np.array(u1_test)[:,2], float)) ** 2).mean()


Step 0, loss: 3.3490
Step 1, loss: 2.5908
Step 2, loss: 2.3159
Step 3, loss: 2.2075
Step 4, loss: 2.1663
Step 5, loss: 2.1492
Erreur: 0.496394847345
CPU times: user 3min 53s, sys: 669 ms, total: 3min 54s
Wall time: 3min 54s

In [228]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u2_train, u_links)
print "Erreur:", ((model.predict(u2_test) - np.array(np.array(u2_test)[:,2], float)) ** 2).mean()


Step 0, loss: 3.3433
Step 1, loss: 2.5850
Step 2, loss: 2.3072
Step 3, loss: 2.2071
Step 4, loss: 2.1708
Step 5, loss: 2.1539
Erreur: 0.514934796902
CPU times: user 3min 47s, sys: 355 ms, total: 3min 48s
Wall time: 3min 48s

In [229]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u3_train, u_links)
print "Erreur:", ((model.predict(u3_test) - np.array(np.array(u3_test)[:,2], float)) ** 2).mean()


Step 0, loss: 3.3438
Step 1, loss: 2.5805
Step 2, loss: 2.3144
Step 3, loss: 2.2157
Step 4, loss: 2.1748
Step 5, loss: 2.1614
Erreur: 0.506797595625
CPU times: user 3min 48s, sys: 378 ms, total: 3min 49s
Wall time: 3min 49s

In [230]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u4_train, u_links)
print "Erreur:", ((model.predict(u4_test) - np.array(np.array(u4_test)[:,2], float)) ** 2).mean()


Step 0, loss: 3.3569
Step 1, loss: 2.6004
Step 2, loss: 2.3249
Step 3, loss: 2.2222
Step 4, loss: 2.1770
Step 5, loss: 2.1568
Erreur: 0.501765466611
CPU times: user 4min, sys: 846 ms, total: 4min 1s
Wall time: 4min 2s

In [231]:
%%time
model = soRec(5, eps=1e-1, steps=6)
model.fit(u5_train, u_links)
print "Erreur:", ((model.predict(u5_test) - np.array(np.array(u5_test)[:,2], float)) ** 2).mean()


Step 0, loss: 3.3437
Step 1, loss: 2.5899
Step 2, loss: 2.3246
Step 3, loss: 2.2205
Step 4, loss: 2.1804
Step 5, loss: 2.1598
Erreur: 0.492447201601
CPU times: user 3min 47s, sys: 319 ms, total: 3min 48s
Wall time: 3min 48s

In [ ]:


In [ ]:


In [ ]:


In [ ]: