TME4 FDMS Collaborative Filtering

Florian Toqué & Paul Willot


In [1]:
from random import random
import math
import numpy as np
import copy

Loading the data


In [2]:
def loadMovieLens(path='./data/movielens'):
    #Get movie titles
    movies={}
    rev_movies={}
    for idx,line in enumerate(open(path+'/u.item')):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs,rev_movies

In [3]:
data,movies = loadMovieLens("data/ml-100k")

Content example


In [4]:
data['3']


Out[4]:
{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Liar (1997)': 2.0,
 'Lost Highway (1997)': 2.0,
 'Mad City (1997)': 3.0,
 'Man Who Knew Too Little, The (1997)': 4.0,
 'Mimic (1997)': 2.0,
 'Mother (1996)': 5.0,
 'Murder at 1600 (1997)': 3.0,
 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)': 5.0,
 'Playing God (1997)': 1.0,
 'Prophecy II, The (1998)': 3.0,
 'Return of the Jedi (1983)': 4.0,
 "Schindler's List (1993)": 4.0,
 'Scream (1996)': 2.0,
 'Sphere (1998)': 3.0,
 'Spice World (1997)': 2.0,
 'Starship Troopers (1997)': 3.0,
 'U Turn (1997)': 3.0,
 "Ulee's Gold (1997)": 3.0,
 'Wag the Dog (1997)': 5.0,
 'Wedding Singer, The (1998)': 3.0}

Splitting data between train/test

We avoid to let unseen data form the train set in the test set.
We also try to minimise the dataset reduction by splitting on each user.


In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)

In [6]:
# splitting while avoiding to reduce the dataset too much
def split_train_test(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            #print(data[u][movie])
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test

In [7]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                try:
                    test[movie][u]=data[u][movie]
                except KeyError:
                    test.setdefault(movie,{})
                    test[movie][u]=data[u][movie]
            else:
                try:
                    train[movie][u]=data[u][movie]
                except KeyError:
                    train.setdefault(movie,{})
                    train[movie][u]=data[u][movie]
    return train, test

In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)

split used for convenience on the average by movie baseline


In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)

cleaning


In [10]:
def deleteUnseenInTest(train,test):
    for k in test.keys():
        try:
            train[k]
        except KeyError:
            test.pop(k,None)

In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)

Matrix used for fast evaluation


In [12]:
evalArrayAll = getRawArray(data)
evalArrayTest = getRawArray(test)

In [13]:
evalArrayTest[:10,:10]


Out[13]:
array([['344', 'Return of the Jedi (1983)', '3.0'],
       ['344', 'First Wives Club, The (1996)', '3.0'],
       ['344', 'Face/Off (1997)', '4.0'],
       ['344', 'Big Night (1996)', '5.0'],
       ['344', 'Sleepers (1996)', '4.0'],
       ['344', 'English Patient, The (1996)', '3.0'],
       ['344', 'Toy Story (1995)', '3.0'],
       ['344', 'Interview with the Vampire (1994)', '3.0'],
       ['344', 'Alien (1979)', '5.0'],
       ['344', 'Dragonheart (1996)', '3.0']], 
      dtype='|S81')

Baseline: mean by user


In [14]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        return [self.users[u] for u in users]

In [15]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(evalArrayTest[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(evalArrayTest[:,2], float)) ** 2).mean())


Mean Error 1.078619

In [16]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except:
                res.append(3)
        return res

In [17]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(evalArrayTest[:,1])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(evalArrayTest[:,2], float)) ** 2).mean())


Mean Error 1.009150

Raw matrix used for convenience and clarity.
Structure like scipy sparse matrix or python dictionnaries may be used for speedup.

Complete dataset


In [18]:
rawMatrix = np.zeros((len(data.keys()),1682))
for u in data:
    for m in data[u]:
        rawMatrix[int(u)-1][int(movies[m])-1] = data[u][m]

In [19]:
print(np.shape(rawMatrix))
rawMatrix[:10,:10]


(943, 1682)
Out[19]:
array([[ 5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.,  3.],
       [ 4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.,  0.],
       [ 0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.,  4.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.,  0.],
       [ 4.,  0.,  0.,  4.,  0.,  0.,  4.,  0.,  4.,  0.]])

Train and test dataset


In [20]:
rawMatrixTrain = np.zeros((len(data.keys()),1682))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u)-1][int(movies[m])-1] = train[u][m]
        
rawMatrixTest = np.zeros((len(data.keys()),1682))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u)-1][int(movies[m])-1] = test[u][m]

Non-negative Matrix Factorization

Fast implementation using numpy's matrix processing. Sadly, I can't avoid the overfitting starting after few iterations.


In [21]:
#from scipy import linalg

def nmf(X, latent_features, max_iter=100, eps = 1e-5,printevery=100):

    print "NMF with %d latent features, %d iterations."%(latent_features, max_iter)

    # mask used to ignore null element (coded by zero)
    mask = np.sign(X)

    # randomly initialized matrix
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvments
    #Y = linalg.lstsq(A, X)[0]  # initializing that way as recommanded in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too low values

    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)    # used for prints
    masktrain = np.sign(rawMatrixTrain)  # used for prints

    for i in range(1, max_iter + 1):

        top = np.dot(masked_X, Y.T)
        bottom = (np.dot((mask * np.dot(A, Y)), Y.T)) + eps
        A *= top / bottom
        
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom


        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest*X_est - rawMatrixTest
            q_train = masktrain*X_est - rawMatrixTrain
            print "Iteration %d, Err %.05f, Err train %.05f"%( i, (q*q).sum()/ masktest.sum(), (q_train*q_train).sum()/ masktest.sum() )
            
    return A, Y

In [22]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=5,printevery=1)
resMatrix = A.dot(Y)


NMF with 100 latent features, 5 iterations.
Iteration 1, Err 0.95052, Err train 3.40552
Iteration 2, Err 0.91779, Err train 3.22001
Iteration 3, Err 0.91638, Err train 3.15463
Iteration 4, Err 0.91650, Err train 3.09610
Iteration 5, Err 0.91685, Err train 3.03950
CPU times: user 975 ms, sys: 163 ms, total: 1.14 s
Wall time: 700 ms

We see that it quickly get better than the baseline.
However, we see below that it overfit after that:


In [25]:
%%time
A,Y = nmf(rawMatrixTrain,10,eps = 1e-5,max_iter=500,printevery=100)
resMatrix = A.dot(Y)


NMF with 10 latent features, 500 iterations.
Iteration 1, Err 1.09246, Err train 3.92730
Iteration 100, Err 1.06885, Err train 2.21423
Iteration 200, Err 1.10649, Err train 2.02401
Iteration 300, Err 1.13376, Err train 1.95647
Iteration 400, Err 1.15716, Err train 1.92148
Iteration 500, Err 1.17427, Err train 1.90105
CPU times: user 25.7 s, sys: 4.81 s, total: 30.5 s
Wall time: 17.5 s

This is due to the high sparsity of the matrix.
We can of course reduce the features matrix size to avoid overfitting, but that will limit further improvments.


In [24]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=20)
resMatrix = A.dot(Y)


NMF with 1 latent features, 100 iterations.
Iteration 1, Err 0.95479, Err train 3.48068
Iteration 20, Err 0.90113, Err train 3.28905
Iteration 40, Err 0.90113, Err train 3.28905
Iteration 60, Err 0.90113, Err train 3.28905
Iteration 80, Err 0.90113, Err train 3.28905
Iteration 100, Err 0.90113, Err train 3.28905
CPU times: user 3.43 s, sys: 1.12 s, total: 4.55 s
Wall time: 3.43 s

Despite good results in few seconds on this dataset, this can only get us so far.
We then have to add regularization to the cost function.

Evaluation


In [51]:
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
        
    def predict(self,user,movie):
        return self.resMatrix[int(user)][int(self.dicI[movie])]

In [37]:
mf= evalMF(resMatrix,data,movies)

In [52]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in evalArrayTest]).mean()


Out[52]:
0.91739675737704607

We usualy see an important difference between users, so we need to take the bias into account.


In [53]:
summ=0
for i in data["1"]:
    summ+=(float(data["1"][i]) - mf.predict("1",i))**2
summ/len(data["1"])


Out[53]:
0.96738560601630563

In [54]:
summ=0
for i in data["3"]:
    summ+=(float(data["3"][i]) - mf.predict("3",i))**2
summ/len(data["3"])


Out[54]:
1.2849702647843322


In [470]:
#self.lamb*np.linalg.norm(self.theta)

#from scipy import linalg

def nmf(X, latent_features, max_iter=100, eps = 1e-5, printevery=100):

    print "NMF with %d latent features, %d iterations."%(latent_features, max_iter)
    
    #lamb = 0.2
    

    #X = copy.deepcopy(rawMatrix)
    #with np.errstate(all='ignore'):
    #avg_m = X.sum(0)/(X != 0).sum(0)
    avg_u = X.sum(1)/(X != 0).sum(1)
    #diff_m = avg_m - avg_m.mean()
    diff_u = avg_u - avg_u.mean()
    print(avg_u.mean())
    #X = X - diff_m
    for idxi,i in enumerate(X):
        for idxj,j in enumerate(i):
            if X[idxi,idxj]!=0:
                X[idxi,idxj]+=diff_u[idxi]

    # mask used to ignore null element (coded by zero)
    mask = np.sign(X)

    # randomly initialized matrix
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvments
    #Y = linalg.lstsq(A, X)[0]  # initializing that way as recommanded in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too low values

    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)    # used for prints
    masktrain = np.sign(rawMatrixTrain)  # used for prints
    
    prev_A = A
    prev_Y = Y
    

    

    #diff_u = (avg_u - avg_u.mean().T).T
    #(np.array([1,5])-mat.T).T
    

    for i in range(1, max_iter + 1):

        top = np.dot(masked_X, Y.T)
        esti = np.dot((mask * np.dot(A, Y)), Y.T)
        #esti = esti - diff_u
        bottom = esti + eps
        #print("val",np.shape(top/bottom))
        A *= top / bottom
        
        top = np.dot(A.T, masked_X)
        esti = np.dot(A.T, mask * np.dot(A, Y))
        #esti = esti - diff_m
        bottom = esti + eps
        #print("lav",np.shape(top/bottom))
        tb = top / bottom
        #print(np.linalg.norm(tb))
        no = np.linalg.norm(tb)
        Y *= tb
        #Y *= (0.9 * tb) + ( 0.1 * ( tb + (1/no) )  )
        
        """
        if i % 10 == 0:
            diff = np.abs(Y - prev_Y)
            diff = diff - 0.1
            Y = np.sign(diff)*Y
            prev_Y = Y
        """

        # evaluation
        if i % 10 == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest*X_est - rawMatrixTest
            q_train = masktrain*X_est - rawMatrixTrain
            #print(np.linalg.norm(tb))
            print "Iteration %d, Err %.05f, Err train %.05f"%( i, (q*q).sum()/ masktest.sum(), (q_train*q_train).sum()/ masktest.sum() )
            
    return A, Y

In [450]:
mat = np.array([[1,2,3],
       [4,5,6]])
print(np.array([1,5,10])-mat)
print (np.array([1,5])-mat.T).T


[[ 0  3  7]
 [-3  0  4]]
[[ 0 -1 -2]
 [ 1  0 -1]]

In [454]:
X = copy.deepcopy(rawMatrix)
#with np.errstate(all='ignore'):
#avg_m = X.sum(0)/(X != 0).sum(0)
avg_u = X.sum(1)/(X != 0).sum(1)
#diff_m = avg_m - avg_m.mean()
diff_u = avg_u - avg_u.mean()
print(avg_u.mean())
#X = X - diff_m
for idxi,i in enumerate(X):
    for idxj,j in enumerate(i):
        if X[idxi,idxj]!=0:
            X[idxi,idxj]+=diff_u[idxi]


3.58825202241

In [469]:
%%time
X = copy.deepcopy(rawMatrixTrain)
A,Y = nmf(X,1,eps = 1e-5,max_iter=100,printevery=10)
resMatrix = A.dot(Y)


NMF with 1 latent features, 100 iterations.
Iteration 1, Err 0.96388, Err train 3.61444
Iteration 10, Err 0.90375, Err train 3.37212
Iteration 20, Err 0.90375, Err train 3.37212
Iteration 30, Err 0.90375, Err train 3.37212
Iteration 40, Err 0.90375, Err train 3.37212
Iteration 50, Err 0.90375, Err train 3.37212
Iteration 60, Err 0.90375, Err train 3.37212
Iteration 70, Err 0.90375, Err train 3.37212
Iteration 80, Err 0.90375, Err train 3.37212
Iteration 90, Err 0.90375, Err train 3.37212
Iteration 100, Err 0.90375, Err train 3.37212
CPU times: user 3.51 s, sys: 897 ms, total: 4.41 s
Wall time: 3.29 s

In [355]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=10)
resMatrix = A.dot(Y)


NMF with 1 latent features, 100 iterations.
Iteration 1, Err 0.96614, Err train 3.61000
Iteration 10, Err 0.90375, Err train 3.37212
Iteration 20, Err 0.90375, Err train 3.37212
Iteration 30, Err 0.90375, Err train 3.37212
Iteration 40, Err 0.90375, Err train 3.37212
Iteration 50, Err 0.90375, Err train 3.37212
Iteration 60, Err 0.90375, Err train 3.37212
Iteration 70, Err 0.90375, Err train 3.37212
Iteration 80, Err 0.90375, Err train 3.37212
Iteration 90, Err 0.90375, Err train 3.37212
Iteration 100, Err 0.90375, Err train 3.37212
CPU times: user 3.96 s, sys: 1.11 s, total: 5.08 s
Wall time: 3.96 s

/!\ 18 movies have no ratings at all

so we get a divide by zero warning. Ignored with:


In [321]:
with np.errstate(all='ignore'):
    avg_m = rawMatrix.sum(0)/(rawMatrix != 0).sum(0)
    avg_u = rawMatrix.sum(1)/(rawMatrix != 0).sum(1)

In [144]:
np.shape(t)


Out[144]:
(10, 1683)

In [124]:
tt = t[0]

In [125]:
tt


Out[125]:
array([ 0.        ,  1.11761125,  1.06896261, ...,  1.081234  ,
        0.85526852,  1.78160155])


In [226]:
resMatrix = A.dot(Y)

In [227]:
a=np.array((1,2,4))
b=np.array((1,3,6))
(a-b).dot(a-b)

masqueTest=np.sign(rawMatrixTest)
masqueTest[:10,:10]

A=masqueTest*rawMatrix

In [228]:
aa = masqueTest*resMatrix

In [229]:
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5

In [235]:
q = masqueTest*resMatrix - rawMatrixTest

In [236]:
(q*q).sum()/ masqueTest.sum()


Out[236]:
1.1273948891755721

In [33]:
masqueTest=np.sign(rawMatrixTest)
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()


Out[33]:
1.1465785558244217

In [59]:
mf = evalMF(resMatrix,data,movies)

In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")


4.0
3.68121941569
1.0
0.96164687419

In [61]:
print train["1"]["All Dogs Go to Heaven 2 (1996)"]
print test["1"]["Akira (1988)"]


1.0
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-61-ccddd715027a> in <module>()
      1 print train["1"]["All Dogs Go to Heaven 2 (1996)"]
----> 2 print test["1"]["Akira (1988)"]

KeyError: 'Akira (1988)'

In [80]:
len(rawMatrixTest)


Out[80]:
944

In [78]:
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()


(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)
Out[78]:
2.0909226578962459

In [87]:
t = []
c = 10
for idxi,i in enumerate(resMatrix):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrix[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()


(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)
Out[87]:
2.0909226578962459

In [108]:
t = []
c = 3
for idxi,i in enumerate(rawMatrixTrain):
    for idxj,j in enumerate(i):
        if rawMatrixTrain[idxi][idxj] != 0:
            t.append( (float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj])**2 )
            if c>0:
                print(rawMatrixTrain[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()


(5.0, 4.3114944585785064)
(3.0, 3.3257920434476187)
(3.0, 4.0022181201367522)
Out[108]:
0.45320954210197834

In [80]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-80-0241c027db0a> in <module>()
----> 1 np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()

<ipython-input-52-9fb67320d8a3> in predict(self, user, movie)
      8 
      9     def predict(self,user,movie):
---> 10         return self.resMatrix[int(user)][int(self.dicI[movie])]

KeyError: 0.0


In [60]:
R = [
     [5,3,5,3],
     [4,0,0,1],
     [1,5,1,5],
     [1,0,1,4],
     [0,4,5,4],
    ]

R = np.array(R)
K = 10
np.random.rand


Out[60]:
<function rand>

In [87]:
%%time
nP, nQ = matrix_factorization(R, K, steps=1000)


CPU times: user 1.55 s, sys: 18.9 ms, total: 1.57 s
Wall time: 1.57 s

In [88]:
nR = np.dot(nP, nQ.T)

In [89]:
((nR-R)**2).sum()/np.sign(R).sum()


Out[89]:
2.7470816228443531


In [59]:
R = rawMatrixTrain

In [74]:
def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02, decay=0.95):
    N,M = np.shape(R)
    P = np.random.rand(N,K)
    #P = np.maximum(P, eps)
    
    #Q = np.random.rand(M,K).T
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)

    #masked_X = mask * X
    #X_est_prev = dot(A, Y)
    
    #mask = np.sign(R)
    #masked_R = mask * R
    
    masktest = np.sign(rawMatrixTest)
    masktrain = np.sign(rawMatrixTrain)
    
    
    for step in xrange(1,steps+1):
        #"""
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    P[i] = P[i] + eps * (2 * eij * Q.T[j] - beta * P[i])
                    #Q[i] = P[i] + eps * (2 * eij * Q.T[j] - beta * P[i])
                    Q.T[j] = Q.T[j] + eps * (2 * eij * P[i] - beta * Q.T[j])
                    #for k in xrange(K):
                    #    P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                        #Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])
        """
        ###
        top = np.dot(masked_R, Q.T)
        bottom = (np.dot((mask * np.dot(P, Q)), Q.T)) + eps
        
        P *= top / bottom

        P = np.maximum(P, eps)
        # print 'A',  np.round(A, 2)

        top = np.dot(P.T, masked_R)
        bottom = np.dot(P.T, mask * np.dot(P, Q)) + eps
        Q *= top / bottom
        Q = np.maximum(Q, eps)
        # print 'Y', np.round(Y, 2)
        """
        if step%5:
            eps=eps*decay
        
        if step % 10 == 0 or step == 1 or step == steps:

            X_est = dot(P, Q)
            q = masktest*X_est - rawMatrixTest
            q_train = masktrain*X_est - rawMatrixTrain
            print "Iteration %d, Err %.05f, Err on train %.05f"%( step, (q*q).sum()/ masktest.sum(), (q_train*q_train).sum()/ masktest.sum() )
            
        
        
    return P, Q.T

In [67]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=20,eps=1e-3)


Iteration 1, Err 9.20925, Err train 35.62599
Iteration 10, Err 6.10217, Err train 23.23107
Iteration 20, Err 4.98627, Err train 18.82843
CPU times: user 1min, sys: 333 ms, total: 1min 1s
Wall time: 1min 1s

In [70]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=100,eps=1e-5)


Iteration 1, Err 3.33905, Err train 12.59114
Iteration 10, Err 1.07766, Err train 3.68302
Iteration 20, Err 0.98629, Err train 3.23860
Iteration 30, Err 0.95904, Err train 3.02876
Iteration 40, Err 0.94580, Err train 2.85712
Iteration 50, Err 0.93984, Err train 2.70948
Iteration 60, Err 0.93874, Err train 2.58756
Iteration 70, Err 0.94040, Err train 2.48769
Iteration 80, Err 0.94345, Err train 2.40503
Iteration 90, Err 0.94717, Err train 2.33584
Iteration 100, Err 0.95115, Err train 2.27742
CPU times: user 6min 16s, sys: 2.28 s, total: 6min 18s
Wall time: 6min 17s

In [72]:
nR = np.dot(nP, nQ.T)
((nR-R)**2).sum()/np.sign(R).sum()


Out[72]:
174.05240132990514

In [79]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=50,eps=1e-2)


Iteration 1, Err 5.05876, Err train 19.84190
Iteration 10, Err 0.98519, Err train 2.54577
Iteration 20, Err 0.98575, Err train 2.19544
Iteration 30, Err 0.98830, Err train 2.06617
Iteration 40, Err 0.99066, Err train 2.00391
Iteration 50, Err 0.99287, Err train 1.97126
CPU times: user 3min 16s, sys: 1.44 s, total: 3min 17s
Wall time: 3min 17s

In [ ]:
%%time
N,M = np.shape(R)
P = np.random.rand(N,K)
#P = np.maximum(P, eps)

#Q = np.random.rand(M,K).T
Q = linalg.lstsq(P, R)[0]
Q = np.maximum(Q, eps)

#masked_X = mask * X
#X_est_prev = dot(A, Y)

#mask = np.sign(R)
#masked_R = mask * R

for i in xrange(len(R)):
    for j in xrange(len(R[i])):
        if R[i][j] > 0:
            eij = R[i][j] - np.dot(P[i,:],Q[:,j])
            for k in xrange(K):
                P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])

In [161]:
for _ in range(1,5):
    nP, nQ = matrix_factorization(R, K, steps=1000,eps=1e-3)
    nR = np.dot(nP, nQ.T)
    print ((nR-GR)**2).sum()/np.sign(GR).sum()


54.0298250116
56.7997458309
54.0097982207
54.3613471474

In [141]:
GR = [
     [0,0,0,0],
     [0,1,1,0],
     [0,0,0,0],
     [0,4,0,0],
     [4,0,0,0],
    ]

In [139]:
R


Out[139]:
array([[5, 3, 5, 3],
       [4, 0, 0, 1],
       [1, 5, 1, 5],
       [1, 0, 1, 4],
       [0, 4, 5, 4]])

In [149]:
nR


Out[149]:
array([[ 4.98064509,  3.00337373,  4.98567601,  2.99254764],
       [ 3.98478181,  1.19632854,  1.66596662,  1.0013092 ],
       [ 0.99961108,  4.9783092 ,  1.01288321,  4.97950296],
       [ 0.99817346,  2.34980003,  1.0067616 ,  3.97615341],
       [ 1.26528218,  3.99790715,  4.98389234,  3.9907852 ]])

In [55]:
R[1,1]


Out[55]:
5.0




In [98]:
from scipy import linalg

In [100]:
rows, columns = R.shape
A = np.random.rand(rows, 2)

In [ ]:
mask = np.sign(X)
    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = dot(A, Y)
# updates
        top = dot(masked_X, Y.T)
        bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)
        # print 'A',  np.round(A, 2)

        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # print 'Y', np.round(Y, 2)


        # evaluation
        if i % 200 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

In [190]:
%%time

R = rawMatrixTrain

nP, nQ = matrix_factorization(R, 10, steps=40,eps=1e-3)
nR = np.dot(nP, nQ.T)


CPU times: user 5min 57s, sys: 1.89 s, total: 5min 59s
Wall time: 6min 1s

In [191]:
masqueTest=np.sign(rawMatrixTest)

aa=masqueTest*rawMatrix
"""
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
            """
q = masqueTest*nR - rawMatrixTest

(q*q).sum()/ masqueTest.sum()


Out[191]:
0.93011954506905736

In [30]:
nR[:5,:5]


Out[30]:
array([[ 2.2328501 ,  2.86127689,  2.78138309,  2.18936119,  1.94124401],
       [ 2.873597  ,  3.01024103,  3.39995977,  2.86989372,  2.98342851],
       [ 2.39437824,  2.9654892 ,  2.95946595,  2.25596745,  2.07565552],
       [ 1.51408587,  2.19458866,  2.42556026,  1.70330838,  1.66845702],
       [ 3.28392919,  3.92596689,  3.90235368,  3.32824406,  3.11129257]])

In [31]:
rawMatrix[:5,:5]


Out[31]:
array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.],
       [ 0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [32]:
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-32-3ba1d810045a> in <module>()
----> 1 mf= evalMF(nR,data,movies)
      2 mf.predict("1","Akira (1988)")

NameError: name 'evalMF' is not defined

In [47]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()


Out[47]:
1.5532842864204328

In [ ]: