TME4 FDMS Collaborative Filtering

Florian Toqué & Paul Willot


In [66]:
from random import random
import math
import numpy as np
import copy

Loading the data


In [67]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles (u.item is pipe-separated: id|title|...)
    movies={}
    rev_movies={}
    for line in open(path+'/u.item'):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs,rev_movies

In [68]:
data,movies = loadMovieLens("data/ml-100k")

Content example


In [69]:
data['3']


Out[69]:
{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Liar (1997)': 2.0,
 'Lost Highway (1997)': 2.0,
 'Mad City (1997)': 3.0,
 'Man Who Knew Too Little, The (1997)': 4.0,
 'Mimic (1997)': 2.0,
 'Mother (1996)': 5.0,
 'Murder at 1600 (1997)': 3.0,
 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)': 5.0,
 'Playing God (1997)': 1.0,
 'Prophecy II, The (1998)': 3.0,
 'Return of the Jedi (1983)': 4.0,
 "Schindler's List (1993)": 4.0,
 'Scream (1996)': 2.0,
 'Sphere (1998)': 3.0,
 'Spice World (1997)': 2.0,
 'Starship Troopers (1997)': 3.0,
 'U Turn (1997)': 3.0,
 "Ulee's Gold (1997)": 3.0,
 'Wag the Dog (1997)': 5.0,
 'Wedding Singer, The (1998)': 3.0}

Splitting data between train/test

We avoid keeping users or movies in the test set that never appear in the train set.
We also try to minimise the reduction of the dataset by splitting within each user.


In [70]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)

In [71]:
# splitting while avoiding to reduce the dataset too much
def split_train_test(data,percent_test):
    test={}
    train={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            #print(data[u][movie])
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test

In [72]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                test.setdefault(movie,{})
                test[movie][u]=data[u][movie]
            else:
                train.setdefault(movie,{})
                train[movie][u]=data[u][movie]
    return train, test

In [73]:
percent_test=0.2
train,test=split_train_test(data,percent_test)

This split is used for convenience by the average-by-movie baseline.


In [74]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)

Cleaning


In [75]:
def deleteUnseenInTest(train,test):
    # drop test keys (users or movies) that never appear in the train set
    for k in list(test.keys()):
        if k not in train:
            test.pop(k,None)

In [76]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)

Matrix used for fast evaluation


In [77]:
evalArrayAll = getRawArray(data)
evalArrayTest = getRawArray(test)

In [78]:
evalArrayTest[:10,:10]


Out[78]:
array([['344', 'Ransom (1996)', '3.0'],
       ['344', 'Strictly Ballroom (1992)', '5.0'],
       ['344', 'Mars Attacks! (1996)', '3.0'],
       ['344', 'Twister (1996)', '3.0'],
       ['344', 'People vs. Larry Flynt, The (1996)', '4.0'],
       ['344', 'Out to Sea (1997)', '2.0'],
       ['344', 'Sabrina (1954)', '4.0'],
       ['344', 'Secrets & Lies (1996)', '5.0'],
       ['344', 'Copycat (1995)', '3.0'],
       ['344', "Smilla's Sense of Snow (1997)", '3.0']], 
      dtype='|S79')

Baseline: mean by user


In [79]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        # every user appears in the train set thanks to the per-user split
        return [self.users[u] for u in users]

In [80]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(evalArrayTest[:,0])
mse = ((np.array(pred) - np.array(evalArrayTest[:,2], float)) ** 2).mean()
print("Mean Squared Error %0.6f" % mse)


Mean Squared Error 1.064954

In [81]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                # movie unseen at train time: fall back to a neutral rating
                res.append(3)
        return res

In [82]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(evalArrayTest[:,1])
mse = ((np.array(pred) - np.array(evalArrayTest[:,2], float)) ** 2).mean()
print("Mean Squared Error %0.6f" % mse)


Mean Squared Error 0.981749

We use a raw dense matrix for convenience and clarity.
Structures like SciPy sparse matrices or Python dictionaries could be used for a speedup.
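For reference, a minimal sketch (our own, not used below) of the same ratings stored in a SciPy CSR matrix, assuming the data and movies dictionaries loaded above:


In [ ]:
# hypothetical alternative: a sparse matrix instead of the dense one used below
from scipy.sparse import csr_matrix

rows = [int(u) - 1 for u in data for m in data[u]]          # user indices
cols = [int(movies[m]) - 1 for u in data for m in data[u]]  # movie indices
vals = [data[u][m] for u in data for m in data[u]]          # ratings
sparseRatings = csr_matrix((vals, (rows, cols)), shape=(943, 1682))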

Complete dataset


In [83]:
rawMatrix = np.zeros((len(data.keys()),1682))  # 943 users x 1682 movies in ml-100k
for u in data:
    for m in data[u]:
        rawMatrix[int(u)-1][int(movies[m])-1] = data[u][m]

In [84]:
print(np.shape(rawMatrix))
rawMatrix[:10,:10]


(943, 1682)
Out[84]:
array([[ 5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.,  3.],
       [ 4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.,  0.],
       [ 0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.,  4.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.,  0.],
       [ 4.,  0.,  0.,  4.,  0.,  0.,  4.,  0.,  4.,  0.]])

Train and test dataset


In [85]:
rawMatrixTrain = np.zeros((len(data.keys()),1682))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u)-1][int(movies[m])-1] = train[u][m]
        
rawMatrixTest = np.zeros((len(data.keys()),1682))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u)-1][int(movies[m])-1] = test[u][m]

Non-negative Matrix Factorization

Fast implementation using NumPy's vectorized matrix operations.
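With $M$ the binary mask of observed ratings and $\circ$ the elementwise product, the updates implemented below are the standard Lee-Seung multiplicative rules restricted to the observed entries (divisions are elementwise):

$$A \leftarrow A \circ \frac{(M \circ X)\,Y^{\top}}{(M \circ (A Y))\,Y^{\top} + \epsilon}, \qquad Y \leftarrow Y \circ \frac{A^{\top}(M \circ X)}{A^{\top}(M \circ (A Y)) + \epsilon}$$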


In [38]:
#from scipy import linalg

def nmf(X, latent_features, max_iter=100, eps = 1e-5,printevery=100):

    print "NMF with %d latent features, %d iterations."%(latent_features, max_iter)

    # mask used to ignore missing ratings (coded by zero)
    mask = np.sign(X)

    # randomly initialized matrix
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvements
    #Y = linalg.lstsq(A, X)[0]  # initializing that way as recommended in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too low values

    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)    # used for prints
    masktrain = np.sign(rawMatrixTrain)  # used for prints

    for i in range(1, max_iter + 1):

        top = np.dot(masked_X, Y.T)
        bottom = (np.dot((mask * np.dot(A, Y)), Y.T)) + eps
        A *= top / bottom
        
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom


        # evaluation
        # NB: both errors are normalized by the number of *test* ratings,
        # so the absolute "Err train" values printed are inflated (~4x here)
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest*X_est - rawMatrixTest
            q_train = masktrain*X_est - rawMatrixTrain
            print "Iteration %d, Err %.05f, Err train %.05f"%( i, (q*q).sum()/ masktest.sum(), (q_train*q_train).sum()/ masktest.sum() )
            
    return A, Y

In [86]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=5,printevery=1)
resMatrix = A.dot(Y)


NMF with 100 latent features, 5 iterations.
Iteration 1, Err 0.92854, Err train 3.48107
Iteration 2, Err 0.89742, Err train 3.29107
Iteration 3, Err 0.89667, Err train 3.22399
Iteration 4, Err 0.89701, Err train 3.16422
Iteration 5, Err 0.89745, Err train 3.10647
CPU times: user 984 ms, sys: 118 ms, total: 1.1 s
Wall time: 617 ms

We see that it quickly gets better than the baseline.
However, as shown below, it overfits after that:


In [25]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=500,printevery=100)
resMatrix = A.dot(Y)


NMF with 100 latent features, 500 iterations.
Iteration 1, Err 0.93665, Err train 3.52000
Iteration 100, Err 1.19580, Err train 0.55700
Iteration 200, Err 1.38603, Err train 0.28243
Iteration 300, Err 1.49323, Err train 0.19640
Iteration 400, Err 1.56915, Err train 0.15397
Iteration 500, Err 1.62885, Err train 0.12858
CPU times: user 1min 9s, sys: 3.95 s, total: 1min 13s
Wall time: 31.6 s

This is due to the high sparsity of the matrix.
We can of course reduce the number of latent features to avoid overfitting, but that would limit further improvements.
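Another simple remedy (our suggestion, not used in this notebook) is early stopping: monitor the error on held-out ratings and stop the updates once it stops improving. A minimal sketch, reusing the test mask for illustration only (a proper validation split would be cleaner):


In [ ]:
# early stopping sketch: halt the masked multiplicative updates when the
# held-out error has not improved for `patience` iterations
def nmf_early_stop(X, latent_features, max_iter=500, eps=1e-5, patience=5):
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)
    best_err, bad = np.inf, 0
    for i in range(max_iter):
        A *= np.dot(masked_X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + eps)
        Y *= np.dot(A.T, masked_X) / (np.dot(A.T, mask * np.dot(A, Y)) + eps)
        q = masktest * np.dot(A, Y) - rawMatrixTest
        err = (q * q).sum() / masktest.sum()
        if err < best_err:
            best_err, bad = err, 0
        else:
            bad += 1
            if bad >= patience:
                break
    return A, Y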


In [87]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=20)
resMatrix = A.dot(Y)


NMF with 1 latent features, 100 iterations.
Iteration 1, Err 0.93438, Err train 3.57343
Iteration 20, Err 0.88116, Err train 3.36634
Iteration 40, Err 0.88116, Err train 3.36634
Iteration 60, Err 0.88116, Err train 3.36634
Iteration 80, Err 0.88116, Err train 3.36634
Iteration 100, Err 0.88116, Err train 3.36634
CPU times: user 3.34 s, sys: 738 ms, total: 4.07 s
Wall time: 2.98 s

Despite good results within a few seconds on this dataset, this can only get us so far.
We therefore have to add regularization to the cost function.
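A minimal sketch of one common way to do so (our own assumption, separate from the attempts further down): adding an L2 penalty lambda*(||A||^2 + ||Y||^2) to the masked squared error simply adds lambda*A and lambda*Y to the denominators of the multiplicative updates.


In [ ]:
def nmf_l2(X, latent_features, lamb=0.1, max_iter=100, eps=1e-5):
    # masked multiplicative updates with an L2 penalty on A and Y
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    masked_X = mask * X
    for i in range(max_iter):
        A *= np.dot(masked_X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + lamb * A + eps)
        Y *= np.dot(A.T, masked_X) / (np.dot(A.T, mask * np.dot(A, Y)) + lamb * Y + eps)
    return A, Y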

Evaluation


In [61]:
## This class is used to make predictions
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
        
    def predict(self,user,movie):
        return self.resMatrix[int(user)-1][int(self.dicI[movie])-1]

In [45]:
mf = evalMF(resMatrix,data,movies)

In [62]:
# np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in evalArrayTest]).mean()
# faster evaluation
masqueTest=np.sign(rawMatrixTest)
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()


Out[62]:
0.89275690554990295

In [63]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["I.Q. (1994)"]
print mf.predict("1","I.Q. (1994)")


4.0
3.66336132148
3.0
3.32141384965

We usually see a significant difference between users, so we need to take user bias into account.


In [33]:
summ=0
for i in data["1"]:
    summ+=(float(data["1"][i]) - mf.predict("1",i))**2
summ/len(data["1"])


Out[33]:
0.95019735270796091

In [34]:
summ=0
for i in data["3"]:
    summ+=(float(data["3"][i]) - mf.predict("3",i))**2
summ/len(data["3"])


Out[34]:
1.3200645835765501

Various attempts to incorporate the bias and the L1 regularization can be found below.
However, we have not been very successful with them yet...
A simpler yet slower model can be found at the bottom of the page, in which the bias and L1 regularization can be added easily.
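For reference first, a minimal bias-only sketch (our own, separate from the attempts below): predict the global mean plus a user offset and an item offset, evaluated the same way as the factorization above.


In [ ]:
# bias-only predictor: global mean + user offset + item offset
mask = np.sign(rawMatrixTrain)
mu = rawMatrixTrain.sum() / mask.sum()
with np.errstate(all='ignore'):
    user_bias = rawMatrixTrain.sum(1) / mask.sum(1) - mu
    item_bias = rawMatrixTrain.sum(0) / mask.sum(0) - mu
user_bias = np.nan_to_num(user_bias)   # users with no train ratings
item_bias = np.nan_to_num(item_bias)   # movies with no train ratings
biasMatrix = mu + user_bias[:, None] + item_bias[None, :]

masqueTest = np.sign(rawMatrixTest)
q = masqueTest * biasMatrix - rawMatrixTest
print((q * q).sum() / masqueTest.sum())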



In [96]:
#self.lamb*np.linalg.norm(self.theta)

#from scipy import linalg

def nmf(X, latent_features, max_iter=100, eps = 1e-5, printevery=100):

    print "NMF with %d latent features, %d iterations."%(latent_features, max_iter)
    
    #lamb = 0.2
    

    ## User and item bias
    
    #X = copy.deepcopy(rawMatrix)
    #with np.errstate(all='ignore'):
    #avg_m = X.sum(0)/(X != 0).sum(0)
    #avg_u = X.sum(1)/(X != 0).sum(1)
    #diff_m = avg_m - avg_m.mean()
    #diff_u = avg_u - avg_u.mean()
    #print(avg_u.mean())
    #X = X - diff_m
    #for idxi,i in enumerate(X):
    #    for idxj,j in enumerate(i):
    #        if X[idxi,idxj]!=0:
    #            X[idxi,idxj]+=diff_u[idxi]

    mask = np.sign(X)

    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvements
    #Y = linalg.lstsq(A, X)[0]  # initializing that way as recommended in a blog post
    #Y = np.maximum(Y, eps)     # avoiding too low values

    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)    # used for prints
    masktrain = np.sign(rawMatrixTrain)  # used for prints
    
    prev_A = A
    prev_Y = Y

    for i in range(1, max_iter + 1):

        top = np.dot(masked_X, Y.T)
        esti = np.dot((mask * np.dot(A, Y)), Y.T)
        #esti = esti - diff_u
        bottom = esti + eps
        #print("val",np.shape(top/bottom))
        A *= top / bottom
        
        top = np.dot(A.T, masked_X)
        esti = np.dot(A.T, mask * np.dot(A, Y))
        #esti = esti - diff_m
        bottom = esti + eps
        #print("lav",np.shape(top/bottom))
        tb = top / bottom
        no = np.linalg.norm(tb)
        # experimental: mix the plain multiplicative update with a
        # norm-dependent nudge (the unstable part noted below)
        #Y *= tb
        Y *= (0.9 * tb) + ( 0.1 * ( tb + (1/no) )  )
        
        """
        ## Regularization
        if i % 10 == 0:
            diff = np.abs(Y - prev_Y)
            diff = diff - 0.001
            Y = np.sign(diff)*Y
            prev_Y = Y
        """

        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest*X_est - rawMatrixTest
            q_train = masktrain*X_est - rawMatrixTrain
            print(np.linalg.norm(tb))
            print "Iteration %d, Err %.05f, Err train %.05f"%( i, (q*q).sum()/ masktest.sum(), (q_train*q_train).sum()/ masktest.sum() )
            
    return A, Y

This is quite unstable:


In [99]:
%%time
X = copy.deepcopy(rawMatrixTrain)
A,Y = nmf(X,1,eps = 1e-5,max_iter=100,printevery=10)
resMatrix = A.dot(Y)


NMF with 1 latent features, 100 iterations.
2269.30475923
Iteration 1, Err 0.94484, Err train 3.61341
40.3112135185
Iteration 10, Err 0.88123, Err train 3.36666
40.3112902647
Iteration 20, Err 0.88123, Err train 3.36666
40.3112902218
Iteration 30, Err 0.88123, Err train 3.36666
40.3112901646
Iteration 40, Err 0.88123, Err train 3.36666
40.3112901082
Iteration 50, Err 0.88123, Err train 3.36666
40.3112900527
Iteration 60, Err 0.88123, Err train 3.36666
40.311289998
Iteration 70, Err 0.88123, Err train 3.36666
40.311289944
Iteration 80, Err 0.88123, Err train 3.36666
40.3112898908
Iteration 90, Err 0.88123, Err train 3.36666
40.3112898383
Iteration 100, Err 0.88123, Err train 3.36666
CPU times: user 3.53 s, sys: 814 ms, total: 4.34 s
Wall time: 3.26 s

/!\ 18 movies have no ratings at all,

so we get a divide-by-zero warning when computing per-movie averages. It is silenced with:


In [321]:
with np.errstate(all='ignore'):
    avg_m = rawMatrix.sum(0)/(rawMatrix != 0).sum(0)
    avg_u = rawMatrix.sum(1)/(rawMatrix != 0).sum(1)
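The resulting NaN averages (for those 18 movies) can then be replaced, for instance by the global mean; a small sketch under that assumption:


In [ ]:
# replace NaN averages (movies with no ratings) by the global mean rating
global_mean = rawMatrix.sum() / (rawMatrix != 0).sum()
avg_m = np.where(np.isnan(avg_m), global_mean, avg_m)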


In [33]:
masqueTest=np.sign(rawMatrixTest)
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()


Out[33]:
1.1465785558244217

In [59]:
mf = evalMF(resMatrix,data,movies)

In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")


4.0
3.68121941569
1.0
0.96164687419

In [64]:
len(rawMatrixTest)


Out[64]:
943

In [65]:
# per-entry squared error on the test set; print the first 10 (truth, prediction) pairs
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()


(4.0, 3.8396463699257595)
(3.0, 3.5458851979095165)
(2.0, 3.2755712334736713)
(4.0, 3.5087244595199025)
(2.0, 2.4510027204018821)
(3.0, 3.3214138496532524)
(5.0, 3.7460132079987969)
(4.0, 4.146715732882595)
(5.0, 4.2383683022905867)
(3.0, 3.2597595068592664)
Out[65]:
0.89275690554990295



In [59]:
R = rawMatrixTrain

In [74]:
from scipy import linalg  # needed for the least-squares initialization below

def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02, decay=0.95):
    N,M = np.shape(R)
    P = np.random.rand(N,K)

    # initialize Q by least squares against R, clipped to stay positive
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)

    masktest = np.sign(rawMatrixTest)
    masktrain = np.sign(rawMatrixTrain)

    for step in xrange(1,steps+1):
        # SGD over the observed (non-zero) entries only
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    P[i] = P[i] + eps * (2 * eij * Q.T[j] - beta * P[i])
                    Q.T[j] = Q.T[j] + eps * (2 * eij * P[i] - beta * Q.T[j])

        # decay the learning rate (on every step that is not a multiple of 5)
        if step%5:
            eps=eps*decay

        if step % 10 == 0 or step == 1 or step == steps:
            X_est = np.dot(P, Q)
            q = masktest*X_est - rawMatrixTest
            q_train = masktrain*X_est - rawMatrixTrain
            # NB: as above, the train error is normalized by the test count
            print "Iteration %d, Err %.05f, Err train %.05f"%( step, (q*q).sum()/ masktest.sum(), (q_train*q_train).sum()/ masktest.sum() )

    return P, Q.T

In [70]:
%%time
K = 10
nP, nQ = matrix_factorization(R, K, steps=100,eps=1e-5)


Iteration 1, Err 3.33905, Err train 12.59114
Iteration 10, Err 1.07766, Err train 3.68302
Iteration 20, Err 0.98629, Err train 3.23860
Iteration 30, Err 0.95904, Err train 3.02876
Iteration 40, Err 0.94580, Err train 2.85712
Iteration 50, Err 0.93984, Err train 2.70948
Iteration 60, Err 0.93874, Err train 2.58756
Iteration 70, Err 0.94040, Err train 2.48769
Iteration 80, Err 0.94345, Err train 2.40503
Iteration 90, Err 0.94717, Err train 2.33584
Iteration 100, Err 0.95115, Err train 2.27742
CPU times: user 6min 16s, sys: 2.28 s, total: 6min 18s
Wall time: 6min 17s

In [72]:
nR = np.dot(nP, nQ.T)
# NB: this difference runs over ALL entries, including the unrated (zero)
# cells of R that nR fills in, hence the large value; mask with np.sign(R)
# to measure the error on observed ratings only
((nR-R)**2).sum()/np.sign(R).sum()


Out[72]:
174.05240132990514
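As noted above, bias terms are easy to add to this slower SGD model. A minimal sketch of the classic biased formulation (the names mu, bu, bi and the exact update form are our assumptions, not code that was run here):


In [ ]:
def matrix_factorization_biased(R, K, steps=100, eps=0.001, beta=0.02):
    # SGD on observed entries with a global mean and user/item bias terms
    N, M = np.shape(R)
    P = np.random.rand(N, K)
    Q = np.random.rand(K, M)
    mask = np.sign(R)
    mu = R.sum() / mask.sum()   # global mean rating
    bu = np.zeros(N)            # user biases
    bi = np.zeros(M)            # item biases
    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i][j] > 0:
                    eij = R[i][j] - (mu + bu[i] + bi[j] + np.dot(P[i, :], Q[:, j]))
                    bu[i] += eps * (2 * eij - beta * bu[i])
                    bi[j] += eps * (2 * eij - beta * bi[j])
                    P[i] += eps * (2 * eij * Q[:, j] - beta * P[i])
                    Q[:, j] += eps * (2 * eij * P[i] - beta * Q[:, j])
    return mu, bu, bi, P, Q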



