notebook.community

Edit and run



In [1]:

    
from random import random
import math
import numpy as np
import copy



In [2]:

    
def loadMovieLens(path='./data/movielens'):
    #Get movie titles
    movies={}
    rev_movies={}
    for idx,line in enumerate(open(path+'/u.item')):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs,rev_movies



In [3]:

    
data,movies = loadMovieLens("data/ml-100k")



In [4]:

    
data['3']









    Out[4]:





{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Liar (1997)': 2.0,
 'Lost Highway (1997)': 2.0,
 'Mad City (1997)': 3.0,
 'Man Who Knew Too Little, The (1997)': 4.0,
 'Mimic (1997)': 2.0,
 'Mother (1996)': 5.0,
 'Murder at 1600 (1997)': 3.0,
 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)': 5.0,
 'Playing God (1997)': 1.0,
 'Prophecy II, The (1998)': 3.0,
 'Return of the Jedi (1983)': 4.0,
 "Schindler's List (1993)": 4.0,
 'Scream (1996)': 2.0,
 'Sphere (1998)': 3.0,
 'Spice World (1997)': 2.0,
 'Starship Troopers (1997)': 3.0,
 'U Turn (1997)': 3.0,
 "Ulee's Gold (1997)": 3.0,
 'Wag the Dog (1997)': 5.0,
 'Wedding Singer, The (1998)': 3.0}



In [5]:

    
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)



In [6]:

    
# splitting while avoiding to reduce the dataset too much
def split_train_test(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            #print(data[u][movie])
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test



In [7]:

    
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                try:
                    test[movie][u]=data[u][movie]
                except KeyError:
                    test.setdefault(movie,{})
                    test[movie][u]=data[u][movie]
            else:
                try:
                    train[movie][u]=data[u][movie]
                except KeyError:
                    train.setdefault(movie,{})
                    train[movie][u]=data[u][movie]
    return train, test



In [8]:

    
percent_test=0.2
train,test=split_train_test(data,percent_test)



In [9]:

    
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)



In [10]:

    
def deleteUnseenInTest(train,test):
    for k in test.keys():
        try:
            train[k]
        except KeyError:
            test.pop(k,None)



In [11]:

    
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)



In [12]:

    
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)

Baseline: mean by user



In [13]:

    
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        return [self.users[u] for u in users]



In [14]:

    
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())









    



Mean Error 1.065122



In [15]:

    
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except:
                res.append(3)
        return res



In [16]:

    
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())









    



Mean Error 1.024099



In [17]:

    
m_test['Adventures of Pinocchio, The (1996)']









    Out[17]:





{'254': 3.0, '434': 3.0, '648': 2.0, '699': 3.0, '756': 4.0, '821': 5.0}



In [18]:

    
rawArray[:5]









    Out[18]:





array([['344', 'Birdcage, The (1996)', '4.0'],
       ['344', 'Enchanted April (1991)', '4.0'],
       ['344', 'Diabolique (1996)', '2.0'],
       ['344', 'Face/Off (1997)', '4.0'],
       ['344', 'My Fellow Americans (1996)', '3.0']], 
      dtype='|S81')



In [19]:

    
len(m_train['Birdcage, The (1996)'])









    Out[19]:





222



In [27]:

    
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
        self.alternate = alternate
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.couples = couples
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1)
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    oprimP = False if optimQ else True
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred



In [176]:

    
model3 = matrixFactorisation(10, alternate=0)
model3.fit(trainUsers, trainItems, trainCouples)









    



0 2.77841840868
100 1.26732246398
200 1.05346402861
300 0.983480018466
400 0.940916112566
500 0.912340589664
600 0.895851403893
700 0.880687032448






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-176-d96c14626eab> in <module>()
      1 model3 = matrixFactorisation(10, alternate=0)
----> 2 model3.fit(trainUsers, trainItems, trainCouples)

<ipython-input-175-885d71f2654b> in fit(self, dataUsers, dataItems, couples)
     23                 if not item in self.q:
     24                     self.q[item] = np.random.rand(self.k,1)
---> 25                 tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
     26                 if (optimP):
     27                     self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()

KeyboardInterrupt:



In [22]:

    
dm = np.dok_matrix(train)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-22-734a04c46e62> in <module>()
      1 
----> 2 dm = np.dok_matrix(train)

AttributeError: 'module' object has no attribute 'dok_matrix'



In [28]:

    
print(len(movies))
print(len(data.keys()))



In [21]:

    
movies["Adventures of Pinocchio, The (1996)"]









    Out[21]:





'1060'



In [22]:

    
rawMatrix = np.zeros((len(data.keys())+1,1682+1))
for u in data:
    for m in data[u]:
        rawMatrix[int(u)][int(movies[m])] = data[u][m]



In [23]:

    
np.shape(rawMatrix)









    Out[23]:





(944, 1683)



In [24]:

    
train["1"]["101 Dalmatians (1996)"]









    Out[24]:





2.0



In [25]:

    
rawMatrixTrain = np.zeros((len(data.keys())+1,1682+1))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u)][int(movies[m])] = train[u][m]
        
rawMatrixTest = np.zeros((len(data.keys())+1,1682+1))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u)][int(movies[m])] = test[u][m]



In [26]:

    
rawMatrixTrain[:10,:10]









    Out[26]:





array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  0.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  0.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,  5.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.]])



In [27]:

    
rawMatrixTest[:10,:10]









    Out[27]:





array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  3.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  4.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  4.,  0.,  0.]])



In [28]:

    
np.shape(rawMatrixTest)









    Out[28]:





(944, 1683)



In [73]:

    
import numpy as np
from scipy import linalg
from numpy import dot

def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6, eps = 1e-5):
    """
    Decompose X to A*Y
    """
    eps = 1e-5
    print 'Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter)
    #X = X.toarray()  # I am passing in a scipy sparse matrix

    # mask
    mask = np.sign(X)

    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = dot(A, Y)
    for i in range(1, max_iter + 1):
        # updates
        top = dot(masked_X, Y.T)
        bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)
        # print 'A',  np.round(A, 2)

        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # print 'Y', np.round(Y, 2)


        # evaluation
        if i % 200 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

    return A, Y



In [170]:

    
cpr = copy.deepcopy(rawMatrixTrain)



In [118]:

    
cpr[:10,:10]









    Out[118]:





array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  0.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,  0.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.]])



In [119]:

    
t1 = np.array([[7,1],
               [1,1]])
t2 = np.array([[1,2],
               [3,4]])



In [120]:

    
t1.dot(t2)









    Out[120]:





array([[10, 18],
       [ 4,  6]])



In [121]:

    
(t1*t2).sum()









    Out[121]:





16



In [122]:

    
cpr[1,1]=0



In [171]:

    
%%time
A,Y = nmf(cpr,100,max_iter=4000)









    



Starting NMF decomposition with 100 latent features and 4000 iterations.
Iteration 1: fit residual 341.2397 total residual 251.295






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-171-768eaae14a73> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'A,Y = nmf(cpr,100,max_iter=4000)')

/Library/Python/2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

/Library/Python/2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/Library/Python/2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/Library/Python/2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1165         else:
   1166             st = clock2()
-> 1167             exec(code, glob, local_ns)
   1168             end = clock2()
   1169             out = None

<timed exec> in <module>()

<ipython-input-73-35e257ec9a74> in nmf(X, latent_features, max_iter, error_limit, fit_error_limit, eps)
     27         # updates
     28         top = dot(masked_X, Y.T)
---> 29         bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
     30         A *= top / bottom
     31 

KeyboardInterrupt:



In [131]:

    
resMatrix = A.dot(Y)



In [132]:

    
resMatrix[1,1]









    Out[132]:





3.8662150994784032



In [33]:

    
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
        
    def predict(self,user,movie):
        return self.resMatrix[int(user)][int(self.dicI[movie])]



In [134]:

    
mf= evalMF(resMatrix,data,movies)



In [139]:

    
data["200"]









    Out[139]:





{'101 Dalmatians (1996)': 4.0,
 '20,000 Leagues Under the Sea (1954)': 4.0,
 '2001: A Space Odyssey (1968)': 4.0,
 'Absolute Power (1997)': 3.0,
 'Adventures of Pinocchio, The (1996)': 3.0,
 'Aladdin (1992)': 5.0,
 'Alice in Wonderland (1951)': 5.0,
 'Alien (1979)': 5.0,
 'Alien 3 (1992)': 4.0,
 'Aliens (1986)': 5.0,
 'All Dogs Go to Heaven 2 (1996)': 2.0,
 'Amadeus (1984)': 5.0,
 'American President, The (1995)': 3.0,
 'Andre (1994)': 4.0,
 'Apollo 13 (1995)': 5.0,
 'Around the World in 80 Days (1956)': 3.0,
 'Assassins (1995)': 4.0,
 'Babe (1995)': 4.0,
 'Back to the Future (1985)': 5.0,
 'Barbarella (1968)': 3.0,
 'Batman Forever (1995)': 4.0,
 'Batman Returns (1992)': 4.0,
 'Beauty and the Beast (1991)': 5.0,
 'Birdcage, The (1996)': 4.0,
 'Birds, The (1963)': 5.0,
 'Blade Runner (1982)': 5.0,
 'Boot, Das (1981)': 5.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Broken Arrow (1996)': 3.0,
 'Cape Fear (1962)': 5.0,
 'Cape Fear (1991)': 5.0,
 'Carrie (1976)': 4.0,
 'Casablanca (1942)': 5.0,
 'Casper (1995)': 4.0,
 'Cat People (1982)': 4.0,
 'Chain Reaction (1996)': 3.0,
 'Cliffhanger (1993)': 4.0,
 'Clockwork Orange, A (1971)': 4.0,
 'Conan the Barbarian (1981)': 4.0,
 'Contact (1997)': 4.0,
 'Cool Hand Luke (1967)': 4.0,
 'Crash (1996)': 5.0,
 'Crow, The (1994)': 5.0,
 'Crow: City of Angels, The (1996)': 3.0,
 "Dante's Peak (1997)": 3.0,
 'Day the Earth Stood Still, The (1951)': 5.0,
 'Dead Man Walking (1995)': 4.0,
 'Dead Poets Society (1989)': 4.0,
 'Demolition Man (1993)': 5.0,
 'Desperado (1995)': 4.0,
 "Devil's Own, The (1997)": 3.0,
 'Die Hard 2 (1990)': 4.0,
 'Disclosure (1994)': 3.0,
 'Dragonheart (1996)': 4.0,
 'Dumbo (1941)': 4.0,
 'E.T. the Extra-Terrestrial (1982)': 5.0,
 'Eat Drink Man Woman (1994)': 3.0,
 'Empire Strikes Back, The (1980)': 5.0,
 'English Patient, The (1996)': 4.0,
 'Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995)': 2.0,
 'Eraser (1996)': 4.0,
 'Escape from L.A. (1996)': 4.0,
 'Executive Decision (1996)': 4.0,
 'Fantasia (1940)': 5.0,
 'Father of the Bride (1950)': 3.0,
 'Father of the Bride Part II (1995)': 3.0,
 'Field of Dreams (1989)': 4.0,
 'First Knight (1995)': 4.0,
 'Flipper (1996)': 3.0,
 'Flubber (1997)': 4.0,
 'Fly Away Home (1996)': 5.0,
 'Forrest Gump (1994)': 5.0,
 'Fried Green Tomatoes (1991)': 3.0,
 'Frighteners, The (1996)': 4.0,
 'Fugitive, The (1993)': 5.0,
 'Full Metal Jacket (1987)': 4.0,
 'Gandhi (1982)': 4.0,
 'Ghost (1990)': 4.0,
 'Ghost and the Darkness, The (1996)': 4.0,
 'Glimmer Man, The (1996)': 3.0,
 'GoldenEye (1995)': 4.0,
 'Good, The Bad and The Ugly, The (1966)': 4.0,
 'Goofy Movie, A (1995)': 3.0,
 'Grease (1978)': 4.0,
 'Groundhog Day (1993)': 5.0,
 'Grumpier Old Men (1995)': 2.0,
 'Hackers (1995)': 3.0,
 'Hard Target (1993)': 4.0,
 'Harriet the Spy (1996)': 4.0,
 'Highlander (1986)': 5.0,
 'Highlander III: The Sorcerer (1994)': 5.0,
 'Home Alone (1990)': 4.0,
 'Homeward Bound II: Lost in San Francisco (1996)': 2.0,
 'Homeward Bound: The Incredible Journey (1993)': 4.0,
 'Hoop Dreams (1994)': 2.0,
 'House Arrest (1996)': 3.0,
 'Hunchback of Notre Dame, The (1996)': 4.0,
 'Hunt for Red October, The (1990)': 5.0,
 'Independence Day (ID4) (1996)': 5.0,
 'Indian in the Cupboard, The (1995)': 5.0,
 'Indiana Jones and the Last Crusade (1989)': 5.0,
 'Interview with the Vampire (1994)': 4.0,
 'Island of Dr. Moreau, The (1996)': 3.0,
 "It's a Wonderful Life (1946)": 5.0,
 'Jack (1996)': 2.0,
 "Jackie Chan's First Strike (1996)": 3.0,
 'James and the Giant Peach (1996)': 4.0,
 'Jaws (1975)': 4.0,
 'Johnny Mnemonic (1995)': 4.0,
 'Judge Dredd (1995)': 4.0,
 'Jumanji (1995)': 5.0,
 'Jungle Book, The (1994)': 4.0,
 'Jungle2Jungle (1997)': 3.0,
 'Jurassic Park (1993)': 5.0,
 'Juror, The (1996)': 4.0,
 "Kid in King Arthur's Court, A (1995)": 4.0,
 'Killing Fields, The (1984)': 4.0,
 'Kingpin (1996)': 3.0,
 'Last Action Hero (1993)': 4.0,
 'Last Man Standing (1996)': 4.0,
 'Last of the Mohicans, The (1992)': 4.0,
 'Lawnmower Man 2: Beyond Cyberspace (1996)': 3.0,
 'Leaving Las Vegas (1995)': 5.0,
 'Liar Liar (1997)': 4.0,
 'Like Water For Chocolate (Como agua para chocolate) (1992)': 4.0,
 'Lion King, The (1994)': 4.0,
 'Long Kiss Goodnight, The (1996)': 5.0,
 'Love Bug, The (1969)': 3.0,
 'Man Without a Face, The (1993)': 5.0,
 'Mars Attacks! (1996)': 2.0,
 'Mary Poppins (1964)': 4.0,
 'Mask, The (1994)': 4.0,
 'Maximum Risk (1996)': 2.0,
 'Michael (1996)': 4.0,
 'Miracle on 34th Street (1994)': 5.0,
 'Mission: Impossible (1996)': 3.0,
 'Moll Flanders (1996)': 3.0,
 'Mortal Kombat: Annihilation (1997)': 4.0,
 "Mr. Holland's Opus (1995)": 4.0,
 'Mrs. Doubtfire (1993)': 4.0,
 'My Left Foot (1989)': 4.0,
 'Net, The (1995)': 3.0,
 'Nightmare Before Christmas, The (1993)': 4.0,
 'Nutty Professor, The (1996)': 3.0,
 "One Flew Over the Cuckoo's Nest (1975)": 5.0,
 'Outbreak (1995)': 4.0,
 'Patton (1970)': 4.0,
 "Pete's Dragon (1977)": 4.0,
 'Phantom, The (1996)': 4.0,
 'Phenomenon (1996)': 5.0,
 'Philadelphia Story, The (1940)': 5.0,
 'Piano, The (1993)': 4.0,
 'Pocahontas (1995)': 3.0,
 'Powder (1995)': 5.0,
 "Preacher's Wife, The (1996)": 2.0,
 'Pretty Woman (1990)': 4.0,
 'Princess Bride, The (1987)': 5.0,
 'Pulp Fiction (1994)': 4.0,
 'Quest, The (1996)': 3.0,
 'Quiz Show (1994)': 4.0,
 'Raiders of the Lost Ark (1981)': 5.0,
 'Ransom (1996)': 4.0,
 'Right Stuff, The (1983)': 4.0,
 'Rob Roy (1995)': 4.0,
 'Rock, The (1996)': 5.0,
 'Rosencrantz and Guildenstern Are Dead (1990)': 2.0,
 'Rumble in the Bronx (1995)': 2.0,
 'Saint, The (1997)': 3.0,
 'Santa Clause, The (1994)': 4.0,
 "Schindler's List (1993)": 5.0,
 'Scream (1996)': 5.0,
 'Screamers (1995)': 4.0,
 'Secret Garden, The (1993)': 4.0,
 'Seven (Se7en) (1995)': 5.0,
 'Shadow Conspiracy (1997)': 3.0,
 'Shallow Grave (1994)': 3.0,
 'Silence of the Lambs, The (1991)': 5.0,
 'Sleepless in Seattle (1993)': 4.0,
 'Sneakers (1992)': 3.0,
 'Snow White and the Seven Dwarfs (1937)': 5.0,
 'Sound of Music, The (1965)': 5.0,
 'Space Jam (1996)': 3.0,
 'Spawn (1997)': 5.0,
 'Species (1995)': 4.0,
 'Speed (1994)': 5.0,
 'Star Trek III: The Search for Spock (1984)': 5.0,
 'Star Trek IV: The Voyage Home (1986)': 5.0,
 'Star Trek VI: The Undiscovered Country (1991)': 5.0,
 'Star Trek: First Contact (1996)': 5.0,
 'Star Trek: Generations (1994)': 5.0,
 'Star Trek: The Motion Picture (1979)': 5.0,
 'Star Trek: The Wrath of Khan (1982)': 5.0,
 'Star Wars (1977)': 5.0,
 'Stargate (1994)': 5.0,
 'Sudden Death (1995)': 3.0,
 'Swiss Family Robinson (1960)': 3.0,
 'Terminal Velocity (1994)': 4.0,
 'Terminator 2: Judgment Day (1991)': 5.0,
 'Terminator, The (1984)': 5.0,
 'Time to Kill, A (1996)': 4.0,
 'Titanic (1997)': 5.0,
 'Tombstone (1993)': 4.0,
 'Top Gun (1986)': 4.0,
 'Toy Story (1995)': 5.0,
 'True Lies (1994)': 5.0,
 'Twelve Monkeys (1995)': 4.0,
 'Twister (1996)': 4.0,
 'Under Siege 2: Dark Territory (1995)': 4.0,
 'Up Close and Personal (1996)': 4.0,
 'White Squall (1996)': 5.0,
 'Willy Wonka and the Chocolate Factory (1971)': 3.0,
 'Wizard of Oz, The (1939)': 5.0,
 'Wrong Trousers, The (1993)': 5.0,
 'Wyatt Earp (1994)': 4.0,
 'Young Frankenstein (1974)': 5.0}



In [142]:

    
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
print "***"
print data["18"]["Don Juan DeMarco (1995)"]
print mf.predict("1","Don Juan DeMarco (1995)")
print data["18"]["Winnie the Pooh and the Blustery Day (1968)"]
print mf.predict("1","Winnie the Pooh and the Blustery Day (1968)")
print "***"
print data["200"]["Assassins (1995)"]
print mf.predict("1","Assassins (1995)")
print data["200"]["Casablanca (1942)"]
print mf.predict("1","Casablanca (1942)")









    



4.0
3.39729089826
1.0
1.62532843747
***
2.0
4.57738258746
3.0
4.07796516329
***
4.0
2.86867907628
5.0
4.7192522862



In [143]:

    
summ=0
for i in data["1"]:
    summ+=(float(data["1"][i]) - mf.predict("1",i))**2
summ/len(data["1"])









    Out[143]:





0.72992292053033703



In [144]:

    
summ=0
for i in data["3"]:
    summ+=(float(data["3"][i]) - mf.predict("3",i))**2
summ/len(data["3"])









    Out[144]:





1.0374567107180328



In [175]:

    
tot=[]
ttt=[]
for j in test:
    summ=0
    for i in test[j]:
        summ+=(float(test[j][i]) - mf.predict(j,i))**2
    #print j, ">>", summ/len(data[j])
    ttt.append(len(test[j]))
    tot.append(summ)
    
#import pdb
#pdb.set_trace()



In [176]:

    
t = np.array(tot)
tt = np.array(ttt)



In [177]:

    
t.mean()/tt.mean()









    Out[177]:





1.1942361914404664



In [162]:

    
tt.std()









    Out[162]:





100.56729085359559



In [169]:

    
t.sum()/tt.sum()









    Out[169]:





0.60572217944012463



In [161]:

    
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()









    Out[161]:





1.1942361914404731



In [225]:

    
%%time
A,Y = nmf(rawMatrixTrain,500,max_iter=300)









    



Starting NMF decomposition with 500 latent features and 300 iterations.
Iteration 1: fit residual 5298.0864 total residual 244.6212
Iteration 200: fit residual 240.2685 total residual 13.9941
Iteration 300: fit residual 7.7158 total residual 6.8284
CPU times: user 2min 45s, sys: 5.49 s, total: 2min 50s
Wall time: 1min 2s



In [226]:

    
resMatrix = A.dot(Y)



In [227]:

    
a=np.array((1,2,4))
b=np.array((1,3,6))
(a-b).dot(a-b)

masqueTest=np.sign(rawMatrixTest)
masqueTest[:10,:10]

A=masqueTest*rawMatrix



In [228]:

    
aa = masqueTest*resMatrix



In [229]:

    
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5



In [235]:

    
q = masqueTest*resMatrix - rawMatrixTest



In [236]:

    
(q*q).sum()/ masqueTest.sum()









    Out[236]:





1.1273948891755721



In [ ]:

    
masqueTest=np.sign(rawMatrixTest)

aa=masqueTest*rawMatrix

for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
            
q = masqueTest*resMatrix - rawMatrixTest

(q*q).sum()/ masqueTest.sum()



In [232]:

    
aa[:10,:10]









    Out[232]:





array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  4.25790085,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  4.51410323],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])



In [111]:

    
rawMatrix[:10,:10]









    Out[111]:





array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.]])



In [65]:

    
resMatrix[:10,:10]









    Out[65]:





array([[  1.00000000e-09,   2.11263017e-05,   1.75772784e-05,
          1.59467964e-05,   1.91700638e-05,   1.73865300e-05,
          2.06765659e-05,   1.94248575e-05,   2.17730688e-05,
          2.03784277e-05],
       [  1.78801970e-04,   3.95625546e+00,   3.54997734e+00,
          3.72895234e+00,   4.27096338e+00,   2.72442378e+00,
          4.01474525e+00,   4.52884136e+00,   3.69341713e+00,
          3.98399173e+00],
       [  1.96184298e-04,   4.03306906e+00,   2.99267665e+00,
          2.07003583e+00,   3.73621474e+00,   3.57324546e+00,
          4.50847524e+00,   4.03266244e+00,   4.35395407e+00,
          4.65985982e+00],
       [  1.76302740e-04,   3.61148196e+00,   3.27282155e+00,
          2.35550426e+00,   2.02886726e+00,   3.87498134e+00,
          1.48801040e+00,   2.95799674e+00,   2.77898478e+00,
          3.14122547e+00],
       [  2.63225365e-04,   4.84628612e+00,   3.96173386e+00,
          4.22460058e+00,   3.47028279e+00,   3.99521914e+00,
          5.25970696e+00,   4.27889369e+00,   5.52921267e+00,
          5.98457221e+00],
       [  1.73961907e-04,   3.44238651e+00,   3.45807865e+00,
          2.71245890e+00,   3.14421131e+00,   2.45772768e+00,
          1.46203100e+00,   4.82582842e+00,   3.09009585e+00,
          3.56641471e+00],
       [  1.57488630e-04,   3.43843718e+00,   2.08003954e+00,
          2.15150878e+00,   3.19166098e+00,   2.52915516e+00,
          2.59800164e+00,   3.06789555e+00,   3.78494389e+00,
          3.81078788e+00],
       [  2.17947541e-04,   4.70512003e+00,   3.99566394e+00,
          2.01249843e+00,   3.64370725e+00,   3.51552809e+00,
          3.54826067e+00,   4.00895913e+00,   5.02975404e+00,
          5.08923692e+00],
       [  2.07695050e-04,   4.28568480e+00,   3.87650139e+00,
          3.34228855e+00,   3.61386225e+00,   4.02478920e+00,
          2.97444638e+00,   3.65660112e+00,   4.04720870e+00,
          4.45891929e+00],
       [  1.81585514e-04,   4.17292085e+00,   4.22438463e+00,
          4.96064656e+00,   5.06876450e+00,   2.92737460e+00,
          5.80518190e+00,   4.37799768e+00,   3.62961909e+00,
          3.95327106e+00]])



In [59]:

    
mf = evalMF(resMatrix,data,movies)



In [69]:

    
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")









    



4.0
3.68121941569
1.0
0.96164687419



In [61]:

    
print train["1"]["All Dogs Go to Heaven 2 (1996)"]
print test["1"]["Akira (1988)"]









    



1.0






    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-61-ccddd715027a> in <module>()
      1 print train["1"]["All Dogs Go to Heaven 2 (1996)"]
----> 2 print test["1"]["Akira (1988)"]

KeyError: 'Akira (1988)'



In [80]:

    
len(rawMatrixTest)









    Out[80]:





944



In [78]:

    
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()









    



(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)






    Out[78]:





2.0909226578962459



In [87]:

    
t = []
c = 10
for idxi,i in enumerate(resMatrix):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrix[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()









    



(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)






    Out[87]:





2.0909226578962459



In [108]:

    
t = []
c = 3
for idxi,i in enumerate(rawMatrixTrain):
    for idxj,j in enumerate(i):
        if rawMatrixTrain[idxi][idxj] != 0:
            t.append( (float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj])**2 )
            if c>0:
                print(rawMatrixTrain[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()









    



(5.0, 4.3114944585785064)
(3.0, 3.3257920434476187)
(3.0, 4.0022181201367522)






    Out[108]:





0.45320954210197834



In [80]:

    
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-80-0241c027db0a> in <module>()
----> 1 np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()

<ipython-input-52-9fb67320d8a3> in predict(self, user, movie)
      8 
      9     def predict(self,user,movie):
---> 10         return self.resMatrix[int(user)][int(self.dicI[movie])]

KeyError: 0.0



In [124]:

    
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = numpy.array(R)

N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)



In [125]:

    
nR









    Out[125]:





array([[ 4.99650255,  2.93418578,  3.99234219,  0.99820419],
       [ 3.96298764,  2.33573468,  3.36035393,  0.99707133],
       [ 1.0716961 ,  0.82538354,  5.337668  ,  4.96193011],
       [ 0.9633362 ,  0.72181627,  4.33820223,  3.97311633],
       [ 1.81106643,  1.21517699,  4.91344271,  4.03428495]])



In [65]:

    
import numpy

def matrix_factorization(R, K, steps=100, alpha=0.0002, beta=0.02):
    N = len(R)
    M = len(R[0])

    P = numpy.random.rand(N,K)
    Q = numpy.random.rand(M,K)
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        #eR = numpy.dot(P,Q)
        #e = 0
        #for i in xrange(len(R)):
        #    for j in xrange(len(R[i])):
        #        if R[i][j] > 0:
        #            e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
        #            for k in xrange(K):
        #                e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        #if e < 0.001:
        #    break
    return P, Q.T



In [44]:

    
N = len(R)
M = len(R[0])
K = 10
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
Q = Q.T



In [55]:

    
R[1,1]









    Out[55]:





5.0



In [51]:

    
for i in xrange(len(R)):
    for j in xrange(len(R[i])):
        
        if R[i][j] > 0:
            eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
            for k in xrange(K):
                P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
            print(i,j)
            break
    break



In [ ]:

    
for step in xrange(steps):
    for i in xrange(len(R)):
        for j in xrange(len(R[i])):
            if R[i][j] > 0:
                eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                for k in xrange(K):
                    P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                    Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])



In [60]:

    
%%time

R = rawMatrixTrain

nP, nQ = matrix_factorization(R, 10, steps=120)
nR = numpy.dot(nP, nQ.T)









    



CPU times: user 17min 45s, sys: 7.21 s, total: 17min 52s
Wall time: 17min 54s



In [61]:

    
masqueTest=np.sign(rawMatrixTest)

aa=masqueTest*rawMatrix
            
q = masqueTest*nR - rawMatrixTest

(q*q).sum()/ masqueTest.sum()









    Out[61]:





0.91837398331701225



In [72]:

    
%%time

R = rawMatrixTrain

nP, nQ = matrix_factorization(R, 10, alpha=1e-5,steps=40)
nR = numpy.dot(nP, nQ.T)









    



CPU times: user 5min 54s, sys: 2.49 s, total: 5min 57s
Wall time: 5min 58s



In [73]:

    
masqueTest=np.sign(rawMatrixTest)

aa=masqueTest*rawMatrix
            
q = masqueTest*nR - rawMatrixTest

(q*q).sum()/ masqueTest.sum()









    Out[73]:





1.7314604113745657



In [30]:

    
nR[:5,:5]









    Out[30]:





array([[ 2.2328501 ,  2.86127689,  2.78138309,  2.18936119,  1.94124401],
       [ 2.873597  ,  3.01024103,  3.39995977,  2.86989372,  2.98342851],
       [ 2.39437824,  2.9654892 ,  2.95946595,  2.25596745,  2.07565552],
       [ 1.51408587,  2.19458866,  2.42556026,  1.70330838,  1.66845702],
       [ 3.28392919,  3.92596689,  3.90235368,  3.32824406,  3.11129257]])



In [31]:

    
rawMatrix[:5,:5]









    Out[31]:





array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.],
       [ 0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])



In [32]:

    
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-32-3ba1d810045a> in <module>()
----> 1 mf= evalMF(nR,data,movies)
      2 mf.predict("1","Akira (1988)")

NameError: name 'evalMF' is not defined



In [47]:

    
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()









    Out[47]:





1.5532842864204328



In [ ]: