In [1]:
from random import random
import math
import numpy as np
import copy

In [2]:
def loadMovieLens(path='./data/movielens'):
    #Get movie titles
    movies={}
    rev_movies={}
    for line in open(path+'/u.item'):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs,rev_movies

In [3]:
data,movies = loadMovieLens("data/ml-100k")

In [4]:
data['3']


Out[4]:
{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Liar (1997)': 2.0,
 'Lost Highway (1997)': 2.0,
 'Mad City (1997)': 3.0,
 'Man Who Knew Too Little, The (1997)': 4.0,
 'Mimic (1997)': 2.0,
 'Mother (1996)': 5.0,
 'Murder at 1600 (1997)': 3.0,
 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)': 5.0,
 'Playing God (1997)': 1.0,
 'Prophecy II, The (1998)': 3.0,
 'Return of the Jedi (1983)': 4.0,
 "Schindler's List (1993)": 4.0,
 'Scream (1996)': 2.0,
 'Sphere (1998)': 3.0,
 'Spice World (1997)': 2.0,
 'Starship Troopers (1997)': 3.0,
 'U Turn (1997)': 3.0,
 "Ulee's Gold (1997)": 3.0,
 'Wag the Dog (1997)': 5.0,
 'Wedding Singer, The (1998)': 3.0}

In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)

In [6]:
# split each user's ratings at random, so every user keeps data in both sets
def split_train_test(data,percent_test):
    test={}
    train={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test

In [7]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                test.setdefault(movie,{})
                test[movie][u]=data[u][movie]
            else:
                train.setdefault(movie,{})
                train[movie][u]=data[u][movie]
    return train, test

In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)

In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)

In [10]:
def deleteUnseenInTest(train,test):
    # drop test entries (users or movies) that never appear in train
    for k in list(test.keys()):
        if k not in train:
            test.pop(k,None)

In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)

In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)

Baseline: mean by user


In [13]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        # train is keyed by user: collect the set of movies it contains
        movies = {m for u in train for m in train[u]}
        for movie in movies:
            note=0.0
            cpt=0
            for user in train:
                try:
                    note+=train[user][movie]
                    cpt+=1
                except KeyError:
                    pass
            note=note/cpt
            self.movies[movie]=note
        
    def predict(self,user,movie):
        return self.movies[movie]
    def score(self,X):
        # total squared error, averaged over the number of distinct movies in X
        nb_movies = len({m for u in X for m in X[u]})
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user,movie) - X[user][movie])**2
        return score/nb_movies

In [14]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        return [self.users[u] for u in users]

In [15]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())


Mean Error 1.065131
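
The cell above scores the user-mean baseline on the full rawArray rather than on held-out ratings. A minimal sketch of the same evaluation restricted to the test ratings (pred_test is just a local name introduced here):

In [ ]:
pred_test = baseline_mu.predict(rawArrayTest[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred_test) - np.array(rawArrayTest[:,2], float)) ** 2).mean())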

In [16]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        # here train is keyed by movie: movie -> {user: rating}
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                # unseen movie: fall back to the middle of the rating scale
                res.append(3)
        return res

In [17]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())


Mean Error 1.009774

In [18]:
m_test['Adventures of Pinocchio, The (1996)']


Out[18]:
{'125': 4.0, '181': 1.0, '200': 3.0, '222': 2.0, '592': 2.0}

In [19]:
rawArray[:5]


Out[19]:
array([['344', 'Birdcage, The (1996)', '4.0'],
       ['344', 'Enchanted April (1991)', '4.0'],
       ['344', 'Diabolique (1996)', '2.0'],
       ['344', 'Face/Off (1997)', '4.0'],
       ['344', 'My Fellow Americans (1996)', '3.0']], 
      dtype='|S81')

In [20]:
len(m_train['Birdcage, The (1996)'])


Out[20]:
241


In [175]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
        self.k = k              # number of latent factors
        self.lambd = lambd      # L2 regularization strength
        self.eps = eps          # learning rate
        self.maxIter = maxIter
        self.alternate = alternate  # 0: update p and q together, >0: alternate every `alternate` epochs
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.couples = couples
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1)
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp*tmp # without regularization
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    optimP, optimQ = optimQ, optimP  # switch which factor is being optimized
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred
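
The fit call below uses trainUsers, trainItems and trainCouples, which are not built anywhere in this notebook. A minimal sketch of how they could be derived from the earlier train split, assuming trainUsers is the user-keyed dict, trainItems the movie-keyed dict and trainCouples the list of (user, movie) pairs:

In [ ]:
# hypothetical reconstruction of the structures expected by matrixFactorisation.fit
trainUsers = train                                   # user -> {movie: rating}
trainItems = {}                                      # movie -> {user: rating}
for u in train:
    for m in train[u]:
        trainItems.setdefault(m, {})[u] = train[u][m]
trainCouples = [(u, m) for u in train for m in train[u]]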

In [176]:
model3 = matrixFactorisation(10, alternate=0)
model3.fit(trainUsers, trainItems, trainCouples)


0 2.77841840868
100 1.26732246398
200 1.05346402861
300 0.983480018466
400 0.940916112566
500 0.912340589664
600 0.895851403893
700 0.880687032448
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-176-d96c14626eab> in <module>()
      1 model3 = matrixFactorisation(10, alternate=0)
----> 2 model3.fit(trainUsers, trainItems, trainCouples)

<ipython-input-175-885d71f2654b> in fit(self, dataUsers, dataItems, couples)
     23                 if not item in self.q:
     24                     self.q[item] = np.random.rand(self.k,1)
---> 25                 tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
     26                 if (optimP):
     27                     self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()

KeyboardInterrupt: 

In [22]:
dm = np.dok_matrix(train)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-22-734a04c46e62> in <module>()
      1 
----> 2 dm = np.dok_matrix(train)

AttributeError: 'module' object has no attribute 'dok_matrix'
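
The AttributeError above is because dok_matrix lives in scipy.sparse, not in numpy. A minimal sketch of building the ratings as a sparse matrix instead, assuming the same user and movie integer ids used for rawMatrix below:

In [ ]:
from scipy.sparse import dok_matrix

# hypothetical sparse equivalent of the dense rating matrix
sparse_ratings = dok_matrix((len(data.keys())+1, 1682+1))
for u in data:
    for m in data[u]:
        sparse_ratings[int(u), int(movies[m])] = data[u][m]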

In [88]:
print(len(movies))
print(len(data.keys()))


1664
943

In [64]:
movies["Adventures of Pinocchio, The (1996)"]


Out[64]:
'1060'

In [82]:
rawMatrix = np.zeros((len(data.keys())+1,1682+1))

In [83]:
np.shape(rawMatrix)


Out[83]:
(944, 1683)

In [84]:
data["1"]["101 Dalmatians (1996)"]


Out[84]:
2.0

In [85]:
for u in data:
    for m in data[u]:
        rawMatrix[int(u)][int(movies[m])] = data[u][m]

In [87]:
rawMatrix[:5][:5]


Out[87]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  3., ...,  0.,  0.,  0.],
       [ 0.,  4.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [98]:
import numpy as np
from scipy import linalg
from numpy import dot

def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6):
    """
    Decompose X to A*Y
    """
    eps = 1e-5
    print 'Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter)
    #X = X.toarray()  # I am passing in a scipy sparse matrix

    # mask
    mask = np.sign(X)

    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = dot(A, Y)
    for i in range(1, max_iter + 1):
        # updates
        top = dot(masked_X, Y.T)
        bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)
        # print 'A',  np.round(A, 2)

        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # print 'Y', np.round(Y, 2)


        # evaluation
        if i % 50 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

    return A, Y
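
For reference, the loop above implements a masked form of the Lee–Seung multiplicative updates for least-squares NMF, where $M$ is the 0/1 mask of observed ratings and $\odot$ the elementwise product:

$$A \leftarrow A \odot \frac{(M \odot X)\,Y^\top}{(M \odot (A Y))\,Y^\top + \epsilon}, \qquad Y \leftarrow Y \odot \frac{A^\top (M \odot X)}{A^\top (M \odot (A Y)) + \epsilon}$$

Only observed entries drive the factorization, and the small $\epsilon$ floor keeps both factors strictly positive.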

In [104]:
A,Y = nmf(rawMatrix,10,max_iter=1000)


Starting NMF decomposition with 10 latent features and 1000 iterations.
Iteration 1: fit residual 882.6755 total residual 336.9268
Iteration 50: fit residual 202.99 total residual 255.134
Iteration 100: fit residual 54.7172 total residual 245.6577
Iteration 150: fit residual 33.4924 total residual 241.3665
Iteration 200: fit residual 25.0348 total residual 238.6455
Iteration 250: fit residual 21.4902 total residual 236.5486
Iteration 300: fit residual 17.9908 total residual 234.943
Iteration 350: fit residual 15.3883 total residual 233.6517
Iteration 400: fit residual 13.878 total residual 232.545
Iteration 450: fit residual 12.6491 total residual 231.5866
Iteration 500: fit residual 11.4608 total residual 230.7769
Iteration 550: fit residual 10.3756 total residual 230.0917
Iteration 600: fit residual 9.5832 total residual 229.4888
Iteration 650: fit residual 9.25 total residual 228.9448
Iteration 700: fit residual 8.5896 total residual 228.4733
Iteration 750: fit residual 8.0066 total residual 228.0668
Iteration 800: fit residual 7.3645 total residual 227.7124
Iteration 850: fit residual 7.1057 total residual 227.3899
Iteration 900: fit residual 6.6246 total residual 227.0994
Iteration 950: fit residual 6.3376 total residual 226.8345
Iteration 1000: fit residual 5.7849 total residual 226.5979

In [105]:
resMatrix = A.dot(Y)


In [150]:
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI   # movie title -> column index
    def fit(self):
        pass
        
    def predict(self,user,movie):
        return self.resMatrix[int(user)][int(self.dicI[movie])]

In [152]:
mf= evalMF(resMatrix,data,movies)

In [153]:
mf.predict("1","Akira (1988)")


Out[153]:
4.2954887468335441

In [159]:
mf.predict("1","All Dogs Go to Heaven 2 (1996)")


Out[159]:
1.3539530296470097


In [160]:
rawArrayTest[:5]


Out[160]:
array([['344', 'Birdcage, The (1996)', '4.0'],
       ['344', 'Emma (1996)', '4.0'],
       ['344', 'Diabolique (1996)', '2.0'],
       ['344', 'Sleepers (1996)', '4.0'],
       ['344', 'Breakdown (1997)', '3.0']], 
      dtype='|S79')

In [167]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()


Out[167]:
0.51039178926982265

In [154]:
train["1"]


Out[154]:
{'101 Dalmatians (1996)': 2.0,
 '20,000 Leagues Under the Sea (1954)': 3.0,
 '2001: A Space Odyssey (1968)': 4.0,
 'Abyss, The (1989)': 3.0,
 'Ace Ventura: Pet Detective (1994)': 3.0,
 'Air Bud (1997)': 1.0,
 'Aladdin (1992)': 4.0,
 'Alien (1979)': 5.0,
 'Aliens (1986)': 5.0,
 'All Dogs Go to Heaven 2 (1996)': 1.0,
 'Angels and Insects (1995)': 4.0,
 "Antonia's Line (1995)": 5.0,
 'Apocalypse Now (1979)': 3.0,
 'Apollo 13 (1995)': 4.0,
 'Aristocats, The (1970)': 2.0,
 'Army of Darkness (1993)': 4.0,
 'Austin Powers: International Man of Mystery (1997)': 4.0,
 'Babe (1995)': 1.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 2.0,
 'Basic Instinct (1992)': 3.0,
 'Batman & Robin (1997)': 1.0,
 'Batman Returns (1992)': 1.0,
 'Beavis and Butt-head Do America (1996)': 3.0,
 'Belle de jour (1967)': 3.0,
 'Big Night (1996)': 5.0,
 'Billy Madison (1995)': 2.0,
 'Birdcage, The (1996)': 4.0,
 'Blade Runner (1982)': 5.0,
 'Blues Brothers, The (1980)': 4.0,
 'Bound (1996)': 5.0,
 "Bram Stoker's Dracula (1992)": 3.0,
 'Braveheart (1995)': 4.0,
 'Brazil (1985)': 5.0,
 "Breakfast at Tiffany's (1961)": 1.0,
 'Bridge on the River Kwai, The (1957)': 4.0,
 'Brother Minister: The Assassination of Malcolm X (1994)': 4.0,
 'Cape Fear (1991)': 3.0,
 "Carlito's Way (1993)": 4.0,
 'Cinema Paradiso (1988)': 5.0,
 'Citizen Kane (1941)': 4.0,
 'Citizen Ruth (1996)': 4.0,
 'Clerks (1994)': 5.0,
 'Clockwork Orange, A (1971)': 3.0,
 'Cold Comfort Farm (1995)': 3.0,
 'Contact (1997)': 5.0,
 'Copycat (1995)': 3.0,
 'Crimson Tide (1995)': 3.0,
 'Crow, The (1994)': 4.0,
 'Crumb (1994)': 5.0,
 'D3: The Mighty Ducks (1996)': 1.0,
 'Dances with Wolves (1990)': 3.0,
 'Dead Man Walking (1995)': 5.0,
 'Delicatessen (1991)': 5.0,
 'Desperado (1995)': 4.0,
 "Devil's Own, The (1997)": 2.0,
 'Diabolique (1996)': 4.0,
 'Die Hard (1988)': 4.0,
 'Die Hard 2 (1990)': 3.0,
 'Dolores Claiborne (1994)': 5.0,
 'Doom Generation, The (1995)': 2.0,
 'Eat Drink Man Woman (1994)': 5.0,
 'Empire Strikes Back, The (1980)': 5.0,
 'Event Horizon (1997)': 1.0,
 'Evil Dead II (1987)': 3.0,
 'Fargo (1996)': 5.0,
 'Faster Pussycat! Kill! Kill! (1965)': 1.0,
 'Field of Dreams (1989)': 3.0,
 'Fifth Element, The (1997)': 4.0,
 'Firm, The (1993)': 4.0,
 'Flipper (1996)': 1.0,
 'Four Rooms (1995)': 4.0,
 'Four Weddings and a Funeral (1994)': 3.0,
 'Free Willy (1993)': 1.0,
 'Free Willy 2: The Adventure Home (1995)': 1.0,
 'French Twist (Gazon maudit) (1995)': 5.0,
 'From Dusk Till Dawn (1996)': 3.0,
 'Fugitive, The (1993)': 4.0,
 'Gattaca (1997)': 5.0,
 'George of the Jungle (1997)': 1.0,
 'Get Shorty (1995)': 3.0,
 'Ghost and the Darkness, The (1996)': 2.0,
 'Glengarry Glen Ross (1992)': 4.0,
 'Godfather, The (1972)': 5.0,
 'Godfather: Part II, The (1974)': 4.0,
 'GoldenEye (1995)': 3.0,
 'Good Will Hunting (1997)': 3.0,
 'Good, The Bad and The Ugly, The (1966)': 5.0,
 'GoodFellas (1990)': 4.0,
 'Graduate, The (1967)': 5.0,
 'Grosse Pointe Blank (1997)': 4.0,
 'Groundhog Day (1993)': 5.0,
 'Haunted World of Edward D. Wood Jr., The (1995)': 5.0,
 'Heavy Metal (1981)': 2.0,
 'Henry V (1989)': 5.0,
 'Home Alone (1990)': 2.0,
 'Homeward Bound: The Incredible Journey (1993)': 1.0,
 'Hoop Dreams (1994)': 5.0,
 'Horseman on the Roof, The (Hussard sur le toit, Le) (1995)': 5.0,
 'Hot Shots! Part Deux (1993)': 4.0,
 'Hunt for Red October, The (1990)': 4.0,
 'In the Company of Men (1997)': 3.0,
 'Independence Day (ID4) (1996)': 4.0,
 'Indiana Jones and the Last Crusade (1989)': 4.0,
 'Jaws (1975)': 4.0,
 'Jean de Florette (1986)': 5.0,
 'Jerry Maguire (1996)': 2.0,
 'Jude (1996)': 2.0,
 'Jungle2Jungle (1997)': 1.0,
 'Jurassic Park (1993)': 5.0,
 'Kansas City (1996)': 3.0,
 'Kids in the Hall: Brain Candy (1996)': 5.0,
 'Kolya (1996)': 5.0,
 'Kull the Conqueror (1997)': 1.0,
 'Last of the Mohicans, The (1992)': 4.0,
 'Lawnmower Man, The (1992)': 2.0,
 'Legends of the Fall (1994)': 4.0,
 'Lion King, The (1994)': 3.0,
 'Lone Star (1996)': 5.0,
 'Long Kiss Goodnight, The (1996)': 3.0,
 'Lost World: Jurassic Park, The (1997)': 2.0,
 'Love Bug, The (1969)': 3.0,
 'Madness of King George, The (1994)': 4.0,
 'Manon of the Spring (Manon des sources) (1986)': 5.0,
 'Mars Attacks! (1996)': 5.0,
 'Men in Black (1997)': 4.0,
 'Mighty Aphrodite (1995)': 5.0,
 'Mirror Has Two Faces, The (1996)': 3.0,
 'Moll Flanders (1996)': 4.0,
 'Monty Python and the Holy Grail (1974)': 5.0,
 "Monty Python's Life of Brian (1979)": 5.0,
 "Mr. Holland's Opus (1995)": 5.0,
 'Mr. Smith Goes to Washington (1939)': 3.0,
 'Much Ado About Nothing (1993)': 3.0,
 'Muppet Treasure Island (1996)': 1.0,
 "My Best Friend's Wedding (1997)": 2.0,
 'Mystery Science Theater 3000: The Movie (1996)': 5.0,
 'Nadja (1994)': 2.0,
 'Natural Born Killers (1994)': 3.0,
 'Net, The (1995)': 3.0,
 'Nightmare on Elm Street, A (1984)': 1.0,
 'On Golden Pond (1981)': 4.0,
 'Patton (1970)': 3.0,
 'Phenomenon (1996)': 3.0,
 'Pillow Book, The (1995)': 5.0,
 'Pink Floyd - The Wall (1982)': 4.0,
 'Platoon (1986)': 4.0,
 'Private Benjamin (1980)': 2.0,
 'Professional, The (1994)': 5.0,
 'Psycho (1960)': 4.0,
 'Pulp Fiction (1994)': 4.0,
 'Quiz Show (1994)': 4.0,
 'Raiders of the Lost Ark (1981)': 5.0,
 'Raising Arizona (1987)': 4.0,
 'Ref, The (1994)': 3.0,
 'Remains of the Day, The (1993)': 5.0,
 'Return of the Pink Panther, The (1974)': 4.0,
 'Richard III (1995)': 3.0,
 'Ridicule (1996)': 5.0,
 'Right Stuff, The (1983)': 4.0,
 'Rock, The (1996)': 3.0,
 'Room with a View, A (1986)': 2.0,
 'Rumble in the Bronx (1995)': 3.0,
 'Santa Clause, The (1994)': 2.0,
 'Searching for Bobby Fischer (1993)': 5.0,
 'Seven (Se7en) (1995)': 2.0,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 5.0,
 'Shawshank Redemption, The (1994)': 5.0,
 'Shining, The (1980)': 3.0,
 'Sleeper (1973)': 5.0,
 'Sleepless in Seattle (1993)': 4.0,
 'Sling Blade (1996)': 5.0,
 "Smilla's Sense of Snow (1997)": 2.0,
 'Sneakers (1992)': 4.0,
 'Snow White and the Seven Dwarfs (1937)': 3.0,
 'So I Married an Axe Murderer (1993)': 4.0,
 'Star Trek III: The Search for Spock (1984)': 4.0,
 'Star Trek IV: The Voyage Home (1986)': 4.0,
 'Star Trek VI: The Undiscovered Country (1991)': 4.0,
 'Star Trek: First Contact (1996)': 4.0,
 'Star Wars (1977)': 5.0,
 'Stargate (1994)': 3.0,
 'Starship Troopers (1997)': 2.0,
 'Steel (1997)': 1.0,
 'Sting, The (1973)': 4.0,
 'Strange Days (1995)': 4.0,
 'Striptease (1996)': 1.0,
 'Supercop (1992)': 4.0,
 'Swingers (1996)': 5.0,
 'Terminator 2: Judgment Day (1991)': 5.0,
 'Terminator, The (1984)': 5.0,
 'Theodore Rex (1995)': 1.0,
 'This Is Spinal Tap (1984)': 4.0,
 'Three Colors: Blue (1993)': 5.0,
 'Three Colors: Red (1994)': 5.0,
 'Three Colors: White (1994)': 4.0,
 'To Wong Foo, Thanks for Everything! Julie Newmar (1995)': 3.0,
 'Top Gun (1986)': 4.0,
 'True Romance (1993)': 3.0,
 'Truth About Cats & Dogs, The (1996)': 5.0,
 'Twister (1996)': 3.0,
 'Unbearable Lightness of Being, The (1988)': 4.0,
 'Under Siege (1992)': 2.0,
 'Unforgiven (1992)': 4.0,
 'Usual Suspects, The (1995)': 5.0,
 'Welcome to the Dollhouse (1995)': 5.0,
 'When Harry Met Sally... (1989)': 5.0,
 'While You Were Sleeping (1995)': 4.0,
 'White Balloon, The (1995)': 4.0,
 'Willy Wonka and the Chocolate Factory (1971)': 4.0,
 'Wizard of Oz, The (1939)': 4.0,
 'Wrong Trousers, The (1993)': 5.0,
 'Young Guns (1988)': 3.0,
 'unknown': 4.0}

In [121]:
test["1"]


Out[121]:
{'12 Angry Men (1957)': 5.0,
 'Akira (1988)': 4.0,
 'Amadeus (1984)': 5.0,
 'Batman Forever (1995)': 1.0,
 'Bedknobs and Broomsticks (1971)': 2.0,
 'Breaking the Waves (1996)': 5.0,
 'Brothers McMullen, The (1995)': 3.0,
 'Cable Guy, The (1996)': 3.0,
 'Chasing Amy (1997)': 5.0,
 'Cyrano de Bergerac (1990)': 5.0,
 'Dead Poets Society (1989)': 5.0,
 'Dirty Dancing (1987)': 2.0,
 'Disclosure (1994)': 4.0,
 'Ed Wood (1994)': 4.0,
 'Exotica (1994)': 4.0,
 'Fish Called Wanda, A (1988)': 3.0,
 'Forrest Gump (1994)': 3.0,
 'Frighteners, The (1996)': 4.0,
 'Full Metal Jacket (1987)': 3.0,
 'Full Monty, The (1997)': 5.0,
 'Gone with the Wind (1939)': 4.0,
 'Grand Day Out, A (1992)': 3.0,
 'Hudsucker Proxy, The (1994)': 5.0,
 'I.Q. (1994)': 3.0,
 'M*A*S*H (1970)': 3.0,
 'Mad Love (1995)': 2.0,
 'Mask, The (1994)': 4.0,
 'Maverick (1994)': 3.0,
 'Maya Lin: A Strong Clear Vision (1994)': 5.0,
 'Mimic (1997)': 2.0,
 'Nightmare Before Christmas, The (1993)': 5.0,
 'Nikita (La Femme Nikita) (1990)': 5.0,
 'Operation Dumbo Drop (1995)': 1.0,
 'Outbreak (1995)': 3.0,
 'Postino, Il (1994)': 5.0,
 'Priest (1994)': 5.0,
 'Princess Bride, The (1987)': 5.0,
 'Raging Bull (1980)': 4.0,
 'Reservoir Dogs (1992)': 4.0,
 'Return of the Jedi (1983)': 5.0,
 "Robert A. Heinlein's The Puppet Masters (1994)": 4.0,
 'Sgt. Bilko (1996)': 2.0,
 'Shall We Dance? (1996)': 4.0,
 'Silence of the Lambs, The (1991)': 4.0,
 'Sound of Music, The (1965)': 1.0,
 'Spitfire Grill, The (1996)': 2.0,
 'Star Trek: The Wrath of Khan (1982)': 5.0,
 'Taxi Driver (1976)': 4.0,
 'Toy Story (1995)': 5.0,
 'Turbo: A Power Rangers Movie (1997)': 1.0,
 'Twelve Monkeys (1995)': 4.0,
 'Unhook the Stars (1996)': 4.0,
 'Wallace & Gromit: The Best of Aardman Animation (1996)': 5.0,
 "Weekend at Bernie's (1989)": 3.0,
 "What's Eating Gilbert Grape (1993)": 4.0,
 'When the Cats Away (Chacun cherche son chat) (1996)': 4.0,
 'Young Frankenstein (1974)': 5.0}


In [123]:
import numpy

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    # R: rating matrix (0 = missing), P: |U|xK, Q: |I|xK, alpha: learning rate, beta: L2 regularization
    Q = Q.T
    for step in xrange(steps):
        # SGD pass over the observed entries
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = numpy.dot(P,Q)
        # regularized squared error on the observed entries
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T
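
The inner updates are plain stochastic gradient steps on the regularized squared error of each observed rating:

$$e_{ij} = r_{ij} - p_i^\top q_j, \qquad p_{ik} \leftarrow p_{ik} + \alpha\,(2\,e_{ij}\,q_{kj} - \beta\,p_{ik}), \qquad q_{kj} \leftarrow q_{kj} + \alpha\,(2\,e_{ij}\,p_{ik} - \beta\,q_{kj})$$

with learning rate $\alpha$ and L2 penalty $\beta$; the loop stops early once the total regularized error drops below 0.001.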

In [124]:
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = numpy.array(R)

N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)

In [125]:
nR


Out[125]:
array([[ 4.99650255,  2.93418578,  3.99234219,  0.99820419],
       [ 3.96298764,  2.33573468,  3.36035393,  0.99707133],
       [ 1.0716961 ,  0.82538354,  5.337668  ,  4.96193011],
       [ 0.9633362 ,  0.72181627,  4.33820223,  3.97311633],
       [ 1.81106643,  1.21517699,  4.91344271,  4.03428495]])


In [140]:
%%time

R = rawMatrix

N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K, steps=100)
nR = numpy.dot(nP, nQ.T)


CPU times: user 12min 15s, sys: 5.83 s, total: 12min 21s
Wall time: 12min 33s

In [141]:
nR[:5,:5]


Out[141]:
array([[ 0.28872932,  2.18817177,  1.85474169,  1.70232665,  1.99304327],
       [ 0.52041677,  4.00343944,  3.38358237,  3.14717666,  3.64726599],
       [ 0.5200989 ,  3.97133365,  3.36127841,  3.10588446,  3.61760816],
       [ 0.40158254,  3.22346401,  2.70251611,  2.60665046,  2.93853012],
       [ 0.60854536,  4.61584397,  3.9118434 ,  3.59312263,  4.2042843 ]])

In [142]:
rawMatrix[:5,:5]


Out[142]:
array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.],
       [ 0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [168]:
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")


Out[168]:
3.6673599590921162

In [169]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()


Out[169]:
0.86354390536474346