In [1]:
from random import random

Collect data


In [2]:
def loadMovieLens(path='./data/ml-100k'):
    # Map movie ids to titles (u.item is pipe-delimited: id|title|...)
    movies = {}
    for line in open(path + '/u.item'):
        movieid, title = line.split('|')[0:2]
        movies[movieid] = title

    # Load ratings (u.data is tab-delimited: user, movie id, rating, timestamp)
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)

    return prefs

In [3]:
data = loadMovieLens("data/ml-100k")

Explore data


In [4]:
data['3']


Out[4]:
{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Liar (1997)': 2.0,
 'Lost Highway (1997)': 2.0,
 'Mad City (1997)': 3.0,
 'Man Who Knew Too Little, The (1997)': 4.0,
 'Mimic (1997)': 2.0,
 'Mother (1996)': 5.0,
 'Murder at 1600 (1997)': 3.0,
 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)': 5.0,
 'Playing God (1997)': 1.0,
 'Prophecy II, The (1998)': 3.0,
 'Return of the Jedi (1983)': 4.0,
 "Schindler's List (1993)": 4.0,
 'Scream (1996)': 2.0,
 'Sphere (1998)': 3.0,
 'Spice World (1997)': 2.0,
 'Starship Troopers (1997)': 3.0,
 'U Turn (1997)': 3.0,
 "Ulee's Gold (1997)": 3.0,
 'Wag the Dog (1997)': 5.0,
 'Wedding Singer, The (1998)': 3.0}
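
A quick look at the overall size of the dataset: a minimal sketch, using only the prefs dict loaded above, that counts users and ratings.


In [ ]:
# Number of users and total number of ratings in the prefs dict.
n_users = len(data)
n_ratings = sum(len(ratings) for ratings in data.values())
print "users: %d, ratings: %d" % (n_users, n_ratings)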

Creation of the train set and test set

We want to split the data into two sets (train and test).

Here:

train = 80% of the dataset

test = 20% of the dataset


In [5]:
def split_train_test(data, percent_test):
    # Randomly assign each rating to the test set with probability
    # percent_test, otherwise to the train set.
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test

In [6]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
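
The split is random per rating, so the realized proportions fluctuate around 80/20. A quick sanity check, reusing the train and test dicts:


In [ ]:
# Realized train/test split ratio; should be close to 80/20.
n_train = sum(len(m) for m in train.values())
n_test = sum(len(m) for m in test.values())
print "train: %d ratings, test: %d ratings (%.1f%% test)" % (
    n_train, n_test, 100.0 * n_test / (n_train + n_test))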

Cleaning the train and test sets

We don't want users in the test set that are absent from the train set, and likewise for movies, so we delete them.


In [7]:
def get_movies(data):
    # Collect the set of movies appearing in a prefs dict (as dict keys).
    movies = {}
    for u in data:
        for m in data[u]:
            movies[m] = 0
    return movies

def get_users(data):
    # Collect the set of users appearing in a prefs dict (as dict keys).
    users = {}
    for u in data:
        users[u] = 0
    return users

def clean(d1, d2):
    # Keys present in exactly one of d1 and d2 (symmetric difference).
    to_erase = {}
    for i in d1:
        if i not in d2:
            to_erase[i] = 0
    for i in d2:
        if i not in d1:
            to_erase[i] = 0
    return to_erase

def _remove_users(test, rem):
    for u in rem:
        test.pop(u, None)

def _remove_movies(test, rem):
    for u in test:
        for m in rem:
            test[u].pop(m, None)

In [8]:
moviesToRemove = clean(get_movies(train), get_movies(test))
usersToRemove = clean(get_users(train), get_users(test))
_remove_users(test, usersToRemove)
_remove_movies(test, moviesToRemove)
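
To verify the cleaning step, every user and movie still in the test set should also appear in the train set (a small sanity-check sketch):


In [ ]:
# All remaining test users and test movies must exist in the train set.
train_movies = get_movies(train)
assert all(u in train for u in test)
assert all(m in train_movies for u in test for m in test[u])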

Collaborative Filtering classes


In [13]:
class BaselineMeanUser:
    # Baseline: predict each user's mean rating for every movie.
    def __init__(self):
        self.users = {}

    def fit(self, train):
        for user in train:
            note = 0
            for movie in train[user]:
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note

    def predict(self, user, movie):
        return self.users[user]

    def score(self, X):
        # Sum of squared errors divided by the number of distinct movies
        # (note: not the usual per-rating MSE).
        nb_movies = len(get_movies(X))
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user, movie) - X[user][movie]) ** 2
        return score / nb_movies


class BaselineMeanMovie:
    # Baseline: predict each movie's mean rating for every user.
    def __init__(self):
        self.movies = {}

    def fit(self, train):
        movies = get_movies(train)
        for movie in movies:
            note = 0
            cpt = 0
            for user in train:
                try:
                    note += train[user][movie]
                    cpt += 1
                except KeyError:
                    pass
            note = note / cpt
            self.movies[movie] = note

    def predict(self, user, movie):
        return self.movies[movie]

    def score(self, X):
        # Same metric as BaselineMeanUser.score.
        nb_movies = len(get_movies(X))
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user, movie) - X[user][movie]) ** 2
        return score / nb_movies

In [14]:
baseline_mu= BaselineMeanUser()
baseline_mm= BaselineMeanMovie()

In [15]:
baseline_mu.fit(train)
baseline_mm.fit(train)

In [16]:
print("score baseline mean user  ",baseline_mu.score(test))
print("score baseline mean movie ",baseline_mm.score(test))


('score baseline mean user  ', 15.993894035341953)
('score baseline mean movie ', 15.254203103208283)
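
Note that score divides the total squared error by the number of distinct movies rather than by the number of ratings, so the values above are not the usual MSE. For comparison, a per-rating MSE can be computed as in this sketch, reusing the fitted baselines:


In [ ]:
# Per-rating mean squared error, averaging over individual ratings
# instead of distinct movies.
def mse(model, X):
    errors = [(model.predict(u, m) - X[u][m]) ** 2
              for u in X for m in X[u]]
    return sum(errors) / len(errors)

print "per-rating MSE, mean user  ", mse(baseline_mu, test)
print "per-rating MSE, mean movie ", mse(baseline_mm, test)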


In [18]:
# NMF by alternative non-negative least squares using projected gradients
# Author: Chih-Jen Lin, National Taiwan University
# Python/numpy translation: Anthony Di Franco

from numpy import *
from numpy.linalg import norm
from time import time
from sys import stdout

def nmf(V,Winit,Hinit,tol,timelimit,maxiter):
    """
    (W,H) = nmf(V,Winit,Hinit,tol,timelimit,maxiter)
    W,H: output solution
    Winit,Hinit: initial solution
    tol: tolerance for a relative stopping condition
    timelimit, maxiter: limit of time and iterations
    """

    W = Winit; H = Hinit; initt = time();

    gradW = dot(W, dot(H, H.T)) - dot(V, H.T)
    gradH = dot(dot(W.T, W), H) - dot(W.T, V)
    initgrad = norm(r_[gradW, gradH.T])
    print 'Init gradient norm %f' % initgrad 
    tolW = max(0.001,tol)*initgrad
    tolH = tolW

    for iter in xrange(1,maxiter):
        # stopping condition
        projnorm = norm(r_[gradW[logical_or(gradW<0, W>0)],
                                     gradH[logical_or(gradH<0, H>0)]])
        if projnorm < tol*initgrad or time() - initt > timelimit: break

        (W, gradW, iterW) = nlssubprob(V.T,H.T,W.T,tolW,1000)
        W = W.T
        gradW = gradW.T

        if iterW==1: tolW = 0.1 * tolW

        (H,gradH,iterH) = nlssubprob(V,W,H,tolH,1000)
        if iterH==1: tolH = 0.1 * tolH

        if iter % 10 == 0: stdout.write('.')

    print '\nIter = %d Final proj-grad norm %f' % (iter, projnorm)
    return (W,H)

def nlssubprob(V,W,Hinit,tol,maxiter):
    """
    H, grad: output solution and gradient
    iter: #iterations used
    V, W: constant matrices
    Hinit: initial solution
    tol: stopping tolerance
    maxiter: limit of iterations
    """

    H = Hinit
    WtV = dot(W.T, V)
    WtW = dot(W.T, W) 

    alpha = 1; beta = 0.1;
    for iter in xrange(1, maxiter):
        grad = dot(WtW, H) - WtV
        projgrad = norm(grad[logical_or(grad < 0, H > 0)])
        if projgrad < tol: break

        # search step size
        for inner_iter in xrange(1, 20):
            Hn = H - alpha*grad
            Hn = where(Hn > 0, Hn, 0)
            d = Hn - H
            gradd = sum(grad * d)
            dQd = sum(dot(WtW, d) * d)
            suff_decr = 0.99*gradd + 0.5*dQd < 0
            if inner_iter == 1:
                decr_alpha = not suff_decr; Hp = H
            if decr_alpha:
                if suff_decr:
                    H = Hn; break
                else:
                    alpha = alpha * beta
            else:
                if not suff_decr or (Hp == Hn).all():
                    H = Hp; break
                else:
                    alpha = alpha/beta; Hp = Hn

    if iter == maxiter:
        print 'Max iter in nlssubprob'
    return (H, grad, iter)

In [33]:
from numpy import *
# nmf() and nlssubprob() are defined in the previous cell

w1 = array([[1,2,3],[4,5,6]])
h1 = array([[1,2],[3,4],[5,6]])
w2 = array([[1,1,3],[4,5,6]])
h2 = array([[1,1],[3,4],[5,6]])
w3 = array([[2,2,2],[2,2,2]])
h3 = array([[2,2],[2,2],[2,2]])

# v is the ratings matrix
# v = dot(w1, h1)
v = array([[4, 0], [4, 4]])

(wo,ho) = nmf(v, w3, h3, 0.001, 10, 10)
print wo
print ho


Init gradient norm 125.475097

Iter = 5 Final proj-grad norm 0.119379
[[ 0.3904523   0.3904523   0.3904523 ]
 [ 0.63257492  0.63257492  0.63257492]]
[[ 2.46681606  1.52881008]
 [ 2.46681606  1.52881008]
 [ 2.46681606  1.52881008]]

In [34]:
print v
print dot(wo,ho)


[[4 0]
 [4 4]]
[[ 2.88952199  1.79078222]
 [ 4.68133791  2.90126074]]
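
The same factorization can be applied to the MovieLens ratings built earlier. A minimal sketch, with two loud assumptions: missing ratings are filled with zeros (which biases the factorization; a proper treatment would mask them out), and the rank k = 10 is an arbitrary choice.


In [ ]:
import numpy as np

# Build a user x movie ratings matrix from the train set.
# Assumption: unrated entries are left at 0.
users = list(train.keys())
movie_list = list(get_movies(train).keys())
uidx = dict((u, i) for i, u in enumerate(users))
midx = dict((m, j) for j, m in enumerate(movie_list))

V = np.zeros((len(users), len(movie_list)))
for u in train:
    for m in train[u]:
        V[uidx[u], midx[m]] = train[u][m]

k = 10  # rank of the factorization (arbitrary choice)
np.random.seed(0)
W0 = np.random.rand(len(users), k)
H0 = np.random.rand(k, len(movie_list))
(W, H) = nmf(V, W0, H0, 0.001, 60, 100)

# Compare one held-out rating with its reconstruction.
u0 = next(u for u in test if test[u])
m0 = next(iter(test[u0]))
print "predicted %.2f, actual %.1f" % (np.dot(W, H)[uidx[u0], midx[m0]], test[u0][m0])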
