In [1]:
import numpy as np
import pandas as pd
import copy
import collections
from scipy import linalg
import math
from collections import defaultdict

In [65]:
#data
df = pd.read_csv('./data/ml-100k/u.data',sep='\t',header=None)
#movie list
df_info = pd.read_csv('./data/ml-100k/u.item',sep='|',header=None)
movielist = [df_info[1].tolist()[indx]+';'+str(indx+1) for indx in xrange(len(df_info[1].tolist()))]
nmovies = len(movielist)
nusers = len(df[0].drop_duplicates().tolist())  

min_ratings = 50
movies_rated  = list(df[1]) 
counts = collections.Counter(movies_rated)
dfout = pd.DataFrame(columns=['user']+movielist)

toremovelist = []
for i in range(1,nusers):
    tmpmovielist = [0 for j in range(nmovies)]
    dftmp =df[df[0]==i]
    for k in dftmp.index:
        if counts[dftmp.ix[k][1]]>= min_ratings:           
           tmpmovielist[dftmp.ix[k][1]-1] = dftmp.ix[k][2]
           
        else:
           toremovelist.append(dftmp.ix[k][1])
            
    dfout.loc[i] = [i]+tmpmovielist
  
toremovelist = list(set(toremovelist))
dfout.drop(dfout.columns[toremovelist], axis=1, inplace=True)
dfout.to_csv('data/utilitymatrix.csv',index=None)

In [38]:
df = pd.read_csv('data/utilitymatrix.csv')
df.head(2)


Out[38]:
user Toy Story (1995);1 GoldenEye (1995);2 Four Rooms (1995);3 Get Shorty (1995);4 Copycat (1995);5 Twelve Monkeys (1995);7 Babe (1995);8 Dead Man Walking (1995);9 Richard III (1995);10 ... Cool Runnings (1993);1035 Hamlet (1996);1039 Forget Paris (1995);1041 Multiplicity (1996);1047 She's the One (1996);1048 Koyaanisqatsi (1983);1065 Shallow Grave (1994);1073 Reality Bites (1994);1074 Six Degrees of Separation (1993);1101 Some Kind of Wonderful (1987);1119
0 1 5 3 4 3 3 4 1 5 3 ... 0 0 0 0 0 0 0 0 0 0
1 2 4 0 0 0 0 0 0 0 2 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 604 columns


In [2]:
def imputation(inp,Ri):
    Ri = Ri.astype(float)
    def userav():
        for i in xrange(len(Ri)):
            Ri[i][Ri[i]==0] = sum(Ri[i])/float(len(Ri[i][Ri[i]>0]))
        return Ri
    def itemav():
        for i in xrange(len(Ri[0])):
            Ri[:,i][Ri[:,i]==0] = sum(Ri[:,i])/float(len(Ri[:,i][Ri[:,i]>0]))
        return Ri            
    switch = {'useraverage':userav(),'itemaverage':itemav()}
    return switch[inp]

In [3]:
from scipy.stats import pearsonr
from scipy.spatial.distance import cosine 
def sim(x,y,metric='cos'):
    if metric == 'cos':
       return 1.-cosine(x,y)
    else:#correlation
       return pearsonr(x,y)[0]

In [4]:
def CF_userbased(u_vec,K,data,indxs=False):
    def FindKNeighbours(r,data,K):
        neighs = []
        cnt=0
        for u in xrange(len(data)):
            if data[u,r]>0 and cnt<K:
               neighs.append(data[u])   
               cnt +=1 
            elif cnt==K:
               break
        return np.array(neighs)
        
    def CalcRating(u_vec,r,neighs):
        rating = 0.
        den = 0.
        for j in xrange(len(neighs)):
            rating += neighs[j][-1]*float(neighs[j][r]-neighs[j][neighs[j]>0][:-1].mean())
            den += abs(neighs[j][-1])
        if den>0:
            rating = np.round(u_vec[u_vec>0].mean()+(rating/den),0)
        else:
            rating = np.round(u_vec[u_vec>0].mean(),0)
        if rating>5:
            return 5.
        elif rating<1:
            return 1.
        return rating 
    #add similarity col
    data = data.astype(float)
    nrows = len(data)
    ncols = len(data[0])
    data_sim = np.zeros((nrows,ncols+1))
    data_sim[:,:-1] = data
    #calc similarities:
    for u in xrange(nrows):
        if np.array_equal(data_sim[u,:-1],u_vec)==False: #list(data_sim[u,:-1]) != list(u_vec):
           data_sim[u,ncols] = sim(data_sim[u,:-1],u_vec,'pearson')
        else:
           data_sim[u,ncols] = 0.
    #order by similarity:
    data_sim =data_sim[data_sim[:,ncols].argsort()][::-1]
    #find the K users for each item not rated:
    u_rec = np.zeros(len(u_vec))
    for r in xrange(ncols):
        if u_vec[r]==0:
           neighs = FindKNeighbours(r,data_sim,K)
           #calc the predicted rating
           u_rec[r] = CalcRating(u_vec,r,neighs)
    if indxs:
            #take out the rated movies
            seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
            u_rec[seenindxs] = -1
            recsvec = np.argsort(u_rec)[::-1][np.argsort(u_rec)>0]
        
            return recsvec    
    return u_rec

In [5]:
class CF_itembased(object):
    def __init__(self,data):
        #calc item similarities matrix
        nitems = len(data[0])
        self.data = data
        self.simmatrix = np.zeros((nitems,nitems))
        for i in xrange(nitems):
            for j in xrange(nitems):
                if j>=i:#triangular matrix
                   self.simmatrix[i,j] = sim(data[:,i],data[:,j])
                else:
                   self.simmatrix[i,j] = self.simmatrix[j,i]

    def GetKSimItemsperUser(self,r,K,u_vec):
        items = np.argsort(self.simmatrix[r])[::-1]
        items = items[items!=r]
        cnt=0
        neighitems = []
        for i in items:
            if u_vec[i]>0 and cnt<K:
               neighitems.append(i)
               cnt+=1
            elif cnt==K:
               break
        return neighitems
        
    def CalcRating(self,r,u_vec,neighitems):
        rating = 0.
        den = 0.
        for i in neighitems:
            rating +=  self.simmatrix[r,i]*u_vec[i]
            den += abs(self.simmatrix[r,i])
        if den>0:
            rating = np.round(rating/den,0)
        else:
            rating = np.round(self.data[:,r][self.data[:,r]>0].mean(),0)
        return rating
        
    def CalcRatings(self,u_vec,K,indxs=False):
        #u_rec = copy.copy(u_vec)
        u_rec = np.zeros(len(u_vec))
        for r in xrange(len(u_vec)):
            if u_vec[r]==0:
               neighitems = self.GetKSimItemsperUser(r,K,u_vec)
               #calc predicted rating
               u_rec[r] = self.CalcRating(r,u_vec,neighitems)
        if indxs:
            #take out the rated movies
            seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
            u_rec[seenindxs]=-1
            recsvec = np.argsort(u_rec)[::-1][np.argsort(u_rec)>0]
        
            return recsvec
        return u_rec

In [6]:
class SlopeOne(object):
    def __init__(self,Umatrix):
        #calc item similarities matrix
        nitems = len(Umatrix[0])
        self.difmatrix = np.zeros((nitems,nitems))
        self.nratings = np.zeros((nitems,nitems))
        def diffav_n(x,y):
            xy = np.vstack((x, y)).T
            xy = xy[(xy[:,0]>0) & (xy[:,1]>0)]
            nxy = len(xy)
            if nxy == 0:
                #print 'no common'
                return [1000.,0]
            return [float(sum(xy[:,0])-sum(xy[:,1]))/nxy,nxy]
            
        for i in xrange(nitems):
            for j in xrange(nitems):
                if j>=i:#triangular matrix                 
                   self.difmatrix[i,j],self.nratings[i,j] = diffav_n(Umatrix[:,i],Umatrix[:,j])
                else:
                   self.difmatrix[i,j] = -self.difmatrix[j,i]
                   self.nratings[i,j] = self.nratings[j,i]
        
    def GetKSimItemsperUser(self,r,K,u_vec):
        items = np.argsort(self.difmatrix[r])
        items = items[items!=r]
        cnt=0
        neighitems = []
        for i in items:
            if u_vec[i]>0 and cnt<K:
               neighitems.append(i)
               cnt+=1
            elif cnt==K:
               break
        return neighitems
        
    def CalcRating(self,r,u_vec,neighitems):
        rating = 0.
        den = 0.
        for i in neighitems:
            if abs(self.difmatrix[r,i])!=1000:
               rating +=  (self.difmatrix[r,i]+u_vec[i])*self.nratings[r,i]
               den += self.nratings[r,i]
        if den==0:
            #print 'no similar diff'
            return 0.
        rating = np.round(rating/den,0)
        if rating >5:
            return 5.
        elif rating <1.:
            return 1.
        return rating
        
    def CalcRatings(self,u_vec,K):
        #u_rec = copy.copy(u_vec)
        u_rec = np.zeros(len(u_vec))
        for r in xrange(len(u_vec)):
            if u_vec[r]==0:
               neighitems = self.GetKSimItemsperUser(r,K,u_vec)
               #calc predicted rating
               u_rec[r] = self.CalcRating(r,u_vec,neighitems)
        return u_rec

In [7]:
def SGD(Umatrix, K, iterations=100, alpha=0.00001, l=0.001, tol=0.001):

    nrows = len(Umatrix)
    ncols = len(Umatrix[0])  
    P = np.random.rand(nrows,K)
    Q = np.random.rand(ncols,K)
    Qt = Q.T
    cost=-1
    for it in xrange(iterations):
        for i in xrange(nrows):
            for j in xrange(ncols):
                if Umatrix[i][j] > 0:
                   eij = Umatrix[i][j] -np.dot(P[i,:],Qt[:,j])
                   for k in xrange(K):
                       P[i][k] += alpha*(2*eij*Qt[k][j]-l*P[i][k])
                       Qt[k][j] += alpha*(2*eij*P[i][k]-l*Qt[k][j]) 
        cost = 0
        for i in xrange(nrows):
            for j in xrange(ncols):
                if Umatrix[i][j]>0:
                   cost += pow(Umatrix[i][j]-np.dot(P[i,:],Qt[:,j]),2)
                   for k in xrange(K):
                       cost += float(l/2.0)*(pow(P[i][k],2)+pow(Qt[k][j],2))
        if cost < tol:
           break
    return np.round(np.dot(P,Qt),0)

In [8]:
def ALS(Umatrix, K, iterations=50, l=0.001, tol=0.001):

    nrows = len(Umatrix)
    ncols = len(Umatrix[0])  
    P = np.random.rand(nrows,K)
    Q = np.random.rand(ncols,K)
    Qt = Q.T
    err = 0.
    Umatrix = Umatrix.astype(float)
    mask = Umatrix>0.
    mask[mask==True]=1
    mask[mask==False]=0
    mask = mask.astype(np.float64, copy=False)
    for it in xrange(iterations):
        for u, mask_u in enumerate(mask):
            P[u] = np.linalg.solve(np.dot(Qt, np.dot(np.diag(mask_u), Qt.T)) + l*np.eye(K), 
                                np.dot(Qt, np.dot(np.diag(mask_u), Umatrix[u].T))).T
        for i, mask_i in enumerate(mask.T):
            Qt[:,i] = np.linalg.solve(np.dot(P.T, np.dot(np.diag(mask_i), P)) + l*np.eye(K),
                                np.dot(P.T, np.dot(np.diag(mask_i), Umatrix[:,i])))                            
        err=np.sum((mask*(Umatrix - np.dot(P, Qt)))**2)
        if err < tol:
            break
    return np.round(np.dot(P,Qt),0)

In [9]:
from sklearn.decomposition import NMF
def NMF_alg(Umatrix,K,inp='none',l=0.001):
    R_tmp = copy.copy(Umatrix)
    R_tmp = R_tmp.astype(float)
    #imputation
    if inp != 'none':
        R_tmp = imputation(inp,Umatrix)
    nmf = NMF(n_components=K,alpha=l)
    P = nmf.fit_transform(R_tmp)
    R_tmp = np.dot(P,nmf.components_)
    return R_tmp

In [10]:
from sklearn.decomposition import TruncatedSVD
def SVD(Umatrix,K,inp='none'):
    R_tmp = copy.copy(Umatrix)
    R_tmp = R_tmp.astype(float)
    #imputation
    if inp != 'none':
        R_tmp = imputation(inp,Umatrix)     

    means = np.array([ R_tmp[i][R_tmp[i]>0].mean() for i in xrange(len(R_tmp))]).reshape(-1,1)
    R_tmp = R_tmp-means
    svd = TruncatedSVD(n_components=K, random_state=4)
    R_k = svd.fit_transform(R_tmp)
    R_tmp = svd.inverse_transform(R_k)
    R_tmp = means+R_tmp
    
    return np.round(R_tmp,0)

In [11]:
def SVD_EM(Umatrix,K,inp='none',iterations=50,tol=0.001):
    R_tmp = copy.copy(Umatrix)
    R_tmp = R_tmp.astype(float)
    nrows = len(Umatrix)
    ncols = len(Umatrix[0])
    #imputation
    if inp != 'none':
        R_tmp = imputation(inp,Umatrix)
    #define svd
    svd = TruncatedSVD(n_components=K, random_state=4)
    err = -1
    for it in xrange(iterations):
        #m-step
        R_k = svd.fit_transform(R_tmp)
        R_tmp = svd.inverse_transform(R_k)
        #e-step and error evaluation
        err = 0
        for i in xrange(nrows):
            for j in xrange(ncols):
                if Umatrix[i][j]>0:
                   err += pow(Umatrix[i][j]-R_tmp[i][j],2)
                   R_tmp[i][j] = Umatrix[i][j]                   
                   
        if err < tol:
            print it,'toll reached!'
            break
    return np.round(R_tmp,0)

In [66]:
#matrix movies's content
movieslist = [int(m.split(';')[-1]) for m in dfout.columns[1:]]
moviescats = ['unknown','Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary',
              'Drama','Fantasy','Film-Noir','Horror','Musical','Mystery',
              'Romance','Sci-Fi','Thriller','War','Western']
dfout_movies =  pd.DataFrame(columns=['movie_id']+moviescats)
startcatsindx = 5
cnt= 0
for m in movieslist:
    dfout_movies.loc[cnt] = [m]+df_info.iloc[m-1][startcatsindx:].tolist()
    cnt +=1 
print dfout_movies.head()

dfout_movies.to_csv('data/movies_content.csv',index=None)


   movie_id  unknown  Action  Adventure  Animation  Children's  Comedy  Crime  \
0         1        0       0          0          1           1       1      0   
1         2        0       1          1          0           0       0      0   
2         3        0       0          0          0           0       0      0   
3         4        0       1          0          0           0       1      0   
4         5        0       0          0          0           0       0      1   

   Documentary  Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  \
0            0      0        0          0       0        0        0        0   
1            0      0        0          0       0        0        0        0   
2            0      0        0          0       0        0        0        0   
3            0      1        0          0       0        0        0        0   
4            0      1        0          0       0        0        0        0   

   Sci-Fi  Thriller  War  Western  
0       0         0    0        0  
1       0         1    0        0  
2       0         1    0        0  
3       0         0    0        0  
4       0         1    0        0  

In [12]:
class CBF_averageprofile(object):
    def __init__(self,Movies,Movieslist):
        #calc user profiles:
        self.nfeatures = len(Movies[0])
        self.Movieslist = Movieslist 
        self.Movies = Movies
        
    def GetRecMovies(self,u_vec,indxs=False):
        #generate user profile
        nmovies = len(u_vec)
        nfeatures = self.nfeatures
        mean_u = u_vec[u_vec>0].mean()
        diff_u = u_vec-mean_u
        features_u = np.zeros(nfeatures).astype(float)
        cnts = np.zeros(nfeatures)
        for m in xrange(nmovies):
            if u_vec[m]>0:#u has rated m
               features_u += self.Movies[m]*(diff_u[m])
               cnts += self.Movies[m]
        #average:
        for m in xrange(nfeatures):
            if cnts[m]>0:
               features_u[m] = features_u[m]/float(cnts[m])
               
        #calc sim:
        sims = np.zeros(nmovies)
        for m in xrange(nmovies):
            if u_vec[m]==0:#sim only for movies not yet rated by the user
               sims[m] = sim(features_u,self.Movies[m])
        #order movies
        order_movies_indxs = np.argsort(sims)[::-1] 
        if indxs:
            return order_movies_indxs
        return self.Movieslist[order_movies_indxs]

In [13]:
class CBF_regression(object):
    def __init__(self,Movies,Umatrix,alpha=0.01,l=0.0001,its=50,tol=0.001):
        #calc parameters:
        self.nfeatures = len(Movies[0])+1#intercept
        nusers = len(Umatrix)
        nmovies = len(Umatrix[0])
        #add intercept col
        movies_feats = np.ones((nmovies,self.nfeatures))
        movies_feats[:,1:] = Movies
        self.movies_feats = movies_feats.astype(float)
        
        #set Umatrix as float
        self.Umatrix = Umatrix.astype(float)
        #initialize the matrix:
        Pmatrix = np.random.rand(nusers,self.nfeatures)
        Pmatrix[:,0]=1.
        err = 0.
        cost = -1
        for it in xrange(its):
            print 'it:',it,' -- ',cost
            for u in xrange(nusers):
                for f in xrange(self.nfeatures):                    
                    if f==0:#no regularization
                        for m in xrange(nmovies):
                            if self.Umatrix[u,m]>0:
                               diff = np.dot(Pmatrix[u],self.movies_feats[m])-self.Umatrix[u,m]
                               Pmatrix[u,f] += -alpha*(diff*self.movies_feats[m][f])
                    else:
                        for m in xrange(nmovies):
                            if self.Umatrix[u,m]>0:
                               diff = np.dot(Pmatrix[u],self.movies_feats[m])-self.Umatrix[u,m]
                               Pmatrix[u,f] += -alpha*(diff*self.movies_feats[m][f] +l*Pmatrix[u][f])        
                
            cost = 0
            for u in xrange(nusers):
                for m in xrange(nmovies):
                    if self.Umatrix[u][m]>0:
                       cost += 0.5*pow(Umatrix[u][m]-np.dot(Pmatrix[u],self.movies_feats[m]),2)
                for f in xrange(1,self.nfeatures):
                    cost += float(l/2.0)*(pow(Pmatrix[u][f],2))
            if cost < tol:
               print 'err',cost
               break
        self.Pmatrix = Pmatrix
        
    def CalcRatings(self,u_vec):
        #find u_vec
        s = 0.
        u_feats = np.zeros(len(self.Pmatrix[0]))
        #in case the user is not present in the utility matrix find the most similar
        for u in xrange(len(self.Umatrix)):
            #print self.Umatrix[u]
            tmps = sim(self.Umatrix[u],u_vec)
            if tmps > s:
                s = tmps
                u_feats = self.Pmatrix[u]
            if s == 1.:
                break
        new_vec = np.zeros(len(u_vec))
        for r in xrange(len(u_vec)):
            if u_vec[r]==0:
                new_vec[r] = np.dot(u_feats,self.movies_feats[r])
        return new_vec

In [14]:
class LogLikelihood(object):
    def __init__(self,Umatrix,Movieslist,likethreshold=3):
        self.Movieslist = Movieslist
        #calculate loglikelihood ratio for each pair
        self.nusers = len(Umatrix)
        self.Umatrix =Umatrix
        self.likethreshold = likethreshold
        self.likerange = range(self.likethreshold+1,5+1)
        self.dislikerange = range(1,self.likethreshold+1)
        self.loglikelihood_ratio()

    def calc_k(self,a,b):
        tmpk = [[0 for j in range(2)] for i in range(2)]
        for ratings in self.Umatrix:
            if ratings[a] in self.likerange and ratings[b] in self.likerange:
               tmpk[0][0] += 1
            if ratings[a] in self.likerange and ratings[b] in self.dislikerange:
                tmpk[0][1] += 1
            if ratings[a] in self.dislikerange and ratings[b] in self.likerange:
                tmpk[1][0] += 1
            if ratings[a] in self.dislikerange and ratings[b] in self.dislikerange:
                tmpk[1][1] += 1
        return tmpk
        
    def calc_llr(self,k_matrix):
        Hcols=Hrows=Htot=0.0
        if sum(k_matrix[0])+sum(k_matrix[1])==0:
            return 0.
        invN = 1.0/(sum(k_matrix[0])+sum(k_matrix[1])) 
        for i in range(0,2):
            if((k_matrix[0][i]+k_matrix[1][i])!=0.0):
               Hcols += invN*(k_matrix[0][i]+k_matrix[1][i])*math.log((k_matrix[0][i]+k_matrix[1][i])*invN )#sum of rows
            if((k_matrix[i][0]+k_matrix[i][1])!=0.0):
               Hrows += invN*(k_matrix[i][0]+k_matrix[i][1])*math.log((k_matrix[i][0]+k_matrix[i][1])*invN )#sum of cols
            for j in range(0,2):
                if(k_matrix[i][j]!=0.0):
                   Htot +=invN*k_matrix[i][j]*math.log(invN*k_matrix[i][j])
        return 2.0*(Htot-Hcols-Hrows)/invN

    def loglikelihood_ratio(self):
        nitems = len(self.Movieslist)
        self.items_llr= pd.DataFrame(np.zeros((nitems,nitems))).astype(float)
        for i in xrange(nitems):
            for j in xrange(nitems):
                if(j>=i):
                   tmpk=self.calc_k(i,j)
                   self.items_llr.ix[i,j] = self.calc_llr(tmpk)
                else:
                   self.items_llr.ix[i,j] = self.items_llr.iat[j,i]
        
    def GetRecItems(self,u_vec,indxs=False):
        items_weight = np.dot(u_vec,self.items_llr)
        sortedweight = np.argsort(items_weight)
        seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
        seenmovies = np.array(self.Movieslist)[seenindxs]
        #remove seen items
        recitems = np.array(self.Movieslist)[sortedweight]
        recitems = [m for m in recitems if m not in seenmovies]
        if indxs:
            items_weight[seenindxs]=-1
            recsvec = np.argsort(items_weight)[::-1][np.argsort(items_weight)>0]
            return recsvec
        return recitems[::-1]

In [15]:
class AssociationRules(object):
    def __init__(self,Umatrix,Movieslist,min_support=0.1,min_confidence=0.1,likethreshold=3):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.Movieslist = Movieslist
        #transform utility matrix to sets of liked items
        nitems = len(Umatrix[0])
        transactions = []
        for u in Umatrix:
            s = [i for i in xrange(len(u)) if u[i]>likethreshold]
            if len(s)>0:
               transactions.append(s)
        #find sets of 2 items
        flat = [item for sublist in transactions for item in sublist]
        inititems = map(frozenset,[ [item] for item in frozenset(flat)])
        set_trans = map(set, transactions)
        sets_init, self.dict_sets_support = self.filterSet(set_trans, inititems)
        setlen = 2
        items_tmp = self.combine_lists(sets_init, setlen)
        self.freq_sets, sup_tmp = self.filterSet(set_trans, items_tmp)
        self.dict_sets_support.update(sup_tmp)
        self.ass_matrix = np.zeros((nitems,nitems))
        for freqset in self.freq_sets:
            #print 'freqset',freqset
            list_setitems = [frozenset([item]) for item in freqset]
            #print "freqSet", freqset, 'H1', list_setitems
            self.calc_confidence_matrix(freqset, list_setitems)
        
    def filterSet(self,set_trans, likeditems):
        itemscnt = {}
        for id in set_trans:
            for item in likeditems:
                if item.issubset(id):
                    itemscnt.setdefault(item, 0)
                    itemscnt[item] += 1
        num_items = float(len(set_trans))
        freq_sets = []
        dict_sets = {}
        for key in itemscnt:
            support = itemscnt[key] / num_items
            if support >= self.min_support:
                freq_sets.insert(0, key)
            dict_sets[key] = support
        return freq_sets, dict_sets
        
    def combine_lists(self,freq_sets, setlen):
        setitems_list = []
        nsets = len(freq_sets)
        for i in range(nsets):
            for j in range(i + 1, nsets):
                setlist1 = list(freq_sets[i])[:setlen - 2]
                setlist2 = list(freq_sets[j])[:setlen - 2]
                if set(setlist1) == set(setlist2):
                    setitems_list.append(freq_sets[i].union(freq_sets[j]))
        return setitems_list
        
    def calc_confidence_matrix(self,freqset, list_setitems):
        for target in list_setitems:
            confidence = self.dict_sets_support[freqset] / self.dict_sets_support[freqset - target]
            if confidence >= self.min_confidence:
                self.ass_matrix[list(freqset - target)[0]][list(target)[0]] = confidence
                
    def GetRecItems(self,u_vec,indxs=False):
        vec_recs = np.dot(u_vec,self.ass_matrix)
        sortedweight = np.argsort(vec_recs)
        seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
        seenmovies = np.array(self.Movieslist)[seenindxs]
        #remove seen items
        recitems = np.array(self.Movieslist)[sortedweight]
        recitems = [m for m in recitems if m not in seenmovies]
        if indxs:
            vec_recs[seenindxs]=-1
            recsvec = np.argsort(vec_recs)[::-1][np.argsort(vec_recs)>0]
            return recsvec
        return recitems[::-1]

In [16]:
class Hybrid_cbf_cf(object):
    def __init__(self,Movies,Movieslist,Umatrix):
        #calc user profiles:
        self.nfeatures = len(Movies[0])
        self.Movieslist = Movieslist 
        self.Movies = Movies.astype(float)
        self.Umatrix_mfeats = np.zeros((len(Umatrix),len(Umatrix[0])+self.nfeatures))
        means = np.array([ Umatrix[i][Umatrix[i]>0].mean() for i in xrange(len(Umatrix))]).reshape(-1,1)
        diffs = np.array([ [Umatrix[i][j]-means[i] if Umatrix[i][j]>0 else 0. 
                            for j in xrange(len(Umatrix[i]))  ] for i in xrange(len(Umatrix))])
        self.Umatrix_mfeats[:,:len(Umatrix[0])] = Umatrix#diffs
        self.nmovies = len(Movies)
        #calc item features for each user
        for u in xrange(len(Umatrix)):
            u_vec = Umatrix[u]
            self.Umatrix_mfeats[u,len(Umatrix[0]):] = self.GetUserItemFeatures(u_vec)
            
    def GetUserItemFeatures(self,u_vec):
        mean_u = u_vec[u_vec>0].mean()
        #diff_u = u_vec-mean_u
        features_u = np.zeros(self.nfeatures).astype(float)
        cnts = np.zeros(self.nfeatures)
        for m in xrange(self.nmovies):
            if u_vec[m]>0:#u has rated m
               features_u += self.Movies[m]*u_vec[m]#self.Movies[m]*(diff_u[m])
               cnts += self.Movies[m]
        #average:
        for m in xrange(self.nfeatures):
            if cnts[m]>0:
               features_u[m] = features_u[m]/float(cnts[m])
        return features_u
    def CalcRatings(self,u_vec,K):
        def FindKNeighbours(r,data,K):
            neighs = []
            cnt=0
            for u in xrange(len(data)):
                if data[u,r]>0 and cnt<K:
                   neighs.append(data[u])   
                   cnt +=1 
                elif cnt==K:
                   break
            return np.array(neighs)
        
        def CalcRating(u_vec,r,neighs):
            rating = 0.
            den = 0.
            for j in xrange(len(neighs)):
                rating += neighs[j][-1]*float(neighs[j][r]-neighs[j][neighs[j]>0][:-1].mean())
                den += abs(neighs[j][-1])
            if den>0:
                rating = np.round(u_vec[u_vec>0].mean()+(rating/den),0)
            else:
                rating = np.round(u_vec[u_vec>0].mean(),0)
            if rating>5:
                return 5.
            elif rating<1:
                return 1.
            return rating
        #add similarity col
        nrows = len(self.Umatrix_mfeats)
        ncols = len(self.Umatrix_mfeats[0])
        data_sim = np.zeros((nrows,ncols+1))
        data_sim[:,:-1] = self.Umatrix_mfeats
        u_rec = np.zeros(len(u_vec))
        #calc similarities:
        mean = u_vec[u_vec>0].mean()
        u_vec_feats = u_vec#np.array([u_vec[i]-mean if u_vec[i]>0 else 0 for i in xrange(len(u_vec))])
        u_vec_feats = np.append(u_vec_feats,self.GetUserItemFeatures(u_vec))
        
        for u in xrange(nrows):
            if np.array_equal(data_sim[u,:-1],u_vec)==False: #list(data_sim[u,:-1]) != list(u_vec):
               data_sim[u,ncols] = sim(data_sim[u,:-1],u_vec_feats)
            else:
               data_sim[u,ncols] = 0.
        #order by similarity:
        data_sim =data_sim[data_sim[:,ncols].argsort()][::-1]
        #find the K users for each item not rated:
        
        for r in xrange(self.nmovies):
            if u_vec[r]==0:
               neighs = FindKNeighbours(r,data_sim,K)
               #calc the predicted rating
               u_rec[r] = CalcRating(u_vec,r,neighs)
        return u_rec

In [17]:
class Hybrid_svd(object):
    def __init__(self,Movies,Movieslist,Umatrix,K,inp):
        #calc user profiles:
        self.nfeatures = len(Movies[0])
        self.Movieslist = Movieslist 
        self.Movies = Movies.astype(float)
        
        R_tmp = copy.copy(Umatrix)
        R_tmp = R_tmp.astype(float)
        #imputation
        
        if inp != 'none':
            R_tmp = imputation(inp,Umatrix)
        Umatrix_mfeats = np.zeros((len(Umatrix),len(Umatrix[0])+self.nfeatures))
        means = np.array([ Umatrix[i][Umatrix[i]>0].mean() for i in xrange(len(Umatrix))]).reshape(-1,1)
        diffs = np.array([ [float(Umatrix[i][j]-means[i]) 
                            if Umatrix[i][j]>0 else float(R_tmp[i][j]-means[i]) for j in xrange(len(Umatrix[i]))  ] 
                          for i in xrange(len(Umatrix))])
        Umatrix_mfeats[:,:len(Umatrix[0])] = diffs#R_tmp
        self.nmovies = len(Movies)
        #calc item features for each user
        for u in xrange(len(Umatrix)):
            u_vec = Umatrix[u]
            Umatrix_mfeats[u,len(Umatrix[0]):] = self.GetUserItemFeatures(u_vec)
        
        #calc svd
        svd = TruncatedSVD(n_components=K, random_state=4)
        R_k = svd.fit_transform(Umatrix_mfeats)
        R_tmp = means+svd.inverse_transform(R_k)
        self.matrix = np.round(R_tmp[:,:self.nmovies],0)
        
        
    def GetUserItemFeatures(self,u_vec):
        mean_u = u_vec[u_vec>0].mean()
        diff_u = u_vec-mean_u
        features_u = np.zeros(self.nfeatures).astype(float)
        cnts = np.zeros(self.nfeatures)
        for m in xrange(self.nmovies):
            if u_vec[m]>0:#u has rated m
               features_u += self.Movies[m]*(diff_u[m])#self.Movies[m]*u_vec[m]
               cnts += self.Movies[m]
        #average:
        for m in xrange(self.nfeatures):
            if cnts[m]>0:
               features_u[m] = features_u[m]/float(cnts[m])
        return features_u

In [18]:
def cross_validation(df,k):
    val_num = int(len(df)/float(k))
    print val_num
    df_trains = []
    df_vals = []
    for i in xrange(k):
        start_val = (k-i-1)*val_num
        end_val = start_val+val_num
        df_trains.append(pd.concat([df[:start_val],df[end_val:]]))
        df_vals.append(df[start_val:end_val])

    return df_trains,df_vals

In [19]:
import random
def HideRandomRatings(u_vec, ratiovals=0.5):
    u_test = np.zeros(len(u_vec))
    u_vals = np.zeros(len(u_vec))
    cnt = 0
    nratings = len(u_vec[u_vec>0])
    for i in xrange(len(u_vec)):
        if u_vec[i]>0:        
            if bool(random.getrandbits(1)) or cnt>=int(nratings*ratiovals):
                u_test[i]=u_vec[i]
            else:#random choice to hide the rating:
                cnt +=1
                u_vals[i]=u_vec[i]
    return u_test,u_vals

In [20]:
#load data
df = pd.read_csv('data/utilitymatrix.csv')
print df.head(4)
df_movies = pd.read_csv('data/movies_content.csv')
movies = df_movies.values[:,1:]
print 'check:::',len(df.columns[1:]),'--',len(df_movies)
movieslist = list(df.columns[1:])
#k-fold cv 5 folds
nfolds = 5
df_trains,df_vals = cross_validation(df,nfolds)


   user  Toy Story (1995);1  GoldenEye (1995);2  Four Rooms (1995);3  \
0     1                   5                   3                    4   
1     2                   4                   0                    0   
2     3                   0                   0                    0   
3     4                   0                   0                    0   

   Get Shorty (1995);4  Copycat (1995);5  Twelve Monkeys (1995);7  \
0                    3                 3                        4   
1                    0                 0                        0   
2                    0                 0                        0   
3                    0                 0                        0   

   Babe (1995);8  Dead Man Walking (1995);9  Richard III (1995);10  \
0              1                          5                      3   
1              0                          0                      2   
2              0                          0                      0   
3              0                          0                      0   

                  ...                  Cool Runnings (1993);1035  \
0                 ...                                          0   
1                 ...                                          0   
2                 ...                                          0   
3                 ...                                          0   

   Hamlet (1996);1039  Forget Paris (1995);1041  Multiplicity (1996);1047  \
0                   0                         0                         0   
1                   0                         0                         0   
2                   0                         0                         0   
3                   0                         0                         0   

   She's the One (1996);1048  Koyaanisqatsi (1983);1065  \
0                          0                          0   
1                          0                          0   
2                          0                          0   
3                          0                          0   

   Shallow Grave (1994);1073  Reality Bites (1994);1074  \
0                          0                          0   
1                          0                          0   
2                          0                          0   
3                          0                          0   

   Six Degrees of Separation (1993);1101  Some Kind of Wonderful (1987);1119  
0                                      0                                   0  
1                                      0                                   0  
2                                      0                                   0  
3                                      0                                   0  

[4 rows x 604 columns]
check::: 603 -- 603
188

In [21]:
def SE(u_preds,u_vals):
    nratings = len(u_vals)
    se = 0.
    cnt = 0
    for i in xrange(nratings):
        if u_vals[i]>0:
           se +=  (u_vals[i]-u_preds[i])*(u_vals[i]-u_preds[i])
           cnt += 1
    return se,cnt

In [22]:
nmovies = len(df_vals[0].values[:,1:][0])
vals_vecs_folds = []
tests_vecs_folds = []
for i in xrange(nfolds):
    u_vecs = df_vals[i].values[:,1:]
    vtests = np.empty((0,nmovies),float)
    vvals = np.empty((0,nmovies),float)
    for u_vec in u_vecs:
        u_test,u_vals = HideRandomRatings(u_vec)
        vvals = np.vstack([vvals,u_vals])
        vtests = np.vstack([vtests,u_test])
    vals_vecs_folds.append(vvals)
    tests_vecs_folds.append(vtests)

In [40]:
err_itembased = 0.
cnt_itembased = 0
err_userbased = 0.
cnt_userbased = 0
err_slopeone = 0.
cnt_slopeone = 0
err_cbfcf = 0.
cnt_cbfcf = 0
for i in xrange(nfolds):
    Umatrix = df_trains[i].values[:,1:]
    cfitembased = CF_itembased(Umatrix)
    cfslopeone = SlopeOne(Umatrix)
    cbfcf = Hybrid_cbf_cf(movies,movieslist,Umatrix)
    print 'fold:',i+1
    vec_vals = vals_vecs_folds[i]
    vec_tests = tests_vecs_folds[i]
    for j in xrange(len(vec_vals)):
        u_vals = vec_vals[j]
        u_test = vec_tests[j]
        #cbfcf
        u_preds = cbfcf.CalcRatings(u_test,5)
        e,c =  SE(u_preds,u_vals)
        err_cbfcf +=e
        cnt_cbfcf +=c
        #cf_userbased
        u_preds = CF_userbased(u_test,5,Umatrix)
        e,c =  SE(u_preds,u_vals)
        err_userbased +=e
        cnt_userbased +=c
        #cf_itembased
        u_preds = cfitembased.CalcRatings(u_test,5)
        e,c =  SE(u_preds,u_vals)
        err_itembased +=e
        cnt_itembased +=c
        #slope one
        u_preds = cfslopeone.CalcRatings(u_test,5)
        e,c =  SE(u_preds,u_vals)
        err_slopeone +=e
        cnt_slopeone +=c
rmse_userbased = np.sqrt(err_userbased/float(cnt_userbased))
rmse_itembased = np.sqrt(err_itembased/float(cnt_itembased))
rmse_slopeone = np.sqrt(err_slopeone/float(cnt_slopeone))
print 'user_userbased rmse:',rmse_userbased,'--',cnt_userbased
print 'user_itembased rmse:',rmse_itembased,'--',cnt_itembased
print 'slope one rmse:',rmse_slopeone,'--',cnt_slopeone

rmse_cbfcf = np.sqrt(err_cbfcf/float(cnt_cbfcf))
print 'cbfcf rmse:',rmse_cbfcf,'---',cnt_cbfcf


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
user_userbased rmse: 1.01381431911 -- 39972
user_itembased rmse: 1.0301785707 -- 39972
slope one rmse: 1.07792084094 -- 39972
cbfcf rmse: 1.0134317593 --- 39972

In [63]:
err_svd = 0.          
cnt_svd = 0
err_svd_em = 0.
cnt_svd_em = 0
err_als = 0.
cnt_als = 0
err_cbfreg = 0.
cnt_cbfreg = 0
for i in xrange(nfolds):
    Umatrix = df_trains[i].values[:,1:]
    print 'fold:',i+1
    teststartindx = len(Umatrix)
    vals_vecs = vals_vecs_folds[i]
    tests_vecs = tests_vecs_folds[i]
    for k in xrange(len(vals_vecs)):
        u_vals = vals_vecs[k]
        u_test = tests_vecs[k]
        #add test vector to utility matrix
        Umatrix = np.vstack([Umatrix,u_test])
    
    #svd_em_matrix = Hybrid_svd(movies,movieslist,Umatrix,20,'useraverage').matrix#SVD_EM(Umatrix,20,'useraverage',1)
    svd_matrix = SVD(Umatrix,20,'itemaverage')
    cbf_reg = CBF_regression(movies,Umatrix)
    #als_umatrix = SGD(Umatrix,20,50)#ALS(Umatrix,20,50)#NMF_alg(Umatrix,20,'itemaverage',0.001)
    #evaluate errors
    for indx in xrange(len(vals_vecs)):
        #e,c =  SE(als_umatrix[teststartindx+indx],vals_vecs[indx])
        #err_als += e
        #cnt_als += c
        u_preds = cbf_reg.CalcRatings(Umatrix[teststartindx+indx])
        e,c = SE(u_preds,vals_vecs[indx])
        err_cbfreg +=e
        cnt_cbfreg +=c

        e,c = SE(svd_matrix[teststartindx+indx],vals_vecs[indx])
        err_svd +=e
        cnt_svd +=c
        #e,c = SE(svd_em_matrix[teststartindx+indx],vals_vecs[indx])
        #err_svd_em +=e
        #cnt_svd_em +=c

if cnt_svd==0: cnt_svd=1
if cnt_svd_em==0: cnt_svd_em=1
if cnt_als==0: cnt_als=1
if cnt_cbfreg==0: cnt_cbfreg=1

rmse_als = np.sqrt(err_als/float(cnt_als))
rmse_svd = np.sqrt(err_svd/float(cnt_svd))
rmse_svd_em = np.sqrt(err_svd_em/float(cnt_svd_em))
rmse_cbfreg = np.sqrt(err_cbfreg/float(cnt_cbfreg))

print 'svd rmse:',rmse_svd,'--',cnt_svd
#print 'svd_em rmse:',rmse_svd_em,'--',cnt_svd_em
#print 'als rmse:',rmse_als,'--',cnt_als
print 'cbfreg rmse:',rmse_cbfreg,'--',cnt_cbfreg


fold: 1
it: 0  --  -1
it: 1  --  57260.8505699
it: 2  --  47929.3409804
it: 3  --  44144.4291348
it: 4  --  41968.003765
it: 5  --  40493.252468
it: 6  --  39396.671426
it: 7  --  38532.7511847
it: 8  --  37825.4882603
it: 9  --  37230.7281602
it: 10  --  36720.6754199
it: 11  --  36276.7051366
it: 12  --  35885.7066331
it: 13  --  35538.0814244
it: 14  --  35226.5774773
it: 15  --  34945.5739775
it: 16  --  34690.6223712
it: 17  --  34458.1402765
it: 18  --  34245.2004645
it: 19  --  34049.3811689
it: 20  --  33868.6572543
it: 21  --  33701.3193899
it: 22  --  33545.912899
it: 23  --  33401.190744
it: 24  --  33266.0768611
it: 25  --  33139.6372059
it: 26  --  33021.0566321
it: 27  --  32909.6202422
it: 28  --  32804.6982104
it: 29  --  32705.7333291
it: 30  --  32612.2307158
it: 31  --  32523.7492461
it: 32  --  32439.8943805
it: 33  --  32360.3121225
it: 34  --  32284.6839028
it: 35  --  32212.7222259
it: 36  --  32144.1669492
it: 37  --  32078.7820875
it: 38  --  32016.353059
it: 39  --  31956.6843012
it: 40  --  31899.597201
it: 41  --  31844.9282906
it: 42  --  31792.5276695
it: 43  --  31742.2576218
it: 44  --  31693.9913988
it: 45  --  31647.6121456
it: 46  --  31603.011952
it: 47  --  31560.0910106
it: 48  --  31518.7568685
it: 49  --  31478.9237611
fold: 2
it: 0  --  -1
it: 1  --  57282.6864201
it: 2  --  47889.8611494
it: 3  --  44150.0360989
it: 4  --  42018.8919985
it: 5  --  40579.8246521
it: 6  --  39510.5818892
it: 7  --  38667.8257454
it: 8  --  37977.2516688
it: 9  --  37395.9067422
it: 10  --  36896.8376779
it: 11  --  36462.0162571
it: 12  --  36078.7604279
it: 13  --  35737.7837177
it: 14  --  35432.0640955
it: 15  --  35156.1523906
it: 16  --  34905.7297191
it: 17  --  34677.3128666
it: 18  --  34468.0513707
it: 19  --  34275.5836036
it: 20  --  34097.9320902
it: 21  --  33933.4256921
it: 22  --  33780.6406645
it: 23  --  33638.3552748
it: 24  --  33505.5143587
it: 25  --  33381.2012909
it: 26  --  33264.6155682
it: 27  --  33155.0547051
it: 28  --  33051.8994795
it: 29  --  32954.6018119
it: 30  --  32862.6747353
it: 31  --  32775.6840384
it: 32  --  32693.2412601
it: 33  --  32614.9977842
it: 34  --  32540.639834
it: 35  --  32469.8842093
it: 36  --  32402.4746397
it: 37  --  32338.17865
it: 38  --  32276.7848567
it: 39  --  32218.1006265
it: 40  --  32161.950041
it: 41  --  32108.1721218
it: 42  --  32056.6192782
it: 43  --  32007.1559439
it: 44  --  31959.6573783
it: 45  --  31914.0086079
it: 46  --  31870.1034896
it: 47  --  31827.84388
it: 48  --  31787.1388969
it: 49  --  31747.9042601
fold: 3
it: 0  --  -1
it: 1  --  56048.270442
it: 2  --  46841.493063
it: 3  --  43148.4056038
it: 4  --  41038.0526681
it: 5  --  39611.7549949
it: 6  --  38550.1931317
it: 7  --  37711.4465659
it: 8  --  37022.3751043
it: 9  --  36440.9226905
it: 10  --  35940.7663949
it: 11  --  35504.295288
it: 12  --  35119.0926984
it: 13  --  34776.0356609
it: 14  --  34468.1997603
it: 15  --  34190.1918656
it: 16  --  33937.723437
it: 17  --  33707.3260637
it: 18  --  33496.1549428
it: 19  --  33301.8489352
it: 20  --  33122.4283139
it: 21  --  32956.2184063
it: 22  --  32801.7915
it: 23  --  32657.9219343
it: 24  --  32523.5509001
it: 25  --  32397.7585157
it: 26  --  32279.7414374
it: 27  --  32168.7947372
it: 28  --  32064.2971092
it: 29  --  31965.698701
it: 30  --  31872.5110337
it: 31  --  31784.298598
it: 32  --  31700.6718068
it: 33  --  31621.2810531
it: 34  --  31545.8116739
it: 35  --  31473.9796612
it: 36  --  31405.5279941
it: 37  --  31340.223488
it: 38  --  31277.8540776
it: 39  --  31218.2264652
it: 40  --  31161.1640778
it: 41  --  31106.505287
it: 42  --  31054.1018518
it: 43  --  31003.8175539
it: 44  --  30955.5269965
it: 45  --  30909.1145457
it: 46  --  30864.4733934
it: 47  --  30821.5047271
it: 48  --  30780.1169919
it: 49  --  30740.2252331
fold: 4
it: 0  --  -1
it: 1  --  56526.7017686
it: 2  --  47025.7227619
it: 3  --  43244.0236798
it: 4  --  41093.9118124
it: 5  --  39644.0424226
it: 6  --  38566.6610984
it: 7  --  37716.6142915
it: 8  --  37019.1170738
it: 9  --  36431.1494593
it: 10  --  35925.7861318
it: 11  --  35485.0282649
it: 12  --  35096.2054816
it: 13  --  34750.027072
it: 14  --  34439.4564381
it: 15  --  34159.0242593
it: 16  --  33904.389118
it: 17  --  33672.0448235
it: 18  --  33459.1185905
it: 19  --  33263.2276932
it: 20  --  33082.3750441
it: 21  --  32914.8714583
it: 22  --  32759.276692
it: 23  --  32614.3539866
it: 24  --  32479.0345273
it: 25  --  32352.3893022
it: 26  --  32233.6065752
it: 27  --  32121.973673
it: 28  --  32016.8621296
it: 29  --  31917.7154721
it: 30  --  31824.0391041
it: 31  --  31735.3918712
it: 32  --  31651.378986
it: 33  --  31571.6460592
it: 34  --  31495.8740381
it: 35  --  31423.7748931
it: 36  --  31355.0879259
it: 37  --  31289.5765951
it: 38  --  31227.0257769
it: 39  --  31167.2393916
it: 40  --  31110.0383399
it: 41  --  31055.258703
it: 42  --  31002.7501672
it: 43  --  30952.3746408
it: 44  --  30904.0050374
it: 45  --  30857.5242009
it: 46  --  30812.8239546
it: 47  --  30769.8042576
it: 48  --  30728.372454
it: 49  --  30688.4426048
fold: 5
it: 0  --  -1
it: 1  --  56944.3511544
it: 2  --  47571.7089252
it: 3  --  43802.2338486
it: 4  --  41648.6918128
it: 5  --  40196.538329
it: 6  --  39118.9054479
it: 7  --  38269.9659178
it: 8  --  37574.4232644
it: 9  --  36988.9511654
it: 10  --  36486.4361712
it: 11  --  36048.7557142
it: 12  --  35663.1533814
it: 13  --  35320.2767513
it: 14  --  35013.044425
it: 15  --  34735.9545419
it: 16  --  34484.6421895
it: 17  --  34255.5843788
it: 18  --  34045.8964711
it: 19  --  33853.1875216
it: 20  --  33675.4548817
it: 21  --  33511.0057422
it: 22  --  33358.3976416
it: 23  --  33216.392625
it: 24  --  33083.9214203
it: 25  --  32960.0550951
it: 26  --  32843.9823812
it: 27  --  32734.9913554
it: 28  --  32632.4545057
it: 29  --  32535.8164602
it: 30  --  32444.5838289
it: 31  --  32358.3167381
it: 32  --  32276.6217317
it: 33  --  32199.1457852
it: 34  --  32125.5712305
it: 35  --  32055.6114322
it: 36  --  31989.0070874
it: 37  --  31925.5230453
it: 38  --  31864.9455625
it: 39  --  31807.0799256
it: 40  --  31751.7483846
it: 41  --  31698.78835
it: 42  --  31648.0508157
it: 43  --  31599.3989743
it: 44  --  31552.7069989
it: 45  --  31507.8589675
it: 46  --  31464.747911
it: 47  --  31423.2749697
it: 48  --  31383.3486418
it: 49  --  31344.884115
svd_em rmse: 0.0 -- 1
als rmse: 0.0 -- 1

In [ ]:
#user_userbased rmse: 1.01381431911 -- 39972
#user_itembased rmse: 1.0301785707 -- 39972
#slope one rmse: 1.07792084094 -- 39972
#cbfcf rmse: 1.0134317593 --- 39972
#svd rmse: 1.0145666769 -- 39972
#cbfreg rmse: 1.09495415915 -- 39972
#NMF_alg rmse: 0.972259334147 -- 39972
#SVD EM rmse: 1.03845070461 -- 39972
#HYBRID SVD rmse: 1.01385133337 -- 39972
#ALS rmse: 2.58784908254 -- 39972
#SGD rmse: 1.35396020834 -- 39972

In [33]:
def ClassificationMetrics(vec_vals,vec_recs,likethreshold=3,shortlist=50,ratingsval=False,vec_test=None):
    #convert vals in indxs vec
    indxs_like = [i for i in xrange(len(vec_vals)) if vec_vals[i]>likethreshold]
    indxs_dislike = [i for i in xrange(len(vec_vals)) if vec_vals[i]<=likethreshold and vec_vals[i]>0]
    cnt = len(indxs_like)+len(indxs_dislike)
    indxs_rec = []
    if ratingsval:
        #convert ratings into items's list
        if vec_test==None:
            raise 'Error no test vector'
        indxs_rec = [i for i in xrange(len(vec_recs)) if vec_recs[i]>likethreshold and vec_test[i]<1][:shortlist]
    else:
        #consider only the first slot of recs
        indxs_rec = vec_recs[:shortlist]

    tp = len(set(indxs_rec).intersection(set(indxs_like)))
    fp = len(set(indxs_rec).intersection(set(indxs_dislike)))
    fn = len(set(indxs_like)^(set(indxs_rec).intersection(set(indxs_like))))
    precision = 0.
    if tp+fp>0:
        precision = float(tp)/(tp+fp)
    recall = 0.
    if tp+fn>0:
        recall = float(tp)/(tp+fn)
    f1 = 0.
    if recall+precision >0:
        f1 = 2.*precision*recall/(precision+recall)
    
    return np.array([precision,recall,f1]),cnt

In [61]:
tot_measures = np.zeros(3)    
cnt_vals = 0.
#CF memory based
for i in xrange(nfolds):
    Umatrix = df_trains[i].values[:,1:]
    #cfitembased = CF_itembased(Umatrix)
    #cfslopeone = SlopeOne(Umatrix)
    #cbfcf = Hybrid_cbf_cf(movies,movieslist,Umatrix)
    print 'fold:',i+1
    tot_measures_fold = np.zeros(3)
    vals_vecs = vals_vecs_folds[i]
    tests_vecs = tests_vecs_folds[i]
    for j in xrange(len(vals_vecs)):
        u_vals = vals_vecs[j]
        u_test = tests_vecs[j]
        u_preds = CF_userbased(u_test,20,Umatrix)#cfslopeone.CalcRatings(u_test,5)#cfitembased.CalcRatings(u_test,5)#cbfcf.CalcRatings(u_test,20)
        tmp_measures,cnt_tmp = ClassificationMetrics(u_vals,u_preds,3,50,True,u_test)
        tot_measures_fold +=  tmp_measures
        cnt_vals += cnt_tmp
    tot_measures_fold /= float(len(vals_vecs))
    print tot_measures_fold
    tot_measures += tot_measures_fold
tot_measures /= float(nfolds)
    
print 'precision:',tot_measures[0],' recall:',tot_measures[1],' f1:',tot_measures[2],'---',cnt_vals


fold: 1
[ 0.56651331  0.16081154  0.23628727]
fold: 2
[ 0.59850305  0.18860111  0.27172641]
fold: 3
[ 0.60989212  0.18333456  0.26413943]
fold: 4
[ 0.60932924  0.19346904  0.27782071]
fold: 5
[ 0.59372861  0.17254562  0.25126217]
precision: 0.595593265581  recall: 0.179752374596  f1: 0.260247197345 --- 39786.0
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/IPython/kernel/__main__.py:10: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

In [62]:
#CF_userbased precision: 0.595593265581  recall: 0.179752374596  f1: 0.260247197345 --- 39786.0
#CF_itembased precision: 0.573049057653  recall: 0.150154902908  f1: 0.224407731332 --- 39786.0
#SlopeOne precision: 0.572945843878  recall: 0.166998383035  f1: 0.24433916059 --- 39786.0
#Hybrid_cbf_cf precision: 0.600636639987  recall: 0.183293616752  f1: 0.26385405692 --- 39786.0

In [55]:
#CF model based
cnt_vals=0.
tot_measures = np.zeros(3)
for i in xrange(nfolds):
    Umatrix = df_trains[i].values[:,1:]
    print 'fold:',i+1
    teststartindx = len(Umatrix)
    
    vals_vecs = vals_vecs_folds[i]
    tests_vecs = tests_vecs_folds[i]
    for k in xrange(len(vals_vecs)):
        u_vals = vals_vecs[k]
        u_test = tests_vecs[k]
        #add test vector to utility matrix
        Umatrix = np.vstack([Umatrix,u_test])
    
    #svd_matrix = SVD_EM(Umatrix,20,'useraverage',30)#SVD(Umatrix,20,'itemaverage') #Hybrid_svd(movies,movieslist,Umatrix,20,'useraverage').matrix#SGD(Umatrix,20,50)#ALS(Umatrix,20,50) 
    #matrix=NMF_alg(Umatrix,20,'useraverage')
    #cbf_reg = CBF_regression(movies,Umatrix)
    #cbf_av = CBF_averageprofile(movies,movieslist)
    #llr = LogLikelihood(Umatrix,movieslist)
    assrules = AssociationRules(Umatrix,movieslist)
    
    tot_measures_fold = np.zeros(3)
    for indx in xrange(len(vals_vecs)):
        #u_preds = cbf_reg.CalcRatings(Umatrix[teststartindx+indx])#cbf_av.GetRecMovies(Umatrix[teststartindx+indx],True)
        #u_preds = svd_matrix[teststartindx+indx]#matrix[teststartindx+indx] 
        u_preds = assrules.GetRecItems(Umatrix[teststartindx+indx],True)#llr.GetRecItems(Umatrix[teststartindx+indx],True)
        tmp_measures,cnt_tmp = ClassificationMetrics(vals_vecs[indx],u_preds,3,50,False,Umatrix[teststartindx+indx])
        tot_measures_fold +=  tmp_measures
        cnt_vals += cnt_tmp
    tot_measures_fold = tot_measures_fold/float(len(vals_vecs))
    print tot_measures_fold
    tot_measures += tot_measures_fold
tot_measures = tot_measures/float(nfolds)
print 'precision:',tot_measures[0],' recall:',tot_measures[1],' f1:',tot_measures[2],'---',cnt_vals


fold: 1
[ 0.64264754  0.29272826  0.37885232]
fold: 2
[ 0.64008348  0.32754908  0.40919527]
fold: 3
[ 0.69438685  0.30352264  0.4023821 ]
fold: 4
[ 0.71752179  0.30733163  0.40810444]
fold: 5
[ 0.70219096  0.3342881   0.42251521]
precision: 0.679366124465  recall: 0.313083943066  f1: 0.404209869025 --- 39786.0

In [60]:
#llr precision: 0.632059422601  recall: 0.306911656684  f1: 0.389728618382 --- 39786.0
#Hybrid_svd precision: 0.540616355878  recall: 0.122568676777  f1: 0.188867837509 --- 39786.0
#als precision: 0.574768962349  recall: 0.154765744996  f1: 0.232415857722 --- 39786.0
#sgd precision: 0.522492554867  recall: 0.116681592379  f1: 0.182113478188 --- 39786.0
#SVD precision: 0.531278228807  recall: 0.119701346615  f1: 0.184269894611 --- 39786.0
#SVD-EM precision: 0.576567716327  recall: 0.159558142114  f1: 0.236321594653 --- 39786.0
#NMF_alg precision: 0.532487775416  recall: 0.125034210484  f1: 0.191971985488 --- 39786.0
#CBF_regression precision: 0.536374177877  recall: 0.128159010191  f1: 0.196055670058 --- 39786.0
#CBF_averageprofile precision: 0.561491582647  recall: 0.118988755524  f1: 0.185138199893 --- 39786.0
#AssociationRules precision: 0.679366124465  recall: 0.313083943066  f1: 0.404209869025 --- 39786.0

In [ ]: