In [1]:
import numpy as np
import pandas as pd
import copy
import collections
from scipy import linalg
import math
from collections import defaultdict
In [65]:
#data
df = pd.read_csv('./data/ml-100k/u.data',sep='\t',header=None)
#movie list
df_info = pd.read_csv('./data/ml-100k/u.item',sep='|',header=None)
movielist = [df_info[1].tolist()[indx]+';'+str(indx+1) for indx in xrange(len(df_info[1].tolist()))]
nmovies = len(movielist)
nusers = len(df[0].drop_duplicates().tolist())
min_ratings = 50
movies_rated = list(df[1])
counts = collections.Counter(movies_rated)
dfout = pd.DataFrame(columns=['user']+movielist)
toremovelist = []
for i in range(1,nusers):
tmpmovielist = [0 for j in range(nmovies)]
dftmp =df[df[0]==i]
for k in dftmp.index:
if counts[dftmp.ix[k][1]]>= min_ratings:
tmpmovielist[dftmp.ix[k][1]-1] = dftmp.ix[k][2]
else:
toremovelist.append(dftmp.ix[k][1])
dfout.loc[i] = [i]+tmpmovielist
toremovelist = list(set(toremovelist))
dfout.drop(dfout.columns[toremovelist], axis=1, inplace=True)
dfout.to_csv('data/utilitymatrix.csv',index=None)
In [38]:
df = pd.read_csv('data/utilitymatrix.csv')
df.head(2)
Out[38]:
In [2]:
def imputation(inp,Ri):
Ri = Ri.astype(float)
def userav():
for i in xrange(len(Ri)):
Ri[i][Ri[i]==0] = sum(Ri[i])/float(len(Ri[i][Ri[i]>0]))
return Ri
def itemav():
for i in xrange(len(Ri[0])):
Ri[:,i][Ri[:,i]==0] = sum(Ri[:,i])/float(len(Ri[:,i][Ri[:,i]>0]))
return Ri
switch = {'useraverage':userav(),'itemaverage':itemav()}
return switch[inp]
In [3]:
from scipy.stats import pearsonr
from scipy.spatial.distance import cosine
def sim(x,y,metric='cos'):
if metric == 'cos':
return 1.-cosine(x,y)
else:#correlation
return pearsonr(x,y)[0]
In [4]:
def CF_userbased(u_vec,K,data,indxs=False):
def FindKNeighbours(r,data,K):
neighs = []
cnt=0
for u in xrange(len(data)):
if data[u,r]>0 and cnt<K:
neighs.append(data[u])
cnt +=1
elif cnt==K:
break
return np.array(neighs)
def CalcRating(u_vec,r,neighs):
rating = 0.
den = 0.
for j in xrange(len(neighs)):
rating += neighs[j][-1]*float(neighs[j][r]-neighs[j][neighs[j]>0][:-1].mean())
den += abs(neighs[j][-1])
if den>0:
rating = np.round(u_vec[u_vec>0].mean()+(rating/den),0)
else:
rating = np.round(u_vec[u_vec>0].mean(),0)
if rating>5:
return 5.
elif rating<1:
return 1.
return rating
#add similarity col
data = data.astype(float)
nrows = len(data)
ncols = len(data[0])
data_sim = np.zeros((nrows,ncols+1))
data_sim[:,:-1] = data
#calc similarities:
for u in xrange(nrows):
if np.array_equal(data_sim[u,:-1],u_vec)==False: #list(data_sim[u,:-1]) != list(u_vec):
data_sim[u,ncols] = sim(data_sim[u,:-1],u_vec,'pearson')
else:
data_sim[u,ncols] = 0.
#order by similarity:
data_sim =data_sim[data_sim[:,ncols].argsort()][::-1]
#find the K users for each item not rated:
u_rec = np.zeros(len(u_vec))
for r in xrange(ncols):
if u_vec[r]==0:
neighs = FindKNeighbours(r,data_sim,K)
#calc the predicted rating
u_rec[r] = CalcRating(u_vec,r,neighs)
if indxs:
#take out the rated movies
seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
u_rec[seenindxs] = -1
recsvec = np.argsort(u_rec)[::-1][np.argsort(u_rec)>0]
return recsvec
return u_rec
In [5]:
class CF_itembased(object):
def __init__(self,data):
#calc item similarities matrix
nitems = len(data[0])
self.data = data
self.simmatrix = np.zeros((nitems,nitems))
for i in xrange(nitems):
for j in xrange(nitems):
if j>=i:#triangular matrix
self.simmatrix[i,j] = sim(data[:,i],data[:,j])
else:
self.simmatrix[i,j] = self.simmatrix[j,i]
def GetKSimItemsperUser(self,r,K,u_vec):
items = np.argsort(self.simmatrix[r])[::-1]
items = items[items!=r]
cnt=0
neighitems = []
for i in items:
if u_vec[i]>0 and cnt<K:
neighitems.append(i)
cnt+=1
elif cnt==K:
break
return neighitems
def CalcRating(self,r,u_vec,neighitems):
rating = 0.
den = 0.
for i in neighitems:
rating += self.simmatrix[r,i]*u_vec[i]
den += abs(self.simmatrix[r,i])
if den>0:
rating = np.round(rating/den,0)
else:
rating = np.round(self.data[:,r][self.data[:,r]>0].mean(),0)
return rating
def CalcRatings(self,u_vec,K,indxs=False):
#u_rec = copy.copy(u_vec)
u_rec = np.zeros(len(u_vec))
for r in xrange(len(u_vec)):
if u_vec[r]==0:
neighitems = self.GetKSimItemsperUser(r,K,u_vec)
#calc predicted rating
u_rec[r] = self.CalcRating(r,u_vec,neighitems)
if indxs:
#take out the rated movies
seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
u_rec[seenindxs]=-1
recsvec = np.argsort(u_rec)[::-1][np.argsort(u_rec)>0]
return recsvec
return u_rec
In [6]:
class SlopeOne(object):
def __init__(self,Umatrix):
#calc item similarities matrix
nitems = len(Umatrix[0])
self.difmatrix = np.zeros((nitems,nitems))
self.nratings = np.zeros((nitems,nitems))
def diffav_n(x,y):
xy = np.vstack((x, y)).T
xy = xy[(xy[:,0]>0) & (xy[:,1]>0)]
nxy = len(xy)
if nxy == 0:
#print 'no common'
return [1000.,0]
return [float(sum(xy[:,0])-sum(xy[:,1]))/nxy,nxy]
for i in xrange(nitems):
for j in xrange(nitems):
if j>=i:#triangular matrix
self.difmatrix[i,j],self.nratings[i,j] = diffav_n(Umatrix[:,i],Umatrix[:,j])
else:
self.difmatrix[i,j] = -self.difmatrix[j,i]
self.nratings[i,j] = self.nratings[j,i]
def GetKSimItemsperUser(self,r,K,u_vec):
items = np.argsort(self.difmatrix[r])
items = items[items!=r]
cnt=0
neighitems = []
for i in items:
if u_vec[i]>0 and cnt<K:
neighitems.append(i)
cnt+=1
elif cnt==K:
break
return neighitems
def CalcRating(self,r,u_vec,neighitems):
rating = 0.
den = 0.
for i in neighitems:
if abs(self.difmatrix[r,i])!=1000:
rating += (self.difmatrix[r,i]+u_vec[i])*self.nratings[r,i]
den += self.nratings[r,i]
if den==0:
#print 'no similar diff'
return 0.
rating = np.round(rating/den,0)
if rating >5:
return 5.
elif rating <1.:
return 1.
return rating
def CalcRatings(self,u_vec,K):
#u_rec = copy.copy(u_vec)
u_rec = np.zeros(len(u_vec))
for r in xrange(len(u_vec)):
if u_vec[r]==0:
neighitems = self.GetKSimItemsperUser(r,K,u_vec)
#calc predicted rating
u_rec[r] = self.CalcRating(r,u_vec,neighitems)
return u_rec
In [7]:
def SGD(Umatrix, K, iterations=100, alpha=0.00001, l=0.001, tol=0.001):
nrows = len(Umatrix)
ncols = len(Umatrix[0])
P = np.random.rand(nrows,K)
Q = np.random.rand(ncols,K)
Qt = Q.T
cost=-1
for it in xrange(iterations):
for i in xrange(nrows):
for j in xrange(ncols):
if Umatrix[i][j] > 0:
eij = Umatrix[i][j] -np.dot(P[i,:],Qt[:,j])
for k in xrange(K):
P[i][k] += alpha*(2*eij*Qt[k][j]-l*P[i][k])
Qt[k][j] += alpha*(2*eij*P[i][k]-l*Qt[k][j])
cost = 0
for i in xrange(nrows):
for j in xrange(ncols):
if Umatrix[i][j]>0:
cost += pow(Umatrix[i][j]-np.dot(P[i,:],Qt[:,j]),2)
for k in xrange(K):
cost += float(l/2.0)*(pow(P[i][k],2)+pow(Qt[k][j],2))
if cost < tol:
break
return np.round(np.dot(P,Qt),0)
In [8]:
def ALS(Umatrix, K, iterations=50, l=0.001, tol=0.001):
nrows = len(Umatrix)
ncols = len(Umatrix[0])
P = np.random.rand(nrows,K)
Q = np.random.rand(ncols,K)
Qt = Q.T
err = 0.
Umatrix = Umatrix.astype(float)
mask = Umatrix>0.
mask[mask==True]=1
mask[mask==False]=0
mask = mask.astype(np.float64, copy=False)
for it in xrange(iterations):
for u, mask_u in enumerate(mask):
P[u] = np.linalg.solve(np.dot(Qt, np.dot(np.diag(mask_u), Qt.T)) + l*np.eye(K),
np.dot(Qt, np.dot(np.diag(mask_u), Umatrix[u].T))).T
for i, mask_i in enumerate(mask.T):
Qt[:,i] = np.linalg.solve(np.dot(P.T, np.dot(np.diag(mask_i), P)) + l*np.eye(K),
np.dot(P.T, np.dot(np.diag(mask_i), Umatrix[:,i])))
err=np.sum((mask*(Umatrix - np.dot(P, Qt)))**2)
if err < tol:
break
return np.round(np.dot(P,Qt),0)
In [9]:
from sklearn.decomposition import NMF
def NMF_alg(Umatrix,K,inp='none',l=0.001):
R_tmp = copy.copy(Umatrix)
R_tmp = R_tmp.astype(float)
#imputation
if inp != 'none':
R_tmp = imputation(inp,Umatrix)
nmf = NMF(n_components=K,alpha=l)
P = nmf.fit_transform(R_tmp)
R_tmp = np.dot(P,nmf.components_)
return R_tmp
In [10]:
from sklearn.decomposition import TruncatedSVD
def SVD(Umatrix,K,inp='none'):
R_tmp = copy.copy(Umatrix)
R_tmp = R_tmp.astype(float)
#imputation
if inp != 'none':
R_tmp = imputation(inp,Umatrix)
means = np.array([ R_tmp[i][R_tmp[i]>0].mean() for i in xrange(len(R_tmp))]).reshape(-1,1)
R_tmp = R_tmp-means
svd = TruncatedSVD(n_components=K, random_state=4)
R_k = svd.fit_transform(R_tmp)
R_tmp = svd.inverse_transform(R_k)
R_tmp = means+R_tmp
return np.round(R_tmp,0)
In [11]:
def SVD_EM(Umatrix,K,inp='none',iterations=50,tol=0.001):
R_tmp = copy.copy(Umatrix)
R_tmp = R_tmp.astype(float)
nrows = len(Umatrix)
ncols = len(Umatrix[0])
#imputation
if inp != 'none':
R_tmp = imputation(inp,Umatrix)
#define svd
svd = TruncatedSVD(n_components=K, random_state=4)
err = -1
for it in xrange(iterations):
#m-step
R_k = svd.fit_transform(R_tmp)
R_tmp = svd.inverse_transform(R_k)
#e-step and error evaluation
err = 0
for i in xrange(nrows):
for j in xrange(ncols):
if Umatrix[i][j]>0:
err += pow(Umatrix[i][j]-R_tmp[i][j],2)
R_tmp[i][j] = Umatrix[i][j]
if err < tol:
print it,'toll reached!'
break
return np.round(R_tmp,0)
In [66]:
#matrix movies's content
movieslist = [int(m.split(';')[-1]) for m in dfout.columns[1:]]
moviescats = ['unknown','Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary',
'Drama','Fantasy','Film-Noir','Horror','Musical','Mystery',
'Romance','Sci-Fi','Thriller','War','Western']
dfout_movies = pd.DataFrame(columns=['movie_id']+moviescats)
startcatsindx = 5
cnt= 0
for m in movieslist:
dfout_movies.loc[cnt] = [m]+df_info.iloc[m-1][startcatsindx:].tolist()
cnt +=1
print dfout_movies.head()
dfout_movies.to_csv('data/movies_content.csv',index=None)
In [12]:
class CBF_averageprofile(object):
def __init__(self,Movies,Movieslist):
#calc user profiles:
self.nfeatures = len(Movies[0])
self.Movieslist = Movieslist
self.Movies = Movies
def GetRecMovies(self,u_vec,indxs=False):
#generate user profile
nmovies = len(u_vec)
nfeatures = self.nfeatures
mean_u = u_vec[u_vec>0].mean()
diff_u = u_vec-mean_u
features_u = np.zeros(nfeatures).astype(float)
cnts = np.zeros(nfeatures)
for m in xrange(nmovies):
if u_vec[m]>0:#u has rated m
features_u += self.Movies[m]*(diff_u[m])
cnts += self.Movies[m]
#average:
for m in xrange(nfeatures):
if cnts[m]>0:
features_u[m] = features_u[m]/float(cnts[m])
#calc sim:
sims = np.zeros(nmovies)
for m in xrange(nmovies):
if u_vec[m]==0:#sim only for movies not yet rated by the user
sims[m] = sim(features_u,self.Movies[m])
#order movies
order_movies_indxs = np.argsort(sims)[::-1]
if indxs:
return order_movies_indxs
return self.Movieslist[order_movies_indxs]
In [13]:
class CBF_regression(object):
def __init__(self,Movies,Umatrix,alpha=0.01,l=0.0001,its=50,tol=0.001):
#calc parameters:
self.nfeatures = len(Movies[0])+1#intercept
nusers = len(Umatrix)
nmovies = len(Umatrix[0])
#add intercept col
movies_feats = np.ones((nmovies,self.nfeatures))
movies_feats[:,1:] = Movies
self.movies_feats = movies_feats.astype(float)
#set Umatrix as float
self.Umatrix = Umatrix.astype(float)
#initialize the matrix:
Pmatrix = np.random.rand(nusers,self.nfeatures)
Pmatrix[:,0]=1.
err = 0.
cost = -1
for it in xrange(its):
print 'it:',it,' -- ',cost
for u in xrange(nusers):
for f in xrange(self.nfeatures):
if f==0:#no regularization
for m in xrange(nmovies):
if self.Umatrix[u,m]>0:
diff = np.dot(Pmatrix[u],self.movies_feats[m])-self.Umatrix[u,m]
Pmatrix[u,f] += -alpha*(diff*self.movies_feats[m][f])
else:
for m in xrange(nmovies):
if self.Umatrix[u,m]>0:
diff = np.dot(Pmatrix[u],self.movies_feats[m])-self.Umatrix[u,m]
Pmatrix[u,f] += -alpha*(diff*self.movies_feats[m][f] +l*Pmatrix[u][f])
cost = 0
for u in xrange(nusers):
for m in xrange(nmovies):
if self.Umatrix[u][m]>0:
cost += 0.5*pow(Umatrix[u][m]-np.dot(Pmatrix[u],self.movies_feats[m]),2)
for f in xrange(1,self.nfeatures):
cost += float(l/2.0)*(pow(Pmatrix[u][f],2))
if cost < tol:
print 'err',cost
break
self.Pmatrix = Pmatrix
def CalcRatings(self,u_vec):
#find u_vec
s = 0.
u_feats = np.zeros(len(self.Pmatrix[0]))
#in case the user is not present in the utility matrix find the most similar
for u in xrange(len(self.Umatrix)):
#print self.Umatrix[u]
tmps = sim(self.Umatrix[u],u_vec)
if tmps > s:
s = tmps
u_feats = self.Pmatrix[u]
if s == 1.:
break
new_vec = np.zeros(len(u_vec))
for r in xrange(len(u_vec)):
if u_vec[r]==0:
new_vec[r] = np.dot(u_feats,self.movies_feats[r])
return new_vec
In [14]:
class LogLikelihood(object):
def __init__(self,Umatrix,Movieslist,likethreshold=3):
self.Movieslist = Movieslist
#calculate loglikelihood ratio for each pair
self.nusers = len(Umatrix)
self.Umatrix =Umatrix
self.likethreshold = likethreshold
self.likerange = range(self.likethreshold+1,5+1)
self.dislikerange = range(1,self.likethreshold+1)
self.loglikelihood_ratio()
def calc_k(self,a,b):
tmpk = [[0 for j in range(2)] for i in range(2)]
for ratings in self.Umatrix:
if ratings[a] in self.likerange and ratings[b] in self.likerange:
tmpk[0][0] += 1
if ratings[a] in self.likerange and ratings[b] in self.dislikerange:
tmpk[0][1] += 1
if ratings[a] in self.dislikerange and ratings[b] in self.likerange:
tmpk[1][0] += 1
if ratings[a] in self.dislikerange and ratings[b] in self.dislikerange:
tmpk[1][1] += 1
return tmpk
def calc_llr(self,k_matrix):
Hcols=Hrows=Htot=0.0
if sum(k_matrix[0])+sum(k_matrix[1])==0:
return 0.
invN = 1.0/(sum(k_matrix[0])+sum(k_matrix[1]))
for i in range(0,2):
if((k_matrix[0][i]+k_matrix[1][i])!=0.0):
Hcols += invN*(k_matrix[0][i]+k_matrix[1][i])*math.log((k_matrix[0][i]+k_matrix[1][i])*invN )#sum of rows
if((k_matrix[i][0]+k_matrix[i][1])!=0.0):
Hrows += invN*(k_matrix[i][0]+k_matrix[i][1])*math.log((k_matrix[i][0]+k_matrix[i][1])*invN )#sum of cols
for j in range(0,2):
if(k_matrix[i][j]!=0.0):
Htot +=invN*k_matrix[i][j]*math.log(invN*k_matrix[i][j])
return 2.0*(Htot-Hcols-Hrows)/invN
def loglikelihood_ratio(self):
nitems = len(self.Movieslist)
self.items_llr= pd.DataFrame(np.zeros((nitems,nitems))).astype(float)
for i in xrange(nitems):
for j in xrange(nitems):
if(j>=i):
tmpk=self.calc_k(i,j)
self.items_llr.ix[i,j] = self.calc_llr(tmpk)
else:
self.items_llr.ix[i,j] = self.items_llr.iat[j,i]
def GetRecItems(self,u_vec,indxs=False):
items_weight = np.dot(u_vec,self.items_llr)
sortedweight = np.argsort(items_weight)
seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
seenmovies = np.array(self.Movieslist)[seenindxs]
#remove seen items
recitems = np.array(self.Movieslist)[sortedweight]
recitems = [m for m in recitems if m not in seenmovies]
if indxs:
items_weight[seenindxs]=-1
recsvec = np.argsort(items_weight)[::-1][np.argsort(items_weight)>0]
return recsvec
return recitems[::-1]
In [15]:
class AssociationRules(object):
def __init__(self,Umatrix,Movieslist,min_support=0.1,min_confidence=0.1,likethreshold=3):
self.min_support = min_support
self.min_confidence = min_confidence
self.Movieslist = Movieslist
#transform utility matrix to sets of liked items
nitems = len(Umatrix[0])
transactions = []
for u in Umatrix:
s = [i for i in xrange(len(u)) if u[i]>likethreshold]
if len(s)>0:
transactions.append(s)
#find sets of 2 items
flat = [item for sublist in transactions for item in sublist]
inititems = map(frozenset,[ [item] for item in frozenset(flat)])
set_trans = map(set, transactions)
sets_init, self.dict_sets_support = self.filterSet(set_trans, inititems)
setlen = 2
items_tmp = self.combine_lists(sets_init, setlen)
self.freq_sets, sup_tmp = self.filterSet(set_trans, items_tmp)
self.dict_sets_support.update(sup_tmp)
self.ass_matrix = np.zeros((nitems,nitems))
for freqset in self.freq_sets:
#print 'freqset',freqset
list_setitems = [frozenset([item]) for item in freqset]
#print "freqSet", freqset, 'H1', list_setitems
self.calc_confidence_matrix(freqset, list_setitems)
def filterSet(self,set_trans, likeditems):
itemscnt = {}
for id in set_trans:
for item in likeditems:
if item.issubset(id):
itemscnt.setdefault(item, 0)
itemscnt[item] += 1
num_items = float(len(set_trans))
freq_sets = []
dict_sets = {}
for key in itemscnt:
support = itemscnt[key] / num_items
if support >= self.min_support:
freq_sets.insert(0, key)
dict_sets[key] = support
return freq_sets, dict_sets
def combine_lists(self,freq_sets, setlen):
setitems_list = []
nsets = len(freq_sets)
for i in range(nsets):
for j in range(i + 1, nsets):
setlist1 = list(freq_sets[i])[:setlen - 2]
setlist2 = list(freq_sets[j])[:setlen - 2]
if set(setlist1) == set(setlist2):
setitems_list.append(freq_sets[i].union(freq_sets[j]))
return setitems_list
def calc_confidence_matrix(self,freqset, list_setitems):
for target in list_setitems:
confidence = self.dict_sets_support[freqset] / self.dict_sets_support[freqset - target]
if confidence >= self.min_confidence:
self.ass_matrix[list(freqset - target)[0]][list(target)[0]] = confidence
def GetRecItems(self,u_vec,indxs=False):
vec_recs = np.dot(u_vec,self.ass_matrix)
sortedweight = np.argsort(vec_recs)
seenindxs = [indx for indx in xrange(len(u_vec)) if u_vec[indx]>0]
seenmovies = np.array(self.Movieslist)[seenindxs]
#remove seen items
recitems = np.array(self.Movieslist)[sortedweight]
recitems = [m for m in recitems if m not in seenmovies]
if indxs:
vec_recs[seenindxs]=-1
recsvec = np.argsort(vec_recs)[::-1][np.argsort(vec_recs)>0]
return recsvec
return recitems[::-1]
In [16]:
class Hybrid_cbf_cf(object):
def __init__(self,Movies,Movieslist,Umatrix):
#calc user profiles:
self.nfeatures = len(Movies[0])
self.Movieslist = Movieslist
self.Movies = Movies.astype(float)
self.Umatrix_mfeats = np.zeros((len(Umatrix),len(Umatrix[0])+self.nfeatures))
means = np.array([ Umatrix[i][Umatrix[i]>0].mean() for i in xrange(len(Umatrix))]).reshape(-1,1)
diffs = np.array([ [Umatrix[i][j]-means[i] if Umatrix[i][j]>0 else 0.
for j in xrange(len(Umatrix[i])) ] for i in xrange(len(Umatrix))])
self.Umatrix_mfeats[:,:len(Umatrix[0])] = Umatrix#diffs
self.nmovies = len(Movies)
#calc item features for each user
for u in xrange(len(Umatrix)):
u_vec = Umatrix[u]
self.Umatrix_mfeats[u,len(Umatrix[0]):] = self.GetUserItemFeatures(u_vec)
def GetUserItemFeatures(self,u_vec):
mean_u = u_vec[u_vec>0].mean()
#diff_u = u_vec-mean_u
features_u = np.zeros(self.nfeatures).astype(float)
cnts = np.zeros(self.nfeatures)
for m in xrange(self.nmovies):
if u_vec[m]>0:#u has rated m
features_u += self.Movies[m]*u_vec[m]#self.Movies[m]*(diff_u[m])
cnts += self.Movies[m]
#average:
for m in xrange(self.nfeatures):
if cnts[m]>0:
features_u[m] = features_u[m]/float(cnts[m])
return features_u
def CalcRatings(self,u_vec,K):
def FindKNeighbours(r,data,K):
neighs = []
cnt=0
for u in xrange(len(data)):
if data[u,r]>0 and cnt<K:
neighs.append(data[u])
cnt +=1
elif cnt==K:
break
return np.array(neighs)
def CalcRating(u_vec,r,neighs):
rating = 0.
den = 0.
for j in xrange(len(neighs)):
rating += neighs[j][-1]*float(neighs[j][r]-neighs[j][neighs[j]>0][:-1].mean())
den += abs(neighs[j][-1])
if den>0:
rating = np.round(u_vec[u_vec>0].mean()+(rating/den),0)
else:
rating = np.round(u_vec[u_vec>0].mean(),0)
if rating>5:
return 5.
elif rating<1:
return 1.
return rating
#add similarity col
nrows = len(self.Umatrix_mfeats)
ncols = len(self.Umatrix_mfeats[0])
data_sim = np.zeros((nrows,ncols+1))
data_sim[:,:-1] = self.Umatrix_mfeats
u_rec = np.zeros(len(u_vec))
#calc similarities:
mean = u_vec[u_vec>0].mean()
u_vec_feats = u_vec#np.array([u_vec[i]-mean if u_vec[i]>0 else 0 for i in xrange(len(u_vec))])
u_vec_feats = np.append(u_vec_feats,self.GetUserItemFeatures(u_vec))
for u in xrange(nrows):
if np.array_equal(data_sim[u,:-1],u_vec)==False: #list(data_sim[u,:-1]) != list(u_vec):
data_sim[u,ncols] = sim(data_sim[u,:-1],u_vec_feats)
else:
data_sim[u,ncols] = 0.
#order by similarity:
data_sim =data_sim[data_sim[:,ncols].argsort()][::-1]
#find the K users for each item not rated:
for r in xrange(self.nmovies):
if u_vec[r]==0:
neighs = FindKNeighbours(r,data_sim,K)
#calc the predicted rating
u_rec[r] = CalcRating(u_vec,r,neighs)
return u_rec
In [17]:
class Hybrid_svd(object):
def __init__(self,Movies,Movieslist,Umatrix,K,inp):
#calc user profiles:
self.nfeatures = len(Movies[0])
self.Movieslist = Movieslist
self.Movies = Movies.astype(float)
R_tmp = copy.copy(Umatrix)
R_tmp = R_tmp.astype(float)
#imputation
if inp != 'none':
R_tmp = imputation(inp,Umatrix)
Umatrix_mfeats = np.zeros((len(Umatrix),len(Umatrix[0])+self.nfeatures))
means = np.array([ Umatrix[i][Umatrix[i]>0].mean() for i in xrange(len(Umatrix))]).reshape(-1,1)
diffs = np.array([ [float(Umatrix[i][j]-means[i])
if Umatrix[i][j]>0 else float(R_tmp[i][j]-means[i]) for j in xrange(len(Umatrix[i])) ]
for i in xrange(len(Umatrix))])
Umatrix_mfeats[:,:len(Umatrix[0])] = diffs#R_tmp
self.nmovies = len(Movies)
#calc item features for each user
for u in xrange(len(Umatrix)):
u_vec = Umatrix[u]
Umatrix_mfeats[u,len(Umatrix[0]):] = self.GetUserItemFeatures(u_vec)
#calc svd
svd = TruncatedSVD(n_components=K, random_state=4)
R_k = svd.fit_transform(Umatrix_mfeats)
R_tmp = means+svd.inverse_transform(R_k)
self.matrix = np.round(R_tmp[:,:self.nmovies],0)
def GetUserItemFeatures(self,u_vec):
mean_u = u_vec[u_vec>0].mean()
diff_u = u_vec-mean_u
features_u = np.zeros(self.nfeatures).astype(float)
cnts = np.zeros(self.nfeatures)
for m in xrange(self.nmovies):
if u_vec[m]>0:#u has rated m
features_u += self.Movies[m]*(diff_u[m])#self.Movies[m]*u_vec[m]
cnts += self.Movies[m]
#average:
for m in xrange(self.nfeatures):
if cnts[m]>0:
features_u[m] = features_u[m]/float(cnts[m])
return features_u
In [18]:
def cross_validation(df,k):
val_num = int(len(df)/float(k))
print val_num
df_trains = []
df_vals = []
for i in xrange(k):
start_val = (k-i-1)*val_num
end_val = start_val+val_num
df_trains.append(pd.concat([df[:start_val],df[end_val:]]))
df_vals.append(df[start_val:end_val])
return df_trains,df_vals
In [19]:
import random
def HideRandomRatings(u_vec, ratiovals=0.5):
u_test = np.zeros(len(u_vec))
u_vals = np.zeros(len(u_vec))
cnt = 0
nratings = len(u_vec[u_vec>0])
for i in xrange(len(u_vec)):
if u_vec[i]>0:
if bool(random.getrandbits(1)) or cnt>=int(nratings*ratiovals):
u_test[i]=u_vec[i]
else:#random choice to hide the rating:
cnt +=1
u_vals[i]=u_vec[i]
return u_test,u_vals
In [20]:
#load data
df = pd.read_csv('data/utilitymatrix.csv')
print df.head(4)
df_movies = pd.read_csv('data/movies_content.csv')
movies = df_movies.values[:,1:]
print 'check:::',len(df.columns[1:]),'--',len(df_movies)
movieslist = list(df.columns[1:])
#k-fold cv 5 folds
nfolds = 5
df_trains,df_vals = cross_validation(df,nfolds)
In [21]:
def SE(u_preds,u_vals):
nratings = len(u_vals)
se = 0.
cnt = 0
for i in xrange(nratings):
if u_vals[i]>0:
se += (u_vals[i]-u_preds[i])*(u_vals[i]-u_preds[i])
cnt += 1
return se,cnt
In [22]:
nmovies = len(df_vals[0].values[:,1:][0])
vals_vecs_folds = []
tests_vecs_folds = []
for i in xrange(nfolds):
u_vecs = df_vals[i].values[:,1:]
vtests = np.empty((0,nmovies),float)
vvals = np.empty((0,nmovies),float)
for u_vec in u_vecs:
u_test,u_vals = HideRandomRatings(u_vec)
vvals = np.vstack([vvals,u_vals])
vtests = np.vstack([vtests,u_test])
vals_vecs_folds.append(vvals)
tests_vecs_folds.append(vtests)
In [40]:
err_itembased = 0.
cnt_itembased = 0
err_userbased = 0.
cnt_userbased = 0
err_slopeone = 0.
cnt_slopeone = 0
err_cbfcf = 0.
cnt_cbfcf = 0
for i in xrange(nfolds):
Umatrix = df_trains[i].values[:,1:]
cfitembased = CF_itembased(Umatrix)
cfslopeone = SlopeOne(Umatrix)
cbfcf = Hybrid_cbf_cf(movies,movieslist,Umatrix)
print 'fold:',i+1
vec_vals = vals_vecs_folds[i]
vec_tests = tests_vecs_folds[i]
for j in xrange(len(vec_vals)):
u_vals = vec_vals[j]
u_test = vec_tests[j]
#cbfcf
u_preds = cbfcf.CalcRatings(u_test,5)
e,c = SE(u_preds,u_vals)
err_cbfcf +=e
cnt_cbfcf +=c
#cf_userbased
u_preds = CF_userbased(u_test,5,Umatrix)
e,c = SE(u_preds,u_vals)
err_userbased +=e
cnt_userbased +=c
#cf_itembased
u_preds = cfitembased.CalcRatings(u_test,5)
e,c = SE(u_preds,u_vals)
err_itembased +=e
cnt_itembased +=c
#slope one
u_preds = cfslopeone.CalcRatings(u_test,5)
e,c = SE(u_preds,u_vals)
err_slopeone +=e
cnt_slopeone +=c
rmse_userbased = np.sqrt(err_userbased/float(cnt_userbased))
rmse_itembased = np.sqrt(err_itembased/float(cnt_itembased))
rmse_slopeone = np.sqrt(err_slopeone/float(cnt_slopeone))
print 'user_userbased rmse:',rmse_userbased,'--',cnt_userbased
print 'user_itembased rmse:',rmse_itembased,'--',cnt_itembased
print 'slope one rmse:',rmse_slopeone,'--',cnt_slopeone
rmse_cbfcf = np.sqrt(err_cbfcf/float(cnt_cbfcf))
print 'cbfcf rmse:',rmse_cbfcf,'---',cnt_cbfcf
In [63]:
err_svd = 0.
cnt_svd = 0
err_svd_em = 0.
cnt_svd_em = 0
err_als = 0.
cnt_als = 0
err_cbfreg = 0.
cnt_cbfreg = 0
for i in xrange(nfolds):
Umatrix = df_trains[i].values[:,1:]
print 'fold:',i+1
teststartindx = len(Umatrix)
vals_vecs = vals_vecs_folds[i]
tests_vecs = tests_vecs_folds[i]
for k in xrange(len(vals_vecs)):
u_vals = vals_vecs[k]
u_test = tests_vecs[k]
#add test vector to utility matrix
Umatrix = np.vstack([Umatrix,u_test])
#svd_em_matrix = Hybrid_svd(movies,movieslist,Umatrix,20,'useraverage').matrix#SVD_EM(Umatrix,20,'useraverage',1)
svd_matrix = SVD(Umatrix,20,'itemaverage')
cbf_reg = CBF_regression(movies,Umatrix)
#als_umatrix = SGD(Umatrix,20,50)#ALS(Umatrix,20,50)#NMF_alg(Umatrix,20,'itemaverage',0.001)
#evaluate errors
for indx in xrange(len(vals_vecs)):
#e,c = SE(als_umatrix[teststartindx+indx],vals_vecs[indx])
#err_als += e
#cnt_als += c
u_preds = cbf_reg.CalcRatings(Umatrix[teststartindx+indx])
e,c = SE(u_preds,vals_vecs[indx])
err_cbfreg +=e
cnt_cbfreg +=c
e,c = SE(svd_matrix[teststartindx+indx],vals_vecs[indx])
err_svd +=e
cnt_svd +=c
#e,c = SE(svd_em_matrix[teststartindx+indx],vals_vecs[indx])
#err_svd_em +=e
#cnt_svd_em +=c
if cnt_svd==0: cnt_svd=1
if cnt_svd_em==0: cnt_svd_em=1
if cnt_als==0: cnt_als=1
if cnt_cbfreg==0: cnt_cbfreg=1
rmse_als = np.sqrt(err_als/float(cnt_als))
rmse_svd = np.sqrt(err_svd/float(cnt_svd))
rmse_svd_em = np.sqrt(err_svd_em/float(cnt_svd_em))
rmse_cbfreg = np.sqrt(err_cbfreg/float(cnt_cbfreg))
print 'svd rmse:',rmse_svd,'--',cnt_svd
#print 'svd_em rmse:',rmse_svd_em,'--',cnt_svd_em
#print 'als rmse:',rmse_als,'--',cnt_als
print 'cbfreg rmse:',rmse_cbfreg,'--',cnt_cbfreg
In [ ]:
#user_userbased rmse: 1.01381431911 -- 39972
#user_itembased rmse: 1.0301785707 -- 39972
#slope one rmse: 1.07792084094 -- 39972
#cbfcf rmse: 1.0134317593 --- 39972
#svd rmse: 1.0145666769 -- 39972
#cbfreg rmse: 1.09495415915 -- 39972
#NMF_alg rmse: 0.972259334147 -- 39972
#SVD EM rmse: 1.03845070461 -- 39972
#HYBRID SVD rmse: 1.01385133337 -- 39972
#ALS rmse: 2.58784908254 -- 39972
#SGD rmse: 1.35396020834 -- 39972
In [33]:
def ClassificationMetrics(vec_vals,vec_recs,likethreshold=3,shortlist=50,ratingsval=False,vec_test=None):
#convert vals in indxs vec
indxs_like = [i for i in xrange(len(vec_vals)) if vec_vals[i]>likethreshold]
indxs_dislike = [i for i in xrange(len(vec_vals)) if vec_vals[i]<=likethreshold and vec_vals[i]>0]
cnt = len(indxs_like)+len(indxs_dislike)
indxs_rec = []
if ratingsval:
#convert ratings into items's list
if vec_test==None:
raise 'Error no test vector'
indxs_rec = [i for i in xrange(len(vec_recs)) if vec_recs[i]>likethreshold and vec_test[i]<1][:shortlist]
else:
#consider only the first slot of recs
indxs_rec = vec_recs[:shortlist]
tp = len(set(indxs_rec).intersection(set(indxs_like)))
fp = len(set(indxs_rec).intersection(set(indxs_dislike)))
fn = len(set(indxs_like)^(set(indxs_rec).intersection(set(indxs_like))))
precision = 0.
if tp+fp>0:
precision = float(tp)/(tp+fp)
recall = 0.
if tp+fn>0:
recall = float(tp)/(tp+fn)
f1 = 0.
if recall+precision >0:
f1 = 2.*precision*recall/(precision+recall)
return np.array([precision,recall,f1]),cnt
In [61]:
tot_measures = np.zeros(3)
cnt_vals = 0.
#CF memory based
for i in xrange(nfolds):
Umatrix = df_trains[i].values[:,1:]
#cfitembased = CF_itembased(Umatrix)
#cfslopeone = SlopeOne(Umatrix)
#cbfcf = Hybrid_cbf_cf(movies,movieslist,Umatrix)
print 'fold:',i+1
tot_measures_fold = np.zeros(3)
vals_vecs = vals_vecs_folds[i]
tests_vecs = tests_vecs_folds[i]
for j in xrange(len(vals_vecs)):
u_vals = vals_vecs[j]
u_test = tests_vecs[j]
u_preds = CF_userbased(u_test,20,Umatrix)#cfslopeone.CalcRatings(u_test,5)#cfitembased.CalcRatings(u_test,5)#cbfcf.CalcRatings(u_test,20)
tmp_measures,cnt_tmp = ClassificationMetrics(u_vals,u_preds,3,50,True,u_test)
tot_measures_fold += tmp_measures
cnt_vals += cnt_tmp
tot_measures_fold /= float(len(vals_vecs))
print tot_measures_fold
tot_measures += tot_measures_fold
tot_measures /= float(nfolds)
print 'precision:',tot_measures[0],' recall:',tot_measures[1],' f1:',tot_measures[2],'---',cnt_vals
In [62]:
#CF_userbased precision: 0.595593265581 recall: 0.179752374596 f1: 0.260247197345 --- 39786.0
#CF_itembased precision: 0.573049057653 recall: 0.150154902908 f1: 0.224407731332 --- 39786.0
#SlopeOne precision: 0.572945843878 recall: 0.166998383035 f1: 0.24433916059 --- 39786.0
#Hybrid_cbf_cf precision: 0.600636639987 recall: 0.183293616752 f1: 0.26385405692 --- 39786.0
In [55]:
#CF model based
cnt_vals=0.
tot_measures = np.zeros(3)
for i in xrange(nfolds):
Umatrix = df_trains[i].values[:,1:]
print 'fold:',i+1
teststartindx = len(Umatrix)
vals_vecs = vals_vecs_folds[i]
tests_vecs = tests_vecs_folds[i]
for k in xrange(len(vals_vecs)):
u_vals = vals_vecs[k]
u_test = tests_vecs[k]
#add test vector to utility matrix
Umatrix = np.vstack([Umatrix,u_test])
#svd_matrix = SVD_EM(Umatrix,20,'useraverage',30)#SVD(Umatrix,20,'itemaverage') #Hybrid_svd(movies,movieslist,Umatrix,20,'useraverage').matrix#SGD(Umatrix,20,50)#ALS(Umatrix,20,50)
#matrix=NMF_alg(Umatrix,20,'useraverage')
#cbf_reg = CBF_regression(movies,Umatrix)
#cbf_av = CBF_averageprofile(movies,movieslist)
#llr = LogLikelihood(Umatrix,movieslist)
assrules = AssociationRules(Umatrix,movieslist)
tot_measures_fold = np.zeros(3)
for indx in xrange(len(vals_vecs)):
#u_preds = cbf_reg.CalcRatings(Umatrix[teststartindx+indx])#cbf_av.GetRecMovies(Umatrix[teststartindx+indx],True)
#u_preds = svd_matrix[teststartindx+indx]#matrix[teststartindx+indx]
u_preds = assrules.GetRecItems(Umatrix[teststartindx+indx],True)#llr.GetRecItems(Umatrix[teststartindx+indx],True)
tmp_measures,cnt_tmp = ClassificationMetrics(vals_vecs[indx],u_preds,3,50,False,Umatrix[teststartindx+indx])
tot_measures_fold += tmp_measures
cnt_vals += cnt_tmp
tot_measures_fold = tot_measures_fold/float(len(vals_vecs))
print tot_measures_fold
tot_measures += tot_measures_fold
tot_measures = tot_measures/float(nfolds)
print 'precision:',tot_measures[0],' recall:',tot_measures[1],' f1:',tot_measures[2],'---',cnt_vals
In [60]:
#llr precision: 0.632059422601 recall: 0.306911656684 f1: 0.389728618382 --- 39786.0
#Hybrid_svd precision: 0.540616355878 recall: 0.122568676777 f1: 0.188867837509 --- 39786.0
#als precision: 0.574768962349 recall: 0.154765744996 f1: 0.232415857722 --- 39786.0
#sgd precision: 0.522492554867 recall: 0.116681592379 f1: 0.182113478188 --- 39786.0
#SVD precision: 0.531278228807 recall: 0.119701346615 f1: 0.184269894611 --- 39786.0
#SVD-EM precision: 0.576567716327 recall: 0.159558142114 f1: 0.236321594653 --- 39786.0
#NMF_alg precision: 0.532487775416 recall: 0.125034210484 f1: 0.191971985488 --- 39786.0
#CBF_regression precision: 0.536374177877 recall: 0.128159010191 f1: 0.196055670058 --- 39786.0
#CBF_averageprofile precision: 0.561491582647 recall: 0.118988755524 f1: 0.185138199893 --- 39786.0
#AssociationRules precision: 0.679366124465 recall: 0.313083943066 f1: 0.404209869025 --- 39786.0
In [ ]: