In [5]:
%matplotlib inline
from random import random
import math
import numpy as np
import copy
from scipy import stats
import matplotlib.pyplot as plt
import pickle as pkl
from scipy.spatial import distance
import seaborn as sns
sns.set_style('darkgrid')
In [2]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles
    movies = {}
    rev_movies = {}
    for idx, line in enumerate(open(path + '/u.item', encoding='latin-1')):  # u.item is latin-1 encoded
        idx, title = line.split('|')[0:2]
        movies[idx] = title
        rev_movies[title] = idx
    # Load data
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs, rev_movies
In [3]:
data,movies = loadMovieLens("data/ml-100k")
Content example
In [28]:
data['3']
Out[28]:
In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u, i, data[u][i]])
    return np.array(d)
In [6]:
# splitting per user while avoiding reducing the dataset too much
def split_train_test(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            #print(data[u][movie])
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [7]:
def split_train_test_by_movies(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        for movie in data[u]:
            if random() < percent_test:
                test.setdefault(movie, {})
                test[movie][u] = data[u][movie]
            else:
                train.setdefault(movie, {})
                train[movie][u] = data[u][movie]
    return train, test
In [29]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
This split is used, for convenience, by the per-movie average baseline.
In [30]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
Cleaning
18 movies have no ratings at all, so test entries whose movie never appears in train are removed.
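A quick way to check that count, as a minimal sketch using the `data` and `movies` dictionaries loaded above (titles that never occur in any user's ratings):
In [ ]:
# count the movies from u.item that no user has rated
rated = set()
for u in data:
    rated.update(data[u].keys())
print(len([title for title in movies if title not in rated]))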
In [31]:
def deleteUnseenInTest(train, test):
    # iterate over a copy of the keys since we may pop from test
    for k in list(test.keys()):
        if k not in train:
            test.pop(k, None)
In [62]:
def deleteUnknowData(triplet_test, trainUsers, trainItems):
    to_Del = []
    for i, t in enumerate(triplet_test):
        if not t[0] in trainUsers:
            to_Del.append(i)
        elif not t[1] in trainItems:
            to_Del.append(i)
    return np.delete(triplet_test, to_Del, 0)
In [32]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
In [75]:
len(test)
Out[75]:
Structures used for fast evaluation
In [76]:
def getTriplet(data):
    triplet = []
    for u in data.keys():
        for i in data[u].keys():
            triplet.append([u, i, data[u][i]])
    return triplet

def getDataByUsers(triplet):
    dataByUsers = {}
    for t in triplet:
        if not t[0] in dataByUsers.keys():
            dataByUsers[t[0]] = {}
        dataByUsers[t[0]][t[1]] = float(t[2])
    return dataByUsers

def getDataByItems(triplet):
    dataByItems = {}
    for t in triplet:
        if not t[1] in dataByItems.keys():
            dataByItems[t[1]] = {}
        dataByItems[t[1]][t[0]] = float(t[2])
    return dataByItems

# split the whole set of triplets
def splitTrainTest(triplet, testProp):
    perm = np.random.permutation(triplet)
    splitIndex = int(testProp * len(triplet))
    return perm[splitIndex:], perm[:splitIndex]

# remove from the test set the data unknown in train
def deleteUnknowData(triplet_test, trainUsers, trainItems):
    to_Del = []
    for i, t in enumerate(triplet_test):
        if not t[0] in trainUsers:
            to_Del.append(i)
        elif not t[1] in trainItems:
            to_Del.append(i)
    return np.delete(triplet_test, to_Del, 0)
In [78]:
%%time
triplet = getTriplet(data)
# split 80% train 20% test
arrayTrain, arrayTest = splitTrainTest(triplet , 0.2)
# train
trainUsers = getDataByUsers(arrayTrain)
trainItems = getDataByItems(arrayTrain)
#print(len(arrayTest))
arrayTest = deleteUnknowData(arrayTest, trainUsers, trainItems)
#print(len(arrayTest))
# test
testUsers = getDataByUsers(arrayTest)
testItems = getDataByItems(arrayTest)
In [66]:
arrayAll = getRawArray(data)
arrayTrain = getRawArray(train)
arrayTest = getRawArray(test)
arrayTest = deleteUnknowData(arrayTest,train,m_train)
In [49]:
arrayTest[:10,:10]
Out[49]:
In [35]:
class baselineMeanUser:
    def __init__(self):
        self.users = {}
    def fit(self, train):
        for user in train.keys():
            note = 0.0
            for movie in train[user].keys():
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note
    def predict(self, users):
        return [self.users[u] for u in users]
In [36]:
baseline_mu = baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(arrayTest[:, 0])
print("Mean Error %0.6f" % (
    (np.array(pred) - np.array(arrayTest[:, 2], float)) ** 2).mean())
In [37]:
class baselineMeanMovie:
    def __init__(self):
        self.movies = {}
    def fit(self, train):
        for movie in train.keys():
            note = 0.0
            for user in train[movie].keys():
                note += train[movie][user]
            note = note / len(train[movie])
            self.movies[movie] = note
    def predict(self, movies):
        res = []
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                # unseen movie: fall back to the middle of the rating scale
                res.append(3)
        return res
In [38]:
baseline_mm = baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(arrayTest[:, 1])
print("Mean Error %0.6f" % (
    (np.array(pred) - np.array(arrayTest[:, 2], float)) ** 2).mean())
Raw matrices are used for convenience and clarity.
Structures such as scipy sparse matrices or Python dictionaries could be used for speedups.
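For illustration, a minimal sketch of the same ratings stored as a scipy CSR matrix, built from the `data` and `movies` dictionaries above (the `sparseRatings` name is just for this example; the rest of the notebook keeps dense numpy arrays):
In [ ]:
from scipy import sparse
rows, cols, vals = [], [], []
for u in data:
    for m in data[u]:
        rows.append(int(u) - 1)          # user ids are 1-based
        cols.append(int(movies[m]) - 1)  # movie ids are 1-based
        vals.append(data[u][m])
sparseRatings = sparse.csr_matrix((vals, (rows, cols)), shape=(len(data), 1682))
print(sparseRatings.shape, sparseRatings.nnz)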
Complete dataset
In [39]:
rawMatrix = np.zeros((len(data.keys()), 1682))
for u in data:
    for m in data[u]:
        rawMatrix[int(u) - 1][int(movies[m]) - 1] = data[u][m]
In [40]:
print(np.shape(rawMatrix))
rawMatrix[:5,:5]
Out[40]:
Train and test datasets
In [41]:
rawMatrixTrain = np.zeros((len(data.keys()), 1682))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u) - 1][int(movies[m]) - 1] = train[u][m]
rawMatrixTest = np.zeros((len(data.keys()), 1682))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u) - 1][int(movies[m]) - 1] = test[u][m]
In [42]:
#from scipy import linalg
def nmf(X, latent_features, max_iter=100, eps=1e-5, printevery=100):
    print("NMF with %d latent features, %d iterations." % (latent_features, max_iter))
    # mask used to ignore missing elements (coded as zero)
    mask = np.sign(X)
    # randomly initialized matrices
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvements
    #Y = linalg.lstsq(A, X)[0] # initializing that way as recommended in a blog post
    #Y = np.maximum(Y, eps) # avoiding too low values
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)   # used for prints
    masktrain = np.sign(rawMatrixTrain) # used for prints
    for i in range(1, max_iter + 1):
        top = np.dot(masked_X, Y.T)
        bottom = (np.dot((mask * np.dot(A, Y)), Y.T)) + eps
        A *= top / bottom
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom
        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err train %.05f" % (
                i, (q * q).sum() / masktest.sum(), (q_train * q_train).sum() / masktrain.sum()))
    return A, Y
In [44]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=5,printevery=1)
resMatrix = A.dot(Y)
We see that it quickly gets better than the baseline.
However, as shown below, it then overfits:
In [46]:
%%time
A,Y = nmf(rawMatrixTrain,50,eps = 1e-5,max_iter=500,printevery=100)
resMatrix = A.dot(Y)
This is due to the high sparsity of the matrix.
We can of course reduce the number of latent features to avoid overfitting, but that limits further improvements.
In [47]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=20)
resMatrix = A.dot(Y)
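Another cheap guard against overfitting (not used here) is to treat the number of iterations as a hyper-parameter and keep the run with the lowest held-out error. A minimal sketch of that idea on top of the `nmf` function above; the checkpoint grid is purely illustrative:
In [ ]:
# keep the factorization with the lowest held-out error over a small grid of iteration counts
maskTest = np.sign(rawMatrixTest)
best_err, best_AY = np.inf, None
for n_iter in (5, 20, 50):  # illustrative grid
    A_c, Y_c = nmf(rawMatrixTrain, 50, max_iter=n_iter, printevery=n_iter)
    q = maskTest * A_c.dot(Y_c) - rawMatrixTest
    err = (q * q).sum() / maskTest.sum()
    if err < best_err:
        best_err, best_AY = err, (A_c, Y_c)
# best_AY now holds the selected (A, Y) pair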
Despite good results in a few seconds on this dataset, this can only get us so far.
We then have to add regularization to the cost function.
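One standard way to do this, sketched below under the assumption of an L2 penalty lamb * (||A||^2 + ||Y||^2) on the masked squared error, keeps the multiplicative form by adding lamb * A (resp. lamb * Y) to the denominators. This `nmf_l2` helper is only an illustration; the model actually used further down switches to SGD instead:
In [ ]:
def nmf_l2(X, latent_features, lamb=0.1, max_iter=100, eps=1e-5):
    # same multiplicative updates as nmf(), with an L2 term added to the denominators
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    masked_X = mask * X
    for i in range(max_iter):
        A *= np.dot(masked_X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + lamb * A + eps)
        Y *= np.dot(A.T, masked_X) / (np.dot(A.T, mask * np.dot(A, Y)) + lamb * Y + eps)
    return A, Y
# example: A, Y = nmf_l2(rawMatrixTrain, 50, lamb=0.1)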
In [50]:
## This class is used to make predictions
class evalMF:
    def __init__(self, resMatrix, dicU, dicI):
        self.resMatrix = resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
    def predict(self, user, movie):
        return self.resMatrix[int(user) - 1][int(self.dicI[movie]) - 1]
In [51]:
mf = evalMF(resMatrix,data,movies)
In [52]:
# np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in evalArrayTest]).mean()
# faster evaluation
masqueTest=np.sign(rawMatrixTest)
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
Out[52]:
Let's see some predictions
In [53]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["I.Q. (1994)"]
print mf.predict("1","I.Q. (1994)")
We usually see a large difference between users, so we need to take the user bias into account.
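A quick way to see this user effect is to look at the spread of the per-user average ratings; a minimal sketch on the full `data` dictionary (the `userMeans` name is just for this example):
In [ ]:
# spread of per-user mean ratings: motivates a per-user bias term
userMeans = np.array([np.mean(list(data[u].values())) for u in data])
print("min %.2f / median %.2f / max %.2f" % (userMeans.min(), np.median(userMeans), userMeans.max()))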
In [54]:
summ = 0
for i in data["1"]:
    summ += (float(data["1"][i]) - mf.predict("1", i)) ** 2
summ / len(data["1"])
Out[54]:
In [55]:
summ = 0
for i in data["3"]:
    summ += (float(data["3"][i]) - mf.predict("3", i)) ** 2
summ / len(data["3"])
Out[55]:
We have not been very successful at incorporating the bias and L1 regularization into that implementation...
We build a simpler model below, and then add the regularization and bias.
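Concretely, the model below predicts rhat(u,i) = mu + b_u + b_i + p_u . q_i and minimizes the squared error plus an L2 penalty on all parameters by stochastic gradient descent: for each sampled rating, with err = r(u,i) - rhat(u,i), each parameter theta is updated as theta <- (1 - lamb * epsilon) * theta + 2 * epsilon * err * (its gradient factor); the (1 - lamb * epsilon) factor is the weight-decay form of the regularization.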
In [57]:
class FactoMatriceBiais():
    def __init__(self, k, epsilon=1e-3, nbIter=2000, lamb=0.5):
        self.k = k
        self.lamb = lamb
        self.epsilon = epsilon
        self.nbIter = nbIter
    def fit(self, trainUsers, trainItems, triplet):
        self.p = {}
        self.q = {}
        self.bu = {}  # user biases
        self.bi = {}  # item biases
        self.mu = np.random.random() * 2 - 1
        for j in range(len(triplet)):  # randomly initialize the parameters of every user/item seen in train
            u = triplet[j][0]
            i = triplet[j][1]
            if not u in self.p:
                self.p[u] = np.random.rand(1, self.k)  # row vector for a user
                self.bu[u] = np.random.rand() * 2 - 1
            if not i in self.q:
                self.q[i] = np.random.rand(self.k, 1)  # column vector for an item
                self.bi[i] = np.random.rand() * 2 - 1
        loss = []
        for it in range(self.nbIter):
            ind = np.random.randint(len(triplet))
            u = triplet[ind][0]
            i = triplet[ind][1]
            tmp = trainUsers[u][i] - (self.mu + self.bi[i] + self.bu[u] + self.p[u].dot(self.q[i])[0][0])
            self.p[u] = (1 - self.lamb * self.epsilon) * self.p[u] + self.epsilon * 2 * tmp * self.q[i].transpose()
            self.bu[u] = (1 - self.lamb * self.epsilon) * self.bu[u] + self.epsilon * 2 * tmp
            self.q[i] = (1 - self.lamb * self.epsilon) * self.q[i] + self.epsilon * 2 * tmp * self.p[u].transpose()
            self.bi[i] = (1 - self.lamb * self.epsilon) * self.bi[i] + self.epsilon * 2 * tmp
            self.mu = (1 - self.lamb * self.epsilon) * self.mu + self.epsilon * 2 * tmp
            loss.append(tmp * tmp)  # error without regularization
            #loss.append(tmp**2 + self.lamb * (np.linalg.norm(self.p[u]).sum()**2 + np.linalg.norm(self.q[i]).sum()**2))
            if (it % (self.nbIter * 0.2) == 0):
                print("iteration:", it)
                print("loss:", np.mean(loss))
                print("-------")
                loss = []
    def predict(self, triplet_test):
        pred = np.zeros(len(triplet_test))
        for ind, t in enumerate(triplet_test):
            pred[ind] = self.mu + self.bu[t[0]] + self.bi[t[1]] + self.p[t[0]].dot(self.q[t[1]])[0][0]
        return pred
    def score(self, triplet_test):
        return ((self.predict(triplet_test) - np.array(triplet_test[:, 2], float)) ** 2).mean()
In [83]:
%%time
k = 10
epsilon = 7e-3
nbIter = 20 * len(arrayTrain)
lamb = 0.2
model = FactoMatriceBiais(k, epsilon=epsilon, nbIter=nbIter, lamb=lamb)
model.fit(trainUsers, trainItems, arrayTrain)
print("test error:", model.score(arrayTest))
In [32]:
class tSNE():
    def __init__(self, perp, nIter, lr, moment, dim=2):
        self.perp = perp  # between 5 and 50
        self.nIter = nIter
        self.lr = lr
        self.moment = moment
        self.dim = dim
    def fit(self, data):
        nEx = np.shape(data)[0]
        # matrix of squared distances ||xi - xj||^2 #
        normx = np.sum(data**2, 1)
        normx = np.reshape(normx, (1, nEx))
        distancex = normx + normx.T - 2 * data.dot(data.T)
        # computing the sigmas ------------------------------------------------------------#
        lperp = np.log2(self.perp)
        # initialize the bounds for the binary search #
        sup = np.ones((nEx, 1)) * np.max(distancex)
        inf = np.zeros((nEx, 1))
        self.sigma = (sup + inf) / 2.
        # binary search #
        stop = False
        while not stop:
            # compute the matrix of p(i|j)
            self.pcond = np.exp(-distancex / (2. * (self.sigma**2)))
            self.pcond = self.pcond / np.sum(self.pcond - np.eye(nEx), 1).reshape(nEx, 1)
            # compute the entropy of p(i|j)
            entropy = -np.sum(self.pcond * np.log2(self.pcond), 0)
            # update the bounds
            # if sigma must be increased
            up = entropy < lperp
            inf[up, 0] = self.sigma[up, 0]
            # if sigma must be decreased
            down = entropy > lperp
            sup[down, 0] = self.sigma[down, 0]
            # update sigma and check the stopping criterion
            old = self.sigma
            self.sigma = ((sup + inf) / 2.)
            if np.max(np.abs(old - self.sigma)) < 1e-5:
                stop = True
            #print(np.exp(entropy))
            #print(self.sigma.T)
        #--------------------------------------------------------------------------#
        # initialize y
        self.embeddings = np.zeros((self.nIter + 2, nEx, self.dim))
        self.embeddings[1] = np.random.randn(nEx, self.dim) * 1e-4
        #--------------------------------------------------------------------------#
        # p(ij)
        self.pij = (self.pcond + self.pcond.T) / (2. * nEx)
        np.fill_diagonal(self.pij, 0)
        # gradient descent
        #loss = []
        for t in range(1, self.nIter + 1):
            # distance matrix in the embedding space
            normy = np.sum((self.embeddings[t]**2), 1)
            normy = np.reshape(normy, (1, nEx))
            distancey = normy + normy.T - 2 * self.embeddings[t].dot(self.embeddings[t].T)
            # q(ij)
            # self.qij = (distancey.sum() + nEx*(nEx-1)) / (1 + distancey)
            # np.fill_diagonal(self.qij, 0)
            self.qij = 1 / (1 + distancey)
            np.fill_diagonal(self.qij, 0)
            self.qij = self.qij / self.qij.sum()
            # gradient step with momentum
            yt = self.embeddings[t]
            tmpgrad = 4 * ((self.pij - self.qij) / (1 + distancey)).reshape(nEx, nEx, 1)
            for i in range(nEx):
                dy = (tmpgrad[i] * (yt[i] - yt)).sum(0)
                self.embeddings[t+1][i] = yt[i] - self.lr * dy + self.moment * (yt[i] - self.embeddings[t-1, i])
            #l = stats.entropy(self.qij, self.pij, 2).mean()
            #loss.append(l)
            #if (t % 100 == 0):
            #    print(t, l)
            #if (t % 100 == 0):
            #    print(t)
In [36]:
X_ini = np.vstack([data.data[data.target==i]
for i in range(10)])
cols = np.hstack([data.target[data.target==i]
for i in range(10)])
In [41]:
%%time
from sklearn import datasets
from scipy import stats
data = datasets.load_digits()
model = tSNE(10,500,1000,0)
model.fit(X_ini)
In [42]:
palette = np.array(sns.color_palette("hls", 10))
t = np.shape(model.embeddings)[0] -1
# We create a scatter plot.
f = plt.figure(figsize=(8, 8))
ax = plt.subplot(aspect='equal')
sc = ax.scatter(model.embeddings[t,:,0], model.embeddings[t,:,1], lw=0, s=40,
                c=palette[cols.astype(int)])
plt.xlim(-25, 25)
plt.ylim(-25, 25)
ax.axis('off')
ax.axis('tight')
#plt.plot(mod.embedding_[12][0],mod.embedding_[12][1], 'bv')
plt.show()
For reference, let's compare it with sklearn's TSNE
In [6]:
from sklearn.manifold import TSNE
In [43]:
mod = TSNE(random_state=1337)
In [44]:
%%time
X = mod.fit_transform(X_ini)
In [45]:
palette = np.array(sns.color_palette("hls", 10))
# We create a scatter plot.
f = plt.figure(figsize=(8, 8))
ax = plt.subplot(aspect='equal')
sc = ax.scatter(X[:,0], X[:,1], lw=0, s=40,
                c=palette[cols.astype(int)])
plt.xlim(-25, 25)
plt.ylim(-25, 25)
ax.axis('off')
ax.axis('tight')
#plt.plot(mod.embedding_[12][0],mod.embedding_[12][1], 'bv')
plt.show()
It produces similar results, and is faster, as expected.