In [5]:
%matplotlib inline
from random import random
import math
import numpy as np
import copy
from scipy import stats
import matplotlib.pyplot as plt
import pickle as pkl
from scipy.spatial import distance
import seaborn as sns
sns.set_style('darkgrid')
In [2]:
def loadMovieLens(path='./data/movielens'):
    # Get movie titles
    movies = {}
    rev_movies = {}
    for idx, line in enumerate(open(path + '/u.item', encoding='latin-1')):  # u.item is latin-1 encoded
        idx, title = line.split('|')[0:2]
        movies[idx] = title
        rev_movies[title] = idx
    # Load data
    prefs = {}
    for line in open(path + '/u.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs, rev_movies
In [3]:
data,movies = loadMovieLens("data/ml-100k")
Content example
In [28]:
data['3']
Out[28]:
In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u, i, data[u][i]])
    return np.array(d)
In [6]:
# splitting per user while avoiding reducing the dataset too much
def split_train_test(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        test.setdefault(u, {})
        train.setdefault(u, {})
        for movie in data[u]:
            #print(data[u][movie])
            if random() < percent_test:
                test[u][movie] = data[u][movie]
            else:
                train[u][movie] = data[u][movie]
    return train, test
In [7]:
def split_train_test_by_movies(data, percent_test):
    test = {}
    train = {}
    for u in data.keys():
        for movie in data[u]:
            if random() < percent_test:
                test.setdefault(movie, {})
                test[movie][u] = data[u][movie]
            else:
                train.setdefault(movie, {})
                train[movie][u] = data[u][movie]
    return train, test
In [29]:
percent_test=0.2
train,test=split_train_test(data,percent_test)
This split is used, for convenience, by the per-movie average baseline.
In [30]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)
Cleaning
18 movies have no ratings at all, so test entries whose movie never appears in train are removed.
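A quick way to check that count, as a minimal sketch using the `data` and `movies` dictionaries loaded above (titles that never occur in any user's ratings):
In [ ]:
# count the movies from u.item that no user has rated
rated = set()
for u in data:
    rated.update(data[u].keys())
print(len([title for title in movies if title not in rated]))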
In [31]:
def deleteUnseenInTest(train, test):
    # iterate over a copy of the keys since we may pop from test
    for k in list(test.keys()):
        if k not in train:
            test.pop(k, None)
In [62]:
def deleteUnknowData(triplet_test, trainUsers, trainItems):
    to_Del = []
    for i, t in enumerate(triplet_test):
        if not t[0] in trainUsers:
            to_Del.append(i)
        elif not t[1] in trainItems:
            to_Del.append(i)
    return np.delete(triplet_test, to_Del, 0)
In [32]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)
In [75]:
len(test)
Out[75]:
Structures used for fast evaluation
In [76]:
def getTriplet(data):
    triplet = []
    for u in data.keys():
        for i in data[u].keys():
            triplet.append([u, i, data[u][i]])
    return triplet

def getDataByUsers(triplet):
    dataByUsers = {}
    for t in triplet:
        if not t[0] in dataByUsers.keys():
            dataByUsers[t[0]] = {}
        dataByUsers[t[0]][t[1]] = float(t[2])
    return dataByUsers

def getDataByItems(triplet):
    dataByItems = {}
    for t in triplet:
        if not t[1] in dataByItems.keys():
            dataByItems[t[1]] = {}
        dataByItems[t[1]][t[0]] = float(t[2])
    return dataByItems

# split the whole set of triplets
def splitTrainTest(triplet, testProp):
    perm = np.random.permutation(triplet)
    splitIndex = int(testProp * len(triplet))
    return perm[splitIndex:], perm[:splitIndex]

# remove from the test set the data unknown in train
def deleteUnknowData(triplet_test, trainUsers, trainItems):
    to_Del = []
    for i, t in enumerate(triplet_test):
        if not t[0] in trainUsers:
            to_Del.append(i)
        elif not t[1] in trainItems:
            to_Del.append(i)
    return np.delete(triplet_test, to_Del, 0)
In [78]:
%%time
triplet = getTriplet(data)
# split 80% train 20% test
arrayTrain, arrayTest = splitTrainTest(triplet , 0.2)
# train
trainUsers = getDataByUsers(arrayTrain)
trainItems = getDataByItems(arrayTrain)
#print(len(arrayTest))
arrayTest = deleteUnknowData(arrayTest, trainUsers, trainItems)
#print(len(arrayTest))
# test
testUsers = getDataByUsers(arrayTest)
testItems = getDataByItems(arrayTest)
In [66]:
arrayAll = getRawArray(data)
arrayTrain = getRawArray(train)
arrayTest = getRawArray(test)
arrayTest = deleteUnknowData(arrayTest,train,m_train)
In [49]:
arrayTest[:10,:10]
Out[49]:
In [35]:
class baselineMeanUser:
    def __init__(self):
        self.users = {}
    def fit(self, train):
        for user in train.keys():
            note = 0.0
            for movie in train[user].keys():
                note += train[user][movie]
            note = note / len(train[user])
            self.users[user] = note
    def predict(self, users):
        return [self.users[u] for u in users]
In [36]:
baseline_mu = baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(arrayTest[:, 0])
print("Mean Error %0.6f" % (
    (np.array(pred) - np.array(arrayTest[:, 2], float)) ** 2).mean())
In [37]:
class baselineMeanMovie:
    def __init__(self):
        self.movies = {}
    def fit(self, train):
        for movie in train.keys():
            note = 0.0
            for user in train[movie].keys():
                note += train[movie][user]
            note = note / len(train[movie])
            self.movies[movie] = note
    def predict(self, movies):
        res = []
        for m in movies:
            try:
                res.append(self.movies[m])
            except KeyError:
                # unseen movie: fall back to the middle of the rating scale
                res.append(3)
        return res
In [38]:
baseline_mm = baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(arrayTest[:, 1])
print("Mean Error %0.6f" % (
    (np.array(pred) - np.array(arrayTest[:, 2], float)) ** 2).mean())
Raw matrices are used for convenience and clarity.
Structures such as scipy sparse matrices or Python dictionaries could be used for speedups.
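For illustration, a minimal sketch of the same ratings stored as a scipy CSR matrix, built from the `data` and `movies` dictionaries above (the `sparseRatings` name is just for this example; the rest of the notebook keeps dense numpy arrays):
In [ ]:
from scipy import sparse
rows, cols, vals = [], [], []
for u in data:
    for m in data[u]:
        rows.append(int(u) - 1)          # user ids are 1-based
        cols.append(int(movies[m]) - 1)  # movie ids are 1-based
        vals.append(data[u][m])
sparseRatings = sparse.csr_matrix((vals, (rows, cols)), shape=(len(data), 1682))
print(sparseRatings.shape, sparseRatings.nnz)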
Complete dataset
In [39]:
rawMatrix = np.zeros((len(data.keys()), 1682))
for u in data:
    for m in data[u]:
        rawMatrix[int(u) - 1][int(movies[m]) - 1] = data[u][m]
In [40]:
print(np.shape(rawMatrix))
rawMatrix[:5,:5]
Out[40]:
Train and test datasets
In [41]:
rawMatrixTrain = np.zeros((len(data.keys()), 1682))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u) - 1][int(movies[m]) - 1] = train[u][m]
rawMatrixTest = np.zeros((len(data.keys()), 1682))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u) - 1][int(movies[m]) - 1] = test[u][m]
In [42]:
#from scipy import linalg
def nmf(X, latent_features, max_iter=100, eps=1e-5, printevery=100):
    print("NMF with %d latent features, %d iterations." % (latent_features, max_iter))
    # mask used to ignore missing elements (coded as zero)
    mask = np.sign(X)
    # randomly initialized matrices
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    # Not used as I couldn't find significant improvements
    #Y = linalg.lstsq(A, X)[0] # initializing that way as recommended in a blog post
    #Y = np.maximum(Y, eps) # avoiding too low values
    masked_X = mask * X
    masktest = np.sign(rawMatrixTest)   # used for prints
    masktrain = np.sign(rawMatrixTrain) # used for prints
    for i in range(1, max_iter + 1):
        top = np.dot(masked_X, Y.T)
        bottom = (np.dot((mask * np.dot(A, Y)), Y.T)) + eps
        A *= top / bottom
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom
        # evaluation
        if i % printevery == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            q = masktest * X_est - rawMatrixTest
            q_train = masktrain * X_est - rawMatrixTrain
            print("Iteration %d, Err %.05f, Err train %.05f" % (
                i, (q * q).sum() / masktest.sum(), (q_train * q_train).sum() / masktrain.sum()))
    return A, Y
In [44]:
%%time
A,Y = nmf(rawMatrixTrain,100,eps = 1e-5,max_iter=5,printevery=1)
resMatrix = A.dot(Y)
We see that it quickly gets better than the baseline.
However, as shown below, it then overfits:
In [46]:
%%time
A,Y = nmf(rawMatrixTrain,50,eps = 1e-5,max_iter=500,printevery=100)
resMatrix = A.dot(Y)
This is due to the high sparsity of the matrix.
We can of course reduce the number of latent features to avoid overfitting, but that limits further improvements.
In [47]:
%%time
A,Y = nmf(rawMatrixTrain,1,eps = 1e-5,max_iter=100,printevery=20)
resMatrix = A.dot(Y)
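Another cheap guard against overfitting (not used here) is to treat the number of iterations as a hyper-parameter and keep the run with the lowest held-out error. A minimal sketch of that idea on top of the `nmf` function above; the checkpoint grid is purely illustrative:
In [ ]:
# keep the factorization with the lowest held-out error over a small grid of iteration counts
maskTest = np.sign(rawMatrixTest)
best_err, best_AY = np.inf, None
for n_iter in (5, 20, 50):  # illustrative grid
    A_c, Y_c = nmf(rawMatrixTrain, 50, max_iter=n_iter, printevery=n_iter)
    q = maskTest * A_c.dot(Y_c) - rawMatrixTest
    err = (q * q).sum() / maskTest.sum()
    if err < best_err:
        best_err, best_AY = err, (A_c, Y_c)
# best_AY now holds the selected (A, Y) pair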
Despite good results in a few seconds on this dataset, this can only get us so far.
We then have to add regularization to the cost function.
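One standard way to do this, sketched below under the assumption of an L2 penalty lamb * (||A||^2 + ||Y||^2) on the masked squared error, keeps the multiplicative form by adding lamb * A (resp. lamb * Y) to the denominators. This `nmf_l2` helper is only an illustration; the model actually used further down switches to SGD instead:
In [ ]:
def nmf_l2(X, latent_features, lamb=0.1, max_iter=100, eps=1e-5):
    # same multiplicative updates as nmf(), with an L2 term added to the denominators
    mask = np.sign(X)
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    Y = np.random.rand(latent_features, columns)
    masked_X = mask * X
    for i in range(max_iter):
        A *= np.dot(masked_X, Y.T) / (np.dot(mask * np.dot(A, Y), Y.T) + lamb * A + eps)
        Y *= np.dot(A.T, masked_X) / (np.dot(A.T, mask * np.dot(A, Y)) + lamb * Y + eps)
    return A, Y
# example: A, Y = nmf_l2(rawMatrixTrain, 50, lamb=0.1)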
In [50]:
## This class is used to make predictions
class evalMF:
    def __init__(self, resMatrix, dicU, dicI):
        self.resMatrix = resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
    def predict(self, user, movie):
        return self.resMatrix[int(user) - 1][int(self.dicI[movie]) - 1]
In [51]:
mf = evalMF(resMatrix,data,movies)
In [52]:
# np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in evalArrayTest]).mean()
# faster evaluation
masqueTest=np.sign(rawMatrixTest)
q = masqueTest*resMatrix - rawMatrixTest
(q*q).sum()/ masqueTest.sum()
Out[52]:
Let's see some predictions
In [53]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["I.Q. (1994)"]
print mf.predict("1","I.Q. (1994)")
We usually see a large difference between users, so we need to take the user bias into account.
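A quick way to see this user effect is to look at the spread of the per-user average ratings; a minimal sketch on the full `data` dictionary (the `userMeans` name is just for this example):
In [ ]:
# spread of per-user mean ratings: motivates a per-user bias term
userMeans = np.array([np.mean(list(data[u].values())) for u in data])
print("min %.2f / median %.2f / max %.2f" % (userMeans.min(), np.median(userMeans), userMeans.max()))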
In [54]:
summ = 0
for i in data["1"]:
    summ += (float(data["1"][i]) - mf.predict("1", i)) ** 2
summ / len(data["1"])
Out[54]:
In [55]:
summ = 0
for i in data["3"]:
    summ += (float(data["3"][i]) - mf.predict("3", i)) ** 2
summ / len(data["3"])
Out[55]:
We have not been very successful at incorporating the bias and L1 regularization into that implementation...
We build a simpler model below, and then add the regularization and bias.
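Concretely, the model below predicts rhat(u,i) = mu + b_u + b_i + p_u . q_i and minimizes the squared error plus an L2 penalty on all parameters by stochastic gradient descent: for each sampled rating, with err = r(u,i) - rhat(u,i), each parameter theta is updated as theta <- (1 - lamb * epsilon) * theta + 2 * epsilon * err * (its gradient factor); the (1 - lamb * epsilon) factor is the weight-decay form of the regularization.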
In [57]:
class FactoMatriceBiais():
    def __init__(self, k, epsilon=1e-3, nbIter=2000, lamb=0.5):
        self.k = k
        self.lamb = lamb
        self.epsilon = epsilon
        self.nbIter = nbIter
    def fit(self, trainUsers, trainItems, triplet):
        self.p = {}
        self.q = {}
        self.bu = {}  # user biases
        self.bi = {}  # item biases
        self.mu = np.random.random() * 2 - 1
        for j in range(len(triplet)):  # randomly initialize the parameters of every user/item seen in train
            u = triplet[j][0]
            i = triplet[j][1]
            if not u in self.p:
                self.p[u] = np.random.rand(1, self.k)  # row vector for a user
                self.bu[u] = np.random.rand() * 2 - 1
            if not i in self.q:
                self.q[i] = np.random.rand(self.k, 1)  # column vector for an item
                self.bi[i] = np.random.rand() * 2 - 1
        loss = []
        for it in range(self.nbIter):
            ind = np.random.randint(len(triplet))
            u = triplet[ind][0]
            i = triplet[ind][1]
            tmp = trainUsers[u][i] - (self.mu + self.bi[i] + self.bu[u] + self.p[u].dot(self.q[i])[0][0])
            self.p[u] = (1 - self.lamb * self.epsilon) * self.p[u] + self.epsilon * 2 * tmp * self.q[i].transpose()
            self.bu[u] = (1 - self.lamb * self.epsilon) * self.bu[u] + self.epsilon * 2 * tmp
            self.q[i] = (1 - self.lamb * self.epsilon) * self.q[i] + self.epsilon * 2 * tmp * self.p[u].transpose()
            self.bi[i] = (1 - self.lamb * self.epsilon) * self.bi[i] + self.epsilon * 2 * tmp
            self.mu = (1 - self.lamb * self.epsilon) * self.mu + self.epsilon * 2 * tmp
            loss.append(tmp * tmp)  # error without regularization
            #loss.append(tmp**2 + self.lamb * (np.linalg.norm(self.p[u]).sum()**2 + np.linalg.norm(self.q[i]).sum()**2))
            if (it % (self.nbIter * 0.2) == 0):
                print("iteration:", it)
                print("loss:", np.mean(loss))
                print("-------")
                loss = []
    def predict(self, triplet_test):
        pred = np.zeros(len(triplet_test))
        for ind, t in enumerate(triplet_test):
            pred[ind] = self.mu + self.bu[t[0]] + self.bi[t[1]] + self.p[t[0]].dot(self.q[t[1]])[0][0]
        return pred
    def score(self, triplet_test):
        return ((self.predict(triplet_test) - np.array(triplet_test[:, 2], float)) ** 2).mean()
In [83]:
%%time
k = 10
epsilon = 7e-3
nbIter = 20 * len(arrayTrain)
lamb = 0.2
model = FactoMatriceBiais(k, epsilon=epsilon, nbIter=nbIter, lamb=lamb)
model.fit(trainUsers, trainItems, arrayTrain)
print("test error:", model.score(arrayTest))
In [32]:
class tSNE():
    def __init__(self, perp, nIter, lr, moment, dim=2):
        self.perp = perp  # between 5 and 50
        self.nIter = nIter
        self.lr = lr
        self.moment = moment
        self.dim = dim
    def fit(self, data):
        nEx = np.shape(data)[0]
        # matrix of squared distances ||xi - xj||^2 #
        normx = np.sum(data**2, 1)
        normx = np.reshape(normx, (1, nEx))
        distancex = normx + normx.T - 2 * data.dot(data.T)
        # computing the sigmas ------------------------------------------------------------#
        lperp = np.log2(self.perp)
        # initialize the bounds for the binary search #
        sup = np.ones((nEx, 1)) * np.max(distancex)
        inf = np.zeros((nEx, 1))
        self.sigma = (sup + inf) / 2.
        # binary search #
        stop = False
        while not stop:
            # compute the matrix of p(i|j)
            self.pcond = np.exp(-distancex / (2. * (self.sigma**2)))
            self.pcond = self.pcond / np.sum(self.pcond - np.eye(nEx), 1).reshape(nEx, 1)
            # compute the entropy of p(i|j)
            entropy = -np.sum(self.pcond * np.log2(self.pcond), 0)
            # update the bounds
            # if sigma must be increased
            up = entropy < lperp
            inf[up, 0] = self.sigma[up, 0]
            # if sigma must be decreased
            down = entropy > lperp
            sup[down, 0] = self.sigma[down, 0]
            # update sigma and check the stopping criterion
            old = self.sigma
            self.sigma = ((sup + inf) / 2.)
            if np.max(np.abs(old - self.sigma)) < 1e-5:
                stop = True
            #print(np.exp(entropy))
            #print(self.sigma.T)
        #--------------------------------------------------------------------------#
        # initialize y
        self.embeddings = np.zeros((self.nIter + 2, nEx, self.dim))
        self.embeddings[1] = np.random.randn(nEx, self.dim) * 1e-4
        #--------------------------------------------------------------------------#
        # p(ij)
        self.pij = (self.pcond + self.pcond.T) / (2. * nEx)
        np.fill_diagonal(self.pij, 0)
        # gradient descent
        #loss = []
        for t in range(1, self.nIter + 1):
            # distance matrix in the embedding space
            normy = np.sum((self.embeddings[t]**2), 1)
            normy = np.reshape(normy, (1, nEx))
            distancey = normy + normy.T - 2 * self.embeddings[t].dot(self.embeddings[t].T)
            # q(ij)
            # self.qij = (distancey.sum() + nEx*(nEx-1)) / (1 + distancey)
            # np.fill_diagonal(self.qij, 0)
            self.qij = 1 / (1 + distancey)
            np.fill_diagonal(self.qij, 0)
            self.qij = self.qij / self.qij.sum()
            # gradient step with momentum
            yt = self.embeddings[t]
            tmpgrad = 4 * ((self.pij - self.qij) / (1 + distancey)).reshape(nEx, nEx, 1)
            for i in range(nEx):
                dy = (tmpgrad[i] * (yt[i] - yt)).sum(0)
                self.embeddings[t+1][i] = yt[i] - self.lr * dy + self.moment * (yt[i] - self.embeddings[t-1, i])
            #l = stats.entropy(self.qij, self.pij, 2).mean()
            #loss.append(l)
            #if (t % 100 == 0):
            #    print(t, l)
            #if (t % 100 == 0):
            #    print(t)
In [36]:
X_ini = np.vstack([data.data[data.target==i]
for i in range(10)])
cols = np.hstack([data.target[data.target==i]
for i in range(10)])
In [41]:
%%time
from sklearn import datasets
from scipy import stats
data = datasets.load_digits()
model = tSNE(10,500,1000,0)
model.fit(X_ini)
In [42]:
palette = np.array(sns.color_palette("hls", 10))
t = np.shape(model.embeddings)[0] -1
# We create a scatter plot.
f = plt.figure(figsize=(8, 8))
ax = plt.subplot(aspect='equal')
sc = ax.scatter(model.embeddings[t,:,0], model.embeddings[t,:,1], lw=0, s=40,
                c=palette[cols.astype(int)])
plt.xlim(-25, 25)
plt.ylim(-25, 25)
ax.axis('off')
ax.axis('tight')
#plt.plot(mod.embedding_[12][0],mod.embedding_[12][1], 'bv')
plt.show()
For reference, let's compare it with sklearn's TSNE
In [6]:
from sklearn.manifold import TSNE
In [43]:
mod = TSNE(random_state=1337)
In [44]:
%%time
X = mod.fit_transform(X_ini)
In [45]:
palette = np.array(sns.color_palette("hls", 10))
# We create a scatter plot.
f = plt.figure(figsize=(8, 8))
ax = plt.subplot(aspect='equal')
sc = ax.scatter(X[:,0], X[:,1], lw=0, s=40,
                c=palette[cols.astype(int)])
plt.xlim(-25, 25)
plt.ylim(-25, 25)
ax.axis('off')
ax.axis('tight')
#plt.plot(mod.embedding_[12][0],mod.embedding_[12][1], 'bv')
plt.show()
It produces similar results, and is faster, as expected.