UMAP Experiments



In [1]:

    
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and library warnings raised by later cells;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')



In [2]:

    
%matplotlib inline



In [3]:

    
import numpy as np



In [27]:

    
# Toy 2-D point sets for a first look at UMAP; exactly one assignment to X is active.

# Variant "noisy line": the roughly collinear points plus one off-line point (4, -1)
# X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2], [4, 3], [4, -1]])

# Variant "clear line" (active): the roughly collinear points plus the origin
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
    [4, 3],
    [0, 0],
])

# Variant "V shape": two arms meeting near the origin
# X = np.array([[-1, 1], [-2, 2], [-3, 3], [1, 1], [2, 2], [3, 3], [4, 4]])



In [28]:

    
# Echo the toy array to confirm which variant is active.
X









    Out[28]:





array([[-1, -1],
       [-2, -1],
       [-3, -2],
       [ 1,  1],
       [ 2,  1],
       [ 3,  2],
       [ 4,  3],
       [ 0,  0]])



In [29]:

    
import matplotlib.pyplot as plt



In [30]:

    
# Square 10x10-inch canvas for the raw toy points.
plt.figure(figsize=(10, 10))
# Column 0 goes on the horizontal axis, column 1 on the vertical axis.
plt.scatter(x=X[:, 0], y=X[:, 1])
# plt.savefig('original.png')









    Out[30]:





<matplotlib.collections.PathCollection at 0x1e7304487b8>



In [31]:

    
# Imported here because this is the first cell that uses `umap`; without it a fresh
# "Restart & Run All" stops with a NameError before any later import is reached.
import umap

# Reducer with default settings (the repr echoed by fit() shows n_components=2).
umap_clf = umap.UMAP()



In [32]:

    
# Fit the reducer on the toy points; fit() returns the estimator, so its settings are echoed below.
# NOTE(review): X has only 8 rows while the estimator's n_neighbors is 15 (see the repr).
# Presumably UMAP clamps the value and warns, and warnings are silenced at the top of the
# notebook — confirm against the umap-learn docs.
umap_clf.fit(X)









    Out[32]:





UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=15, negative_sample_rate=5, random_state=None,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)



In [33]:

    
# Push the same points that were used for fitting through the fitted model.
X_transformed = umap_clf.transform(X)
# Echo the embedded coordinates.
X_transformed









    Out[33]:





array([[ 3.6886566, 17.00542  ],
       [ 4.5674624, 17.548746 ],
       [ 4.6304417, 16.833479 ],
       [ 4.400698 , 18.657553 ],
       [ 4.2689643, 19.575293 ],
       [ 3.3750377, 19.867044 ],
       [ 3.410143 , 19.179314 ],
       [ 3.7172391, 17.998352 ]], dtype=float32)



In [34]:

    
# Square 10x10-inch canvas for the embedded toy points.
plt.figure(figsize=(10, 10))
# First embedding component on the horizontal axis, second on the vertical axis.
plt.scatter(x=X_transformed[:, 0], y=X_transformed[:, 1])
# plt.savefig('reduced.png')









    Out[34]:





<matplotlib.collections.PathCollection at 0x1e72ffa6160>



In [40]:

    
# Rebind umap_clf to a fresh estimator that reduces to a single output dimension.
umap_clf = umap.UMAP(n_components=1)



In [43]:

    
# Fit and embed in one call; at this point X is still the 8-point toy array.
# This overwrites the 2-D embedding previously stored in X_transformed.
X_transformed = umap_clf.fit_transform(X)



In [44]:

    
# Echo the one-column embedding.
X_transformed









    Out[44]:





array([[-16.016811],
       [-16.54715 ],
       [-17.116419],
       [-18.691952],
       [-18.05029 ],
       [-19.895098],
       [-19.309942],
       [-15.376902]], dtype=float32)



In [42]:

    
# Line plot of the 1-D embedding: with a single array argument, the x-axis is the row index.
# NOTE(review): this cell's execution count (42) is lower than that of the fit_transform
# cell (43), so the saved figure may come from an earlier run — re-run to confirm.
plt.figure(figsize=(10,10))
plt.plot(X_transformed)









    Out[42]:





[<matplotlib.lines.Line2D at 0x1e7314f34a8>]

Maybe we get better results on data that contains actual clusters.



In [45]:

    
# Import from the public `sklearn.datasets` namespace: the private
# `sklearn.datasets.samples_generator` module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old import path fails on current releases.
from sklearn.datasets import make_blobs

# Three blob centres in 2-D; 750 samples in total, with a fixed seed for repeatable data.
centers = [[1, 1], [-1, -1], [1, -1]]
# NOTE: this rebinds X, replacing the 8-point toy array used by the cells above.
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=42)



In [46]:

    
# Echo the generated samples (NumPy abbreviates the 750-row array in the output).
X









    Out[46]:





array([[ 0.71263709, -0.64203025],
       [ 0.86291419,  0.67908909],
       [-0.37379039, -1.0263001 ],
       ...,
       [ 1.45712913,  1.30077321],
       [ 1.86127298,  0.69306097],
       [ 1.27278119,  0.8758933 ]])



In [47]:

    
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

def plot_dataset(X, y_pred=None, fname=None):
    """Scatter-plot the first two columns of ``X``, coloured by label.

    The data is standardised with ``StandardScaler`` before plotting and the
    axes are fixed to [-2.5, 2.5], so points whose standardised coordinates
    fall outside that window are not visible.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to plot. Only columns 0 and 1 are drawn, even when more exist.
    y_pred : sequence of int, optional
        Labels used to index the colour palette, one per sample. The palette
        has nine colours and repeats for larger label values; a label of -1
        selects the trailing black entry (noise). When omitted, every point
        is drawn in the first palette colour.
    fname : str, optional
        When given (and truthy), the figure is also saved to this path.

    Returns
    -------
    None
        The figure is created through pyplot as a side effect.
    """
    # A None sentinel replaces the former mutable list default. The fallback
    # label is unchanged, so plot_dataset(X) still uses a single colour.
    if y_pred is None:
        y_pred = [0]

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    palette = ['#377eb8', '#ff7f00', '#4daf4a',
               '#f781bf', '#a65628', '#984ea3',
               '#999999', '#e41a1c', '#dede00']
    # One colour per label value 0..max(y_pred), cycling through the palette.
    n_labels = int(max(y_pred) + 1)
    # last color is black to properly display label -1 as noise (black)
    colors = np.append(np.array(list(islice(cycle(palette), n_labels))), ['#000000'])

    plt.figure(figsize=(10, 10))

    plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

    # Fixed window and hidden ticks keep successive plots visually comparable.
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())
    if fname:
        plt.savefig(fname)



In [69]:

    
# Plot the raw blobs without labels: plot_dataset falls back to its default y_pred,
# so all points share one colour.
plot_dataset(X)



In [67]:

    
# UMAP?



In [63]:

    
# Baseline on the blobs: 2-component UMAP with otherwise default settings.
# NOTE(review): no random_state is passed, so the embedding can differ between runs.
umap_clf = umap.UMAP(n_components=2)
X_transformed = umap_clf.fit_transform(X)
# Colour by the ground-truth blob labels returned by make_blobs.
plot_dataset(X_transformed, labels_true)



In [70]:

    
# 2-component UMAP on the blobs with n_neighbors set to 50 (the repr earlier in the
# notebook shows a default of 15).
umap_clf = umap.UMAP(n_components=2, n_neighbors=50)
X_transformed = umap_clf.fit_transform(X)
# Colour by the ground-truth blob labels returned by make_blobs.
plot_dataset(X_transformed, labels_true)



In [68]:

    
# UMAP on the blobs with 10 output components and a small neighbourhood (n_neighbors=5).
# The input X has only 2 columns, so this embeds into more dimensions than the data has.
umap_clf = umap.UMAP(n_components=10, n_neighbors=5)
X_transformed = umap_clf.fit_transform(X)
# NOTE(review): plot_dataset draws only columns 0 and 1, so 8 of the 10 components
# are not shown in this figure.
plot_dataset(X_transformed, labels_true)



In [57]:

    
# NOTE(review): cells earlier in the notebook already reference `umap`; imports belong
# in the first cell so the notebook runs top-to-bottom on a fresh kernel.
import umap
from sklearn.datasets import load_digits

# Load scikit-learn's bundled digits dataset.
digits = load_digits()

# Embed the digits feature matrix with a default-configured UMAP (no random_state set).
embedding = umap.UMAP().fit_transform(digits.data)



In [58]:

    
# Echo the two-column digits embedding (abbreviated by NumPy in the output).
embedding









    Out[58]:





array([[17.183512 , -1.0178467],
       [-5.648297 ,  4.8143783],
       [-2.262863 ,  2.4185073],
       ...,
       [-3.9919388,  2.6743715],
       [-4.5486097, -2.592968 ],
       [-3.7394724,  2.1295016]], dtype=float32)



In [59]:

    
# Plot the digits embedding without labels, so every point uses the single default colour.
# NOTE(review): passing the digit classes (digits.target) as y_pred would colour by class.
plot_dataset(embedding)



In [ ]: