In [8]:
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns; sns.set() 
import numpy as np

In [9]:
from matplotlib.image import imread 

def make_hello(N=1000, rseed=42):
    # Make a plot with "HELLO" text; save as PNG
    fig, ax = plt.subplots(figsize=(4, 1))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
    ax.axis('off')
    ax.text(0.5, 0.4, 'HELLO', va='center', ha='center', weight='bold', size=85)
    fig.savefig('hello.png')
    plt.close(fig)
    # Open this PNG and draw random points from it
    data = imread('hello.png')[::-1, :, 0].T
    rng = np.random.RandomState(rseed)
    X = rng.rand(4 * N, 2)
    i, j = (X * data.shape).astype(int).T 
    mask = (data[i, j] < 1)
    X = X[mask]
    X[:, 0] *= (data.shape[0] / data.shape[1]) 
    X = X[:N]
    return X[np.argsort(X[:, 0])]

In [10]:
X = make_hello(1000)
colorize = dict(c=X[:, 0], cmap=plt.cm.get_cmap('rainbow', 5))
plt.scatter(X[:, 0], X[:, 1], **colorize)
plt.axis('equal');



In [11]:
def rotate(X, angle):
    theta = np.deg2rad(angle)
    R = [[np.cos(theta), np.sin(theta)],
         [-np.sin(theta), np.cos(theta)]]
    return np.dot(X, R)


X2 = rotate(X, 20) + 5
plt.scatter(X2[:, 0], X2[:, 1], **colorize)
plt.axis('equal');



In [12]:
from sklearn.metrics import pairwise_distances 
D = pairwise_distances(X)
D.shape


Out[12]:
(1000, 1000)
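
As a quick sanity check (a minimal sketch using only numpy calls), a pairwise-distance matrix should be symmetric with zeros on the diagonal:

In [ ]:
# Distance matrices are symmetric, with zero self-distances
print(np.allclose(D, D.T), np.allclose(np.diag(D), 0))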

In [13]:
plt.imshow(D, cmap='Blues', interpolation='nearest')
plt.grid(False)
plt.colorbar();



In [14]:
D2 = pairwise_distances(X2)
np.allclose(D, D2)


Out[14]:
True

In [15]:
# Multidimensional scaling (MDS): given a distance matrix between points,
# recover a D-dimensional coordinate representation of the data
from sklearn.manifold import MDS
# dissimilarity='precomputed' tells the model we are passing a distance matrix
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1) 
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal');



In [16]:
def random_projection(X, dimension=3, rseed=42):
    assert dimension >= X.shape[1]
    rng = np.random.RandomState(rseed)
    # Eigenvectors of a random symmetric matrix form a random orthogonal matrix V
    C = rng.randn(dimension, dimension)
    e, V = np.linalg.eigh(np.dot(C, C.T))
    # The first X.shape[1] rows of V are orthonormal: use them to embed X
    return np.dot(X, V[:X.shape[1]])


X3 = random_projection(X, 3)
X3.shape


Out[16]:
(1000, 3)
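
Because the rows of V are orthonormal, this random linear embedding is an isometry: it should leave all pairwise distances unchanged. A quick check against the D computed earlier:

In [ ]:
# The 2D -> 3D linear embedding should preserve pairwise distances exactly
# (up to floating-point error), so this should print True
print(np.allclose(pairwise_distances(X3), D))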

In [17]:
from mpl_toolkits import mplot3d 
ax = plt.axes(projection='3d')
ax.scatter3D(X3[:, 0], X3[:, 1], X3[:, 2], **colorize)
ax.view_init(azim=70, elev=50)



In [18]:
model = MDS(n_components=2, random_state=1)
out3 = model.fit_transform(X3)
plt.scatter(out3[:, 0], out3[:, 1], **colorize)
plt.axis('equal');


This is essentially the goal of a manifold learning estimator: given
high-dimensional embedded data, it seeks a low-dimensional representation of
the data that preserves certain relationships within the data.

In the case of MDS, the quantity preserved is the distance between every pair
of points. Where MDS breaks down is when the embedding is nonlinear.
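
We can make the "distances preserved" claim concrete with a quick check (using the arrays defined above; metric MDS minimizes this discrepancy iteratively, so expect it to be small rather than exactly zero):

In [ ]:
# Largest discrepancy between the 3D pairwise distances and those of the
# 2D MDS embedding; should be small if the distances were preserved
print(np.abs(pairwise_distances(out3) - pairwise_distances(X3)).max())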


In [19]:
def make_hello_s_curve(X):
    # Bend the 2D data into an "S" shape along the x-axis, leaving y unchanged
    t = (X[:, 0] - 2) * 0.75 * np.pi
    x = np.sin(t)
    y = X[:, 1]
    z = np.sign(t) * (np.cos(t) - 1)
    return np.vstack((x, y, z)).T


XS = make_hello_s_curve(X)
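
This warp is nonlinear: points that are far apart along the sheet can end up close together in three dimensions. A quick check confirms that, unlike the rotation and random projection above, it does not preserve pairwise distances:

In [ ]:
# The nonlinear "S" warp changes the pairwise distances,
# so this should now print False
print(np.allclose(pairwise_distances(XS), D))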

In [20]:
ax = plt.axes(projection='3d')
ax.scatter3D(XS[:, 0], XS[:, 1], XS[:, 2], **colorize);



In [21]:
model = MDS(n_components=2, random_state=2) 
outS = model.fit_transform(XS) 
plt.scatter(outS[:, 0], outS[:, 1], **colorize) 
plt.axis('equal');



In [22]:
from sklearn.manifold import LocallyLinearEmbedding
# 'modified' LLE tends to recover well-defined embeddings with less
# distortion than the standard algorithm
model = LocallyLinearEmbedding(n_neighbors=100, n_components=2, method='modified',
                               eigen_solver='dense')
out = model.fit_transform(XS)
fig, ax = plt.subplots()
ax.scatter(out[:, 0], out[:, 1], **colorize)
ax.set_ylim(0.15, -0.15);
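
For comparison, here is a minimal sketch of the same unrolling with Isomap (which we will apply to real data below); the choice of n_neighbors=10 is arbitrary:

In [ ]:
from sklearn.manifold import Isomap
# Isomap approximately preserves geodesic (along-the-manifold) distances,
# so it should also recover the flattened "HELLO"
out_iso = Isomap(n_neighbors=10, n_components=2).fit_transform(XS)
plt.scatter(out_iso[:, 0], out_iso[:, 1], **colorize)
plt.axis('equal');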



In [23]:
from sklearn.datasets import fetch_lfw_people 
faces = fetch_lfw_people(min_faces_per_person=30) 
faces.data.shape


Out[23]:
(2370, 2914)
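
Each of the 2,914 features is one pixel value: assuming the default resize factor in fetch_lfw_people, the images are 62x47 pixels, which we can confirm directly:

In [ ]:
# faces.images holds the unflattened images; 62 * 47 == 2914
print(faces.images.shape, 62 * 47)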

In [25]:
fig, ax = plt.subplots(4, 8, subplot_kw=dict(xticks=[], yticks=[])) 
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='gray')



In [26]:
from sklearn.decomposition import PCA
pca = PCA(100).fit(faces.data)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('n components')
plt.ylabel('cumulative variance');


We see that nearly 100 components are required to preserve 90% of the
variance: viewed linearly, the data is intrinsically very high dimensional
and cannot be described well with only a few components.

In [27]:
from sklearn.manifold import Isomap 
model = Isomap(n_components=2)
proj = model.fit_transform(faces.data)
proj.shape


Out[27]:
(2370, 2)

In [52]:
from matplotlib import offsetbox
def plot_components(data, model, images=None, ax=None, thumb_frac=0.05, cmap='gray'):
    """Fit the model, plot the 2D projection, and overlay image thumbnails."""
    ax = ax or plt.gca()
    proj = model.fit_transform(data)
    ax.plot(proj[:, 0], proj[:, 1], '.k')
    if images is not None:
        min_dist_2 = (thumb_frac * max(proj.max(0) - proj.min(0))) ** 2
        shown_images = np.array([2 * proj.max(0)])  # dummy point, far from the data
        for i in range(data.shape[0]):
            dist = np.sum((proj[i] - shown_images) ** 2, 1) 
            if np.min(dist) < min_dist_2:
                # don't show points that are too close
                continue
            shown_images = np.vstack([shown_images, proj[i]])
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(images[i], cmap=cmap), proj[i])
            ax.add_artist(imagebox)

In [53]:
fig, ax = plt.subplots(figsize=(10, 10))
plot_components(faces.data, model=Isomap(n_components=2),
                images=faces.images[:, ::2, ::2])



In [30]:
# fetch_mldata has been removed from scikit-learn; fetch_openml is the
# modern replacement:
#     from sklearn.datasets import fetch_openml
#     mnist = fetch_openml('mnist_784')
# Here we instead load a local copy of the original MNIST .mat file.
import scipy.io

mnist = scipy.io.loadmat('mnist-original.mat')
mnist['data'] = mnist['data'].T
mnist['data'].shape


Out[30]:
(70000, 784)

In [31]:
fig, ax = plt.subplots(6, 8, subplot_kw=dict(xticks=[], yticks=[])) 
for i, axi in enumerate(ax.flat):
    axi.imshow(mnist['data'][1250 * i].reshape(28, 28), cmap='gray_r')



In [32]:
# Use only 1/30 of the data: the full dataset takes a long time!
data = mnist['data'][::30]
target = mnist['label'].T[::30].ravel()  # flatten to 1D for use as point colors
model = Isomap(n_components=2)
proj = model.fit_transform(data)
plt.scatter(proj[:, 0], proj[:, 1], c=target, cmap=plt.cm.get_cmap('jet', 10))
plt.colorbar(ticks=range(10))
plt.clim(-0.5, 9.5);



In [46]:
# Choose 1/4 of the "1" digits to project.
# mnist['label'] has shape (1, 70000) -- see the check below -- so the boolean
# mask must be flattened before it can index the rows of mnist['data'].
data = mnist['data'][(mnist['label'] == 1).flatten()][::4, :]
fig, ax = plt.subplots(figsize=(10, 10))
model = Isomap(n_neighbors=5, n_components=2, eigen_solver='dense')
plot_components(data, model, images=data.reshape((-1, 28, 28)),
                ax=ax, thumb_frac=0.05, cmap='gray_r')



In [49]:
(mnist['label'] == 1).shape


Out[49]:
(1, 70000)
