In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
from time import time
import numpy as np
np.random.seed(28)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn.metrics.pairwise import linear_kernel
from sklearn.manifold import Isomap
from sklearn.decomposition import KernelPCA, PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_mldata, make_s_curve
from simec_torch import SimilarityEncoder
from utils import center_K, check_embed_match, check_similarity_match
from utils_plotting import plot_mnist
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
# load digits
mnist = fetch_mldata('MNIST original', data_home='data')
X = mnist.data/255. # normalize to 0-1
y = np.array(mnist.target, dtype=int)
# subsample 10000 random data points
np.random.seed(42)
n_samples = 10000
n_test = 2000
rnd_idx = np.random.permutation(X.shape[0])[:n_samples]
X_test, y_test = X[rnd_idx[:n_test],:], y[rnd_idx[:n_test]]
X, y = X[rnd_idx[n_test:],:], y[rnd_idx[n_test:]]
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape
In [3]:
# centered linear kernel matrix
K_lin = center_K(np.dot(X, X.T))
In [4]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - linear Kernel PCA')
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))
In [5]:
# on how many target similarities you want to train - faster and works equally well than training on all
n_targets = 1000 # K_lin.shape[1]
# initialize the model
simec = SimilarityEncoder(X.shape[1], 2, n_targets)
# train the model to get an embedding with which the target similarities
# can be linearly approximated
simec.fit(X, K_lin[:,:n_targets], epochs=25, lr=0.001, s_ll_reg=500., S_ll=K_lin[:n_targets,:n_targets])
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, linear)')
# correlation with the embedding produced by the spectral method should be high
print("correlation with lin kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# similarity match error should be similar to the one from kpca
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(simec.predict(X), K_lin[:,:n_targets], X_embed_is_S_approx=True))
In [6]:
# isomap
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
X_embed_test = isomap.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - isomap')
In [7]:
# non-linear SimEc to approximate isomap solution
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
n_targets = 1000
# initialize the model
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(20, 'tanh')])
# train the model to get an embedding with which the target similarities
# can be linearly approximated
simec.fit(X, K_geod[:,:n_targets], epochs=25, lr=0.01, s_ll_reg=500., S_ll=K_geod[:n_targets,:n_targets])
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 1 h.l.)')
print("correlation with isomap : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_geod))
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(simec.predict(X), K_geod[:,:n_targets], X_embed_is_S_approx=True))
In [8]:
n_points = 3000
X, color = make_s_curve(n_points, random_state=0)
n_components = 2
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral);
In [9]:
fig = plt.figure(figsize=(15, 8))
# PCA
t0 = time()
Y = PCA(n_components=2).fit_transform(X)
t1 = time()
print("PCA: %.2g sec" % (t1 - t0))
ax = fig.add_subplot(221)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("PCA (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')
# Isomap
t0 = time()
isomap = Isomap(n_components=2, n_neighbors=10).fit(X)
Y = isomap.transform(X)
t1 = time()
print("Isomap: %.2g sec" % (t1 - t0))
ax = fig.add_subplot(222)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("Isomap (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')
# simec with linear kernel --> global structure, like linear kpca
n_targets = 1000
K_lin = center_K(linear_kernel(X))
t0 = time()
simec = SimilarityEncoder(X.shape[1], 2, n_targets)
simec.fit(X, K_lin[:,:n_targets], lr=0.1, s_ll_reg=500., S_ll=K_lin[:n_targets,:n_targets])
Y = simec.transform(X)
t1 = time()
print("linear SimEc: %.2g sec" % (t1 - t0))
ax = fig.add_subplot(223)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("linear SimEc (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')
# simec with geodesic distance (from isomap) --> local manifold
# since the geodesic distance is not trivially derived from the original data
# like the linear kernel, we need to add non-linear hidden layers to get a
# good embedding similar to the original isomap embedding
G = center_K(-0.5*(isomap.dist_matrix_**2))
t0 = time()
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(25, "tanh"), (10, "tanh")])
simec.fit(X, G[:,:n_targets], lr=0.01, s_ll_reg=500., S_ll=G[:n_targets,:n_targets])
Y = simec.transform(X)
t1 = time()
print("isomap SimEc: %.2g sec" % (t1 - t0))
ax = fig.add_subplot(224)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("isomap SimEc (2hl; %.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight');
In [ ]: