This IPython notebook contains several examples that illustrate the potential of Similarity Encoders (SimEc) for creating similarity preserving embeddings. For further details and the theoretical background on this new neural network architecture, please refer to the corresponding paper.
The notebook is structured as follows: After a short toy example, we present several experiments performed on the sklearn handwritten digits dataset (a small real world dataset which works great to quickly test some things out). Then we present the results on the MNIST and 20 newsgroups datasets reported in the original paper.
In [1]:
# --- imports & global setup for all experiments below ---
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
# fix the numpy seed so dataset generation / subsampling is reproducible
np.random.seed(28)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.manifold import Isomap
from sklearn.decomposition import PCA, KernelPCA
from sklearn.random_projection import SparseRandomProjection
from sklearn.preprocessing import StandardScaler
# NOTE(review): fetch_mldata was removed in scikit-learn 0.22 (mldata.org is
# offline); with a recent sklearn use fetch_openml('mnist_784') instead
from sklearn.datasets import load_digits, fetch_mldata, fetch_20newsgroups
import tensorflow as tf
# TF 1.x API for seeding the graph-level RNG (tf.random.set_seed in TF 2.x)
tf.set_random_seed(28)
import keras
# find nlputils at https://github.com/cod3licious/nlputils
from nlputils.features import FeatureTransform, features2mat
from simec import SimilarityEncoder
from utils import center_K, check_embed_match, check_similarity_match
from utils_datasets import load_dataset
from utils_plotting import get_colors, plot2d, plot3d, plot_digits, plot_mnist, plot_20news
%matplotlib inline
%load_ext autoreload
%autoreload 2
# set this to True if you want to save the figures from the paper
savefigs = False
In [2]:
# --- toy example: '3 circles' dataset embedded in 3D ---
n_train = 1000
n_test = 500
dataset = '3_circles'
# get training and test data
X, Y_plot = load_dataset(dataset, n_train, 3)
X_test, Y_plot_test = load_dataset(dataset, n_test, 5)
# center the features (no variance scaling) so np.dot(X, X.T) is a proper
# (centered) linear kernel matrix
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
# plot the original data
plot3d(X, Y_plot, X_test, Y_plot_test, title='%s dataset' % dataset.replace('_', ' '))
In [3]:
# simple 2D PCA solution
pca = PCA(n_components=2)
X_embed = pca.fit_transform(X)
X_embed_test = pca.transform(X_test)
plot2d(X_embed, Y_plot, X_embed_test, Y_plot_test,
title='%s embedded with PCA' % dataset.replace('_', ' '))
# how well do scalar products in the embedding match the centered linear kernel?
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, center_K(np.dot(X, X.T))))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_test, center_K(np.dot(X_test, X_test.T))))
In [4]:
# similarity encoder no activation function, linear similarities as targets
K_lin = center_K(np.dot(X, X.T))
# here all n_train similarities are used as targets (K_lin.shape[1])
simec = SimilarityEncoder(X.shape[1], 2, K_lin.shape[1], l2_reg_emb=0.01, s_ll_reg=0.5, S_ll=K_lin, opt=keras.optimizers.Adamax(lr=0.01))
simec.fit(X, K_lin)
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot2d(X_embeds, Y_plot, X_embed_tests, Y_plot_test,
title='%s embedded with a linear similarity encoder' % dataset.replace('_', ' '))
# a linear SimEc should recover the PCA solution up to rotation/scaling,
# hence check the correlation of the two embeddings
print("correlation with PCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with PCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, center_K(np.dot(X_test, X_test.T))))
In [5]:
# load digits dataset
digits = load_digits()
X = digits.data
X /= float(X.max())  # scale pixel values to [0, 1]
# center the features (mean 0, original scale)
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
y = digits.target
n_samples, n_features = X.shape
# number of columns of the kernel matrix used as SimEc targets
n_targets = 1000
In [6]:
# PCA
pca = PCA(n_components=2)
X_embedp = pca.fit_transform(X)
plot_digits(X_embedp, digits, title='Digits embedded with PCA')
In [7]:
# linear kPCA - same as regular PCA...;)
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
plot_digits(X_embed, digits, title='Digits embedded with linear Kernel PCA')
print("correlation of PCA and linear kPCA: %f" % check_embed_match(X_embed, X_embedp)[1])
K_lin = center_K(np.dot(X, X.T))
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))
In [8]:
# linear similarity encoder
simec = SimilarityEncoder(X.shape[1], 2, n_targets, l2_reg_emb=0.0001, l2_reg_out=0.0000001,
s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets], opt=keras.optimizers.Adamax(lr=0.01))
simec.fit(X, K_lin[:,:n_targets], epochs=15)
X_embeds = simec.transform(X)
plot_digits(X_embeds, digits, title='Digits embedded with a linear similarity encoder')
print("correlation with linear kPCA: %f" % check_embed_match(X_embed, X_embeds)[1])
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
In [9]:
# check how many relevant dimensions there are - obviously at most # feature dim, not # of data points
# eigvalsh returns ascending eigenvalues -> reverse for a descending spectrum
eigenvals = np.linalg.eigvalsh(K_lin)[::-1]
plt.figure();
plt.plot(list(range(1, K_lin.shape[0]+1)), eigenvals, '-o', markersize=3);
plt.plot([1, K_lin.shape[0]],[0,0], 'k--', linewidth=0.5);
plt.xlim(1, X.shape[1]+1);
plt.title('Eigenvalue Spectrum of the linear Kernel');
In [10]:
# compare kPCA and SimEc similarity approximation for a range of embedding dims
# NOTE(review): the notebook export stripped the cell indentation -- the lines
# between 'for e_dim in e_dims:' and the first 'plt.figure()' belong inside the loop
mse_kpca, rsq_kpca = [], []
mse_simec, rsq_simec = [], []
e_dims = [2, 4, 6, 10, 15, 25, 35, X.shape[1]]
for e_dim in e_dims:
print(e_dim)
kpca = KernelPCA(n_components=e_dim, kernel='linear')
X_embed = kpca.fit_transform(X)
mse_k, rsq, _ = check_similarity_match(X_embed, K_lin)
mse_kpca.append(mse_k)
rsq_kpca.append(rsq)
simec = SimilarityEncoder(X.shape[1], e_dim, n_targets, l2_reg_emb=0.0001, l2_reg_out=0.0000001,
s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.01))
simec.fit(X, K_lin[:,:n_targets])
X_embeds = simec.transform(X)
mse, rsq, _ = check_similarity_match(X_embeds, K_lin)
mse_simec.append(mse)
rsq_simec.append(rsq)
print("mse kpca: %f; mse simec: %f" % (mse_k, mse))
print("correlation with linear kPCA: %f" % check_embed_match(X_embed, X_embeds)[1])
# plot mean squared error and R^2 of the similarity approximation vs. e_dim
plt.figure();
plt.plot(e_dims, mse_kpca, '-o', markersize=3, label='kPCA');
plt.plot(e_dims, mse_simec, '-o', markersize=3, label='SimEc');
plt.legend(loc=0);
plt.title('Mean Squared Error');
plt.plot([0, e_dims[-1]], [0,0], 'k--', linewidth=0.5);
plt.xticks(e_dims, e_dims);
plt.figure();
plt.plot(e_dims, rsq_kpca, '-o', markersize=3, label='kPCA');
plt.plot(e_dims, rsq_simec, '-o', markersize=3, label='SimEc');
plt.plot([0, e_dims[-1]], [1,1], 'k--', linewidth=0.5);
plt.legend(loc=0);
plt.title('$R^2$');
plt.xticks(e_dims, e_dims);
In [11]:
# Gaussian kernel PCA
D = squareform(pdist(X, 'euclidean'))
# median heuristic for the RBF kernel width
gamma = 1./(np.median(D)**2)
K_rbf = center_K(rbf_kernel(X, X, gamma))
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
X_embed = kpca.fit_transform(X)
plot_digits(X_embed, digits, title='Digits embedded with Gaussian Kernel PCA')
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_rbf))
In [12]:
# non-linear SimEc with rbf kernel
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(100, 'tanh')], l2_reg=0.00000001,
l2_reg_emb=0.0001, l2_reg_out=0.0000001, s_ll_reg=0.5, S_ll=K_rbf[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.01))
simec.fit(X, K_rbf[:,:n_targets], epochs=15)
X_embeds = simec.transform(X)
plot_digits(X_embeds, digits, title='Digits - SimEc (rbf kernel, 1 h.l.)')
print("correlation with Gaussian kPCA: %f" % check_embed_match(X_embed, X_embeds)[1])
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_rbf))
In [13]:
# isomap
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
plot_digits(X_embed, digits, title='Digits embedded with isomap')
# turn geodesic distances into (centered) similarities, as in classical MDS
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_geod))
In [14]:
# non-linear SimEc based on isomap
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(100, 'tanh')], s_ll_reg=10.,
S_ll=K_geod[:n_targets,:n_targets], opt=keras.optimizers.Adamax(lr=0.01))
simec.fit(X, K_geod[:,:n_targets])
X_embeds = simec.transform(X)
plot_digits(X_embeds, digits, title='Digits - SimEc (geodesic, 1 h.l.)')
print("correlation with isomap: %f" % check_embed_match(X_embed, X_embeds)[1])
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_geod))
In [15]:
# non-linear SimEc based on isomap - 2 hidden layers
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(200, 'tanh'), (100, 'tanh')],
s_ll_reg=10., S_ll=K_geod[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.01))
simec.fit(X, K_geod[:,:n_targets])
X_embeds = simec.transform(X)
plot_digits(X_embeds, digits, title='Digits - SimEc (geodesic, 2 h.l.)')
print("correlation with isomap: %f" % check_embed_match(X_embed, X_embeds)[1])
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_geod))
In [16]:
# load the MNIST digits (70000 28x28 grayscale images as flat 784-dim vectors)
# BUGFIX: sklearn.datasets.fetch_mldata was removed in scikit-learn 0.22 and
# its backend mldata.org is permanently offline, so this cell could no longer
# run at all; fetch_openml('mnist_784') serves the identical dataset.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, data_home='data', as_frame=False)
X = mnist.data/255.  # normalize pixel values to [0, 1]
# openml delivers the labels as strings -> cast to int
y = np.array(mnist.target, dtype=int)
# subsample 10000 random data points
np.random.seed(42)
n_samples = 10000
n_test = 2000
n_targets = 1000
rnd_idx = np.random.permutation(X.shape[0])[:n_samples]
# first n_test shuffled indices form the test set, the remaining 8000 the training set
X_test, y_test = X[rnd_idx[:n_test],:], y[rnd_idx[:n_test]]
X, y = X[rnd_idx[n_test:],:], y[rnd_idx[n_test:]]
# center the features (mean 0, original scale); fit on train, apply to test
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape
# free the TF graph memory of the previous experiments
keras.backend.clear_session()
In [17]:
# linear kPCA
# centered linear kernel matrices for train and test
K_lin = center_K(np.dot(X, X.T))
K_lin_test = center_K(np.dot(X_test, X_test.T))
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - linear Kernel PCA')
if savefigs: plt.savefig('fig_spectral_mnist_lin_kpca.png', dpi=300)
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_test, K_lin_test))
In [18]:
# linear similarity encoder
simec = SimilarityEncoder(X.shape[1], 2, n_targets, l2_reg_emb=0.001, l2_reg_out=0.00001,
s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.005))
simec.fit(X, K_lin[:,:n_targets])
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, 0 h.l.)')
if savefigs: plt.savefig('fig_spectral_mnist_lin_simec.png', dpi=300)
print("correlation with lin kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, K_lin_test))
In [19]:
# check how many relevant dimensions there are - obviously at most # feature dim, not # of data points
# eigvalsh returns ascending eigenvalues -> reverse for a descending spectrum
eigenvals = np.linalg.eigvalsh(K_lin)[::-1]
plt.figure();
plt.plot(list(range(1, K_lin.shape[0]+1)), eigenvals, '-o', markersize=3);
plt.plot([1, K_lin.shape[0]],[0,0], 'k--', linewidth=0.5);
plt.xlim(1, 100);
plt.title('Eigenvalue Spectrum of the linear Kernel');
In [20]:
# MNIST (linear kernel): similarity approximation error vs. embedding dimension
# for sparse random projections, kPCA, and SimEc
# NOTE(review): the notebook export stripped the cell indentation -- the lines
# between 'for e_dim in e_dims:' and 'colors = get_colors(15)' belong inside the loop
mse_rp, mse_rp_test, rsq_rp = [], [], []
mse_kpca, mse_kpca_test, rsq_kpca = [], [], []
mse_simec, mse_simec_test, rsq_simec = [], [], []
e_dims = [2, 4, 6, 10, 15, 25, 50, 100]
for e_dim in e_dims:
print(e_dim)
# random projections
# 10x as many projection dims as the other methods (see plot labels)
rp = SparseRandomProjection(n_components=10*e_dim, random_state=42)
X_embed = rp.fit_transform(X)
X_embed_test = rp.transform(X_test)
mse_r, rsq, _ = check_similarity_match(X_embed, K_lin)
mse_rp.append(mse_r)
rsq_rp.append(rsq)
mse_rt, _, _ = check_similarity_match(X_embed_test, K_lin_test)
mse_rp_test.append(mse_rt)
# kpca
kpca = KernelPCA(n_components=e_dim, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k, rsq, _ = check_similarity_match(X_embed, K_lin)
mse_kpca.append(mse_k)
rsq_kpca.append(rsq)
mse_kt, _, _ = check_similarity_match(X_embed_test, K_lin_test)
mse_kpca_test.append(mse_kt)
# simec
# stronger embedding regularization for the largest embedding dim
l = 0.002 if e_dim == 100 else 0.001
simec = SimilarityEncoder(X.shape[1], e_dim, n_targets, s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets],
l2_reg_emb=l, l2_reg_out=0.00001, opt=keras.optimizers.Adamax(lr=0.003))
simec.fit(X, K_lin[:,:n_targets])
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
mse, rsq, _ = check_similarity_match(X_embeds, K_lin)
mse_simec.append(mse)
rsq_simec.append(rsq)
mse_t, _, _ = check_similarity_match(X_embed_tests, K_lin_test)
mse_simec_test.append(mse_t)
print("mse rp: %f (%f); mse kpca: %f (%f); mse simec: %f (%f)" % (mse_r, mse_rt, mse_k, mse_kt, mse, mse_t))
print("correlation with linear kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with linear kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
keras.backend.clear_session()
colors = get_colors(15)
plt.figure();
plt.plot(e_dims, mse_kpca, '-o', markersize=3, c=colors[0], label='kPCA');
plt.plot(e_dims, mse_kpca_test, '-o', markersize=3, c=colors[2], label='kPCA (test)');
plt.plot(e_dims, mse_simec, '-o', markersize=3, c=colors[6], label='SimEc');
plt.plot(e_dims, mse_simec_test, '-o', markersize=3, c=colors[8], label='SimEc (test)');
plt.plot(e_dims, mse_rp, '-o', markersize=3, c=colors[12], label='Sparse Random Proj. $\\times$ 10');
plt.plot(e_dims, mse_rp_test, '-o', markersize=3, c=colors[14], label='SPR $\\times$ 10 (test)');
plt.legend(loc=0);
plt.title('MNIST (linear kernel)');
plt.plot([0, e_dims[-1]], [0,0], 'k--', linewidth=0.5);
plt.xticks(e_dims, e_dims);
plt.xlabel('Number of Embedding Dimensions ($d$)')
# NOTE(review): '\h' in '$\hat{S}$' is an invalid string escape (it only works
# because python leaves unknown escapes alone); prefer raw strings r'$\hat{S}$'
plt.ylabel('Mean Squared Error of $\hat{S}$')
plt.figure();
colors = get_colors(10)
plt.plot(e_dims, mse_kpca, '-o', markersize=3, c=colors[4], label='kPCA');
plt.plot(e_dims, mse_kpca_test, '-o', markersize=3, c=colors[2], label='kPCA (test)');
plt.plot(e_dims, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(e_dims, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('MNIST (linear kernel)');
plt.plot([0, e_dims[-1]], [0,0], 'k--', linewidth=0.5);
plt.xticks(e_dims, e_dims);
plt.xlabel('Number of Embedding Dimensions ($d$)')
plt.ylabel('Mean Squared Error of $\hat{S}$')
#plt.ylabel(r'$\frac{1}{N^2} \sum ( S-\hat{S} )^2$')
# dump the raw numbers so the figures can be reproduced without rerunning
print("e_dims=", e_dims)
print("mse_kpca=", mse_kpca)
print("mse_kpca_test=", mse_kpca_test)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_mnist_lin_mse_edim.pdf', dpi=300)
plt.figure();
plt.plot(e_dims, rsq_kpca, '-o', markersize=3, label='kPCA');
plt.plot(e_dims, rsq_simec, '-o', markersize=3, label='SimEc');
plt.plot(e_dims, rsq_rp, '-o', markersize=3, label='SPR $\\times$ 10');
plt.plot([0, e_dims[-1]], [1,1], 'k--', linewidth=0.5);
plt.legend(loc=0);
plt.title('$R^2$');
plt.xticks(e_dims, e_dims);
In [21]:
# check effect of different number of targets
# NOTE(review): export stripped cell indentation -- lines between 'for n in targets:'
# and 'colors = get_colors(10)' belong inside the loop
mse_simec, mse_simec_test = [], []
targets = [100, 250, 500, 750, 1000, 1500, 2500, 5000]
# 10-dim kPCA as a fixed baseline
kpca = KernelPCA(n_components=10, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k = check_similarity_match(X_embed, K_lin)[0]
mse_kt = check_similarity_match(X_embed_test, K_lin_test)[0]
for n in targets:
print(n)
simec = SimilarityEncoder(X.shape[1], 10, n, s_ll_reg=0.5, S_ll=K_lin[:n,:n], l2_reg_emb=0.001,
l2_reg_out=0.00001, opt=keras.optimizers.Adamax(lr=0.003))
simec.fit(X, K_lin[:,:n])
X_embed = simec.transform(X)
X_embed_test = simec.transform(X_test)
# evaluate against the full kernel, not just the n target columns
mse = check_similarity_match(X_embed, K_lin)[0]
mse_simec.append(mse)
mse_t = check_similarity_match(X_embed_test, K_lin_test)[0]
mse_simec_test.append(mse_t)
print("mse kpca: %f (%f); mse simec: %f (%f)" % (mse_k, mse_kt, mse, mse_t))
keras.backend.clear_session()
colors = get_colors(10)
plt.figure();
plt.plot([0, targets[-1]], [mse_k, mse_k], '--', linewidth=0.5, c=colors[8], label='kPCA');
plt.plot([0, targets[-1]], [mse_kt, mse_kt], '--', linewidth=0.5, c=colors[6], label='kPCA (test)');
plt.plot(targets, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(targets, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('MNIST (linear kernel)');
plt.xticks([100, 500, 1000, 1500, 2500, 5000], [100, 500, 1000, 1500, 2500, 5000]);
plt.xlabel('Number of Targets ($n$)')
plt.ylabel('Mean Squared Error of $\hat{S}$')
print("targets=", targets)
print("mse_k=", mse_k)
print("mse_kt=", mse_kt)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_mnist_lin_mse_ntargets.pdf', dpi=300)
In [22]:
# missing targets
# robustness check: randomly mask a fraction of the target similarities with
# the sentinel value -100; SimEc's mask_value makes it ignore those entries
# NOTE(review): export stripped cell indentation -- lines between 'for m in ...:'
# and 'colors = get_colors(10)' belong inside the loop
n_targets = 1000
np.random.seed(15)
mse_simec, mse_simec_test = [], []
kpca = KernelPCA(n_components=10, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k = check_similarity_match(X_embed, K_lin)[0]
mse_kt = check_similarity_match(X_embed_test, K_lin_test)[0]
missing_targets = [0., 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for m in missing_targets:
print(m)
K_lin_noisy = K_lin.copy()
K_lin_noisy[np.random.rand(*K_lin_noisy.shape)<=m] = -100
simec = SimilarityEncoder(X.shape[1], 10, n_targets, mask_value=-100, s_ll_reg=0.5,
S_ll=K_lin_noisy[:n_targets,:n_targets], l2_reg_emb=0.01,
l2_reg_out=0.00001, opt=keras.optimizers.Adamax(lr=0.003))
simec.fit(X, K_lin_noisy[:,:n_targets])
X_embed = simec.transform(X)
X_embed_test = simec.transform(X_test)
# always evaluate against the clean (unmasked) kernel
mse = check_similarity_match(X_embed, K_lin)[0]
mse_simec.append(mse)
mse_t = check_similarity_match(X_embed_test, K_lin_test)[0]
mse_simec_test.append(mse_t)
print("mse kpca: %f (%f); mse simec: %f (%f)" % (mse_k, mse_kt, mse, mse_t))
keras.backend.clear_session()
colors = get_colors(10)
plt.figure();
plt.plot([0, missing_targets[-1]], [mse_k, mse_k], '--', linewidth=0.5, c=colors[8], label='kPCA');
plt.plot([0, missing_targets[-1]], [mse_kt, mse_kt], '--', linewidth=0.5, c=colors[6], label='kPCA (test)');
plt.plot(missing_targets, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(missing_targets, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('MNIST (linear kernel)');
plt.xticks(missing_targets, missing_targets);
plt.xlabel('Fraction of Missing Targets')
plt.ylabel('Mean Squared Error of $\hat{S}$')
# same plot again, zoomed in on the y axis
plt.figure();
plt.plot([0, missing_targets[-1]], [mse_k, mse_k], '--', linewidth=0.5, c=colors[8], label='kPCA');
plt.plot([0, missing_targets[-1]], [mse_kt, mse_kt], '--', linewidth=0.5, c=colors[6], label='kPCA (test)');
plt.plot(missing_targets, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(missing_targets, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.ylim([10, 40])
plt.title('MNIST (linear kernel)');
plt.xticks(missing_targets, missing_targets);
plt.xlabel('Fraction of Missing Targets')
plt.ylabel('Mean Squared Error of $\hat{S}$')
print("missing_targets=", missing_targets)
print("mse_k=", mse_k)
print("mse_kt=", mse_kt)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_mnist_lin_mse_missingt.pdf', dpi=300)
In [23]:
# isomap
n_targets = 1000
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
X_embed_test = isomap.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - isomap')
# turn geodesic distances into (centered) similarities, as in classical MDS
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_geod))
In [24]:
# non-linear SimEc to approximate isomap solution
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(200, 'tanh'), (100, 'tanh')], l2_reg=0.0005,
l2_reg_emb=0.001, l2_reg_out=0.000001, s_ll_reg=10., S_ll=K_geod[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.005))
simec.fit(X, K_geod[:,:n_targets])
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 2 h.l.)')
print("correlation with isomap: %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_geod))
In [25]:
# Gaussian kernel PCA
D = squareform(pdist(X, 'euclidean'))
# median heuristic for the RBF kernel width
sigma = np.median(D)
gamma = 0.5/(sigma**2)
print("gamma: %.5f" % gamma)
# keep both the centered and the non-centered kernels: the random-features
# baseline below approximates the non-centered kernel
K_rbf_nonc = rbf_kernel(X, X, gamma)
K_rbf = center_K(K_rbf_nonc)
K_rbf_test_nonc = rbf_kernel(X_test, X_test, gamma)
K_rbf_test = center_K(K_rbf_test_nonc)
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - RBF Kernel PCA')
if savefigs: plt.savefig('fig_spectral_mnist_rbf_kpca.png', dpi=300)
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_rbf))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_test, K_rbf_test))
In [26]:
# linear SimEc with rbf kernel
simec = SimilarityEncoder(X.shape[1], 2, n_targets, l2_reg_emb=0.0001, l2_reg_out=0.0000001,
s_ll_reg=0.5, S_ll=K_rbf[:n_targets,:n_targets], opt=keras.optimizers.Adamax(lr=0.0003))
simec.fit(X, K_rbf[:,:n_targets], epochs=25)
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (RBF kernel, 0 h.l.)')
print("correlation with kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_rbf))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, K_rbf_test))
In [27]:
# non-linear SimEc with rbf kernel
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(1000, 'tanh')], l2_reg=0.00000001,
l2_reg_emb=0.00001, l2_reg_out=0.0000001, s_ll_reg=5.,
S_ll=K_rbf[:n_targets,:n_targets], opt=keras.optimizers.Adamax(lr=0.0003))
simec.fit(X, K_rbf[:,:n_targets], epochs=25)
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (RBF kernel, 1 h.l.)')
if savefigs: plt.savefig('fig_spectral_mnist_rbf_simec.png', dpi=300)
print("correlation with kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_rbf))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, K_rbf_test))
In [28]:
# check how many relevant dimensions there are - obviously at most # feature dim, not # of data points
eigenvals = np.linalg.eigvalsh(K_rbf)[::-1]
plt.figure();
plt.plot(list(range(1, K_rbf.shape[0]+1)), eigenvals, '-o', markersize=3);
plt.plot([1, K_rbf.shape[0]],[0,0], 'k--', linewidth=0.5);
plt.xlim(1, 100);
plt.title('Eigenvalue Spectrum of the RBF Kernel');
In [29]:
# MNIST (RBF kernel): similarity approximation error vs. embedding dimension
# for random Fourier features, kPCA, and SimEc
# NOTE(review): export stripped cell indentation -- lines between 'for e_dim in e_dims:'
# and 'colors = get_colors(15)' belong inside the loop
mse_rf, mse_rf_test, rsq_rf = [], [], []
mse_kpca, mse_kpca_test, rsq_kpca = [], [], []
mse_simec, mse_simec_test, rsq_simec = [], [], []
e_dims = [2, 4, 6, 10, 15, 25, 50, 100]
for e_dim in e_dims:
print(e_dim)
# random features
# 5*e_dim random directions; cos+sin stacking doubles this to 10*e_dim features
W = np.random.normal(size=(5*e_dim, X.shape[1]), scale=1./sigma)
tmp = X.dot(W.T)
X_embed = np.sqrt(1./(5*e_dim)) * np.hstack([np.cos(tmp), np.sin(tmp)])
tmp = X_test.dot(W.T)
X_embed_test = np.sqrt(1./(5*e_dim)) * np.hstack([np.cos(tmp), np.sin(tmp)])
# random features approximate the NON-centered RBF kernel
mse_r, rsq, _ = check_similarity_match(X_embed, K_rbf_nonc)
mse_rf.append(mse_r)
rsq_rf.append(rsq)
mse_rt, _, _ = check_similarity_match(X_embed_test, K_rbf_test_nonc)
mse_rf_test.append(mse_rt)
# kernel pca
kpca = KernelPCA(n_components=e_dim, kernel='rbf', gamma=gamma)
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k, rsq, _ = check_similarity_match(X_embed, K_rbf)
mse_kpca.append(mse_k)
rsq_kpca.append(rsq)
mse_kt, _, _ = check_similarity_match(X_embed_test, K_rbf_test)
mse_kpca_test.append(mse_kt)
# simec
simec = SimilarityEncoder(X.shape[1], e_dim, n_targets, hidden_layers=[(1000, 'tanh')],
l2_reg=0.00000001, l2_reg_emb=0.00001, l2_reg_out=0.0000001,
s_ll_reg=5., S_ll=K_rbf[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.0005))
simec.fit(X, K_rbf[:,:n_targets])
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
mse, rsq, _ = check_similarity_match(X_embeds, K_rbf)
mse_simec.append(mse)
rsq_simec.append(rsq)
mse_t, _, _ = check_similarity_match(X_embed_tests, K_rbf_test)
mse_simec_test.append(mse_t)
print("mse rf: %f (%f); mse kpca: %f (%f); mse simec: %f (%f)" % (mse_r, mse_rt, mse_k, mse_kt, mse, mse_t))
print("correlation with kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
keras.backend.clear_session()
colors = get_colors(15)
plt.figure();
plt.plot(e_dims, mse_kpca, '-o', markersize=3, c=colors[0], label='kPCA');
plt.plot(e_dims, mse_kpca_test, '-o', markersize=3, c=colors[2], label='kPCA (test)');
plt.plot(e_dims, mse_simec, '-o', markersize=3, c=colors[6], label='SimEc');
plt.plot(e_dims, mse_simec_test, '-o', markersize=3, c=colors[8], label='SimEc (test)');
plt.plot(e_dims, mse_rf, '-o', markersize=3, c=colors[12], label='Random Features $\\times$ 10');
plt.plot(e_dims, mse_rf_test, '-o', markersize=3, c=colors[14], label='RF $\\times$ 10 (test)');
plt.legend(loc=0);
plt.title('MNIST (RBF kernel)');
plt.plot([0, e_dims[-1]], [0,0], 'k--', linewidth=0.5);
plt.xticks(e_dims, e_dims);
# NOTE(review): the 'ymax' keyword was renamed to 'top' in matplotlib 3.0;
# with a newer matplotlib this line needs plt.ylim(top=0.005)
plt.ylim(ymax=0.005)
plt.xlabel('Number of Embedding Dimensions ($d$)')
plt.ylabel('Mean Squared Error of $\hat{S}$')
plt.figure();
colors = get_colors(10)
plt.plot(e_dims, mse_kpca, '-o', markersize=3, c=colors[4], label='kPCA');
plt.plot(e_dims, mse_kpca_test, '-o', markersize=3, c=colors[2], label='kPCA (test)');
plt.plot(e_dims, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(e_dims, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('MNIST (RBF kernel)');
plt.plot([0, e_dims[-1]], [0,0], 'k--', linewidth=0.5);
plt.xticks(e_dims, e_dims);
plt.xlabel('Number of Embedding Dimensions ($d$)')
plt.ylabel('Mean Squared Error of $\hat{S}$')
# dump the raw numbers so the figures can be reproduced without rerunning
print("e_dims=", e_dims)
print("mse_kpca=", mse_kpca)
print("mse_kpca_test=", mse_kpca_test)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_mnist_rbf_mse_edim.pdf', dpi=300)
plt.figure();
plt.plot(e_dims, rsq_kpca, '-o', markersize=3, label='kPCA');
plt.plot(e_dims, rsq_simec, '-o', markersize=3, label='SimEc');
plt.plot(e_dims, rsq_rf, '-o', markersize=3, label='RF $\\times$ 10');
plt.plot([0, e_dims[-1]], [1,1], 'k--', linewidth=0.5);
plt.legend(loc=0);
plt.title('$R^2$');
plt.xticks(e_dims, e_dims);
In [30]:
# missing targets
# same robustness check as for the linear kernel, now with the RBF kernel:
# mask a fraction of target similarities with sentinel -100 (SimEc's mask_value)
# NOTE(review): export stripped cell indentation -- lines between 'for m in ...:'
# and 'colors = get_colors(10)' belong inside the loop
n_targets = 1000
np.random.seed(15)
mse_simec, mse_simec_test = [], []
kpca = KernelPCA(n_components=10, kernel='rbf', gamma=gamma)
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k = check_similarity_match(X_embed, K_rbf)[0]
mse_kt = check_similarity_match(X_embed_test, K_rbf_test)[0]
missing_targets = [0., 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for m in missing_targets:
print(m)
K_rbf_noisy = K_rbf.copy()
K_rbf_noisy[np.random.rand(*K_rbf_noisy.shape)<=m] = -100
simec = SimilarityEncoder(X.shape[1], 10, n_targets, hidden_layers=[(1000, 'tanh')],
l2_reg=0.00000001, l2_reg_emb=0.00001, l2_reg_out=0.0000001,
mask_value=-100, s_ll_reg=5., S_ll=K_rbf_noisy[:n_targets,:n_targets],
opt=keras.optimizers.Adamax(lr=0.0005))
simec.fit(X, K_rbf_noisy[:,:n_targets])
X_embed = simec.transform(X)
X_embed_test = simec.transform(X_test)
# always evaluate against the clean (unmasked) kernel
mse = check_similarity_match(X_embed, K_rbf)[0]
mse_simec.append(mse)
mse_t = check_similarity_match(X_embed_test, K_rbf_test)[0]
mse_simec_test.append(mse_t)
print("mse kpca: %f (%f); mse simec: %f (%f)" % (mse_k, mse_kt, mse, mse_t))
keras.backend.clear_session()
colors = get_colors(10)
plt.figure();
plt.plot([0, missing_targets[-1]], [mse_k, mse_k], '--', linewidth=0.5, c=colors[8], label='kPCA');
plt.plot([0, missing_targets[-1]], [mse_kt, mse_kt], '--', linewidth=0.5, c=colors[6], label='kPCA (test)');
plt.plot(missing_targets, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(missing_targets, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('MNIST (RBF kernel)');
plt.xticks(missing_targets, missing_targets);
plt.xlabel('Fraction of Missing Targets');
plt.ylabel('Mean Squared Error of $\hat{S}$');
To show that SimEc embeddings can also be computed for other types of data, we perform some further experiments with the 20 newsgroups dataset. We subsample 7 of the 20 categories and remove meta information such as headers to avoid overfitting (see also http://scikit-learn.org/stable/datasets/twenty_newsgroups.html). The posts are transformed into very high-dimensional tf-idf vectors, which serve both as input to the SimEc and to compute the linear kernel matrix.
In [31]:
## load the data and transform it into a tf-idf representation
categories = [
"comp.graphics",
"rec.autos",
"rec.sport.baseball",
"sci.med",
"sci.space",
"soc.religion.christian",
"talk.politics.guns"
]
newsgroups_train = fetch_20newsgroups(subset='train', remove=(
'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=(
'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
# store in dicts (if the text contains more than 3 words); test documents are
# enumerated with an offset so train and test ids share one namespace
textdict = {i: t for i, t in enumerate(newsgroups_train.data) if len(t.split()) > 3}
textdict.update({i: t for i, t in enumerate(newsgroups_test.data, len(newsgroups_train.data)) if len(t.split()) > 3})
train_ids = [i for i in range(len(newsgroups_train.data)) if i in textdict]
# BUGFIX: the test id range must run up to the TOTAL number of documents
# (train + test); the original used len(textdict), which is smaller whenever
# any document was filtered out above and therefore silently dropped the
# last test documents.
n_total = len(newsgroups_train.data) + len(newsgroups_test.data)
test_ids = [i for i in range(len(newsgroups_train.data), n_total) if i in textdict]
print("%i training and %i test samples" % (len(train_ids), len(test_ids)))
# transform into tf-idf features; idf weights are fit on the training ids only
ft = FeatureTransform(norm='max', weight=True, renorm='max')
docfeats = ft.texts2features(textdict, fit_ids=train_ids)
# organize in feature matrix (test matrix reuses the training vocabulary)
X, featurenames = features2mat(docfeats, train_ids)
X_test, _ = features2mat(docfeats, test_ids, featurenames)
print("%i features" % len(featurenames))
# class labels aligned with the shared train+test id namespace
targets = np.hstack([newsgroups_train.target,newsgroups_test.target])
y = targets[train_ids]
y_test = targets[test_ids]
target_names = newsgroups_train.target_names
n_targets = 1000
In [32]:
# baseline: embed the tf-idf vectors in 2d with linear kernel PCA
# centered linear kernel matrices for train and test (.A densifies the sparse product)
K_lin = center_K(X.dot(X.T).A)
K_lin_test = center_K(X_test.dot(X_test.T).A)
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_20news(X_embed, y, target_names, X_embed_test, y_test,
            title='20newsgroups - linear Kernel PCA', legend=True)
if savefigs:
    plt.savefig('fig_spectral_20news_lin_kpca.png', dpi=300)
# how faithfully do the 2d embeddings reproduce the kernel similarities?
print("similarity approximation : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_test, K_lin_test))
In [33]:
# embed in 2d with a linear Similarity Encoder (no hidden layers)
n_targets = 1000
simec = SimilarityEncoder(
    X.shape[1], 2, n_targets, sparse_inputs=True,
    l2_reg_emb=0.00001, l2_reg_out=0.00000001,
    s_ll_reg=0.5, S_ll=K_lin[:n_targets, :n_targets],
    opt=keras.optimizers.Adamax(lr=0.005))
# train on the similarities to the first n_targets points only
simec.fit(X, K_lin[:, :n_targets])
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_20news(X_embeds, y, target_names, X_embed_tests, y_test,
            title='20 newsgroups - SimEc (lin. kernel, 0 h.l.)', legend=True)
if savefigs:
    plt.savefig('fig_spectral_20news_lin_simec.png', dpi=300)
# compare against the kernel PCA solution from the previous cell
print("correlation with lin kPCA: %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, K_lin_test))
In [34]:
# eigenvalue spectrum of the linear kernel: the number of meaningful embedding
# dimensions is bounded by the feature dimensionality, not the number of samples
eigenvals = np.linalg.eigvalsh(K_lin)[::-1]  # eigvalsh returns ascending order; reverse to descending
plt.figure()
plt.plot(np.arange(1, len(eigenvals) + 1), eigenvals, '-o', markersize=3)
plt.plot([1, len(eigenvals)], [0, 0], 'k--', linewidth=0.5)
plt.xlim(1, 100)
plt.title('Eigenvalue Spectrum of the linear Kernel');
In [35]:
# compare how well kPCA and SimEc approximate the kernel matrix as the
# number of embedding dimensions grows
mse_kpca, mse_kpca_test, rsq_kpca = [], [], []
mse_simec, mse_simec_test, rsq_simec = [], [], []
e_dims = [2, 4, 6, 10, 15, 25, 50, 100]
for e_dim in e_dims:
    print(e_dim)
    # kernel PCA with e_dim components
    kpca = KernelPCA(n_components=e_dim, kernel='linear')
    X_embed = kpca.fit_transform(X)
    X_embed_test = kpca.transform(X_test)
    mse_k, rsq, _ = check_similarity_match(X_embed, K_lin)
    mse_kpca.append(mse_k)
    rsq_kpca.append(rsq)
    mse_kt, _, _ = check_similarity_match(X_embed_test, K_lin_test)
    mse_kpca_test.append(mse_kt)
    # linear SimEc with e_dim embedding dimensions
    simec = SimilarityEncoder(X.shape[1], e_dim, n_targets, sparse_inputs=True, l2_reg_emb=0.00001,
                              l2_reg_out=0.00000001, s_ll_reg=0.5, S_ll=K_lin[:n_targets, :n_targets],
                              opt=keras.optimizers.Adamax(lr=0.005))
    simec.fit(X, K_lin[:, :n_targets])
    X_embeds = simec.transform(X)
    X_embed_tests = simec.transform(X_test)
    mse, rsq, _ = check_similarity_match(X_embeds, K_lin)
    mse_simec.append(mse)
    rsq_simec.append(rsq)
    mse_t, _, _ = check_similarity_match(X_embed_tests, K_lin_test)
    mse_simec_test.append(mse_t)
    print("mse kpca: %f (%f); mse simec: %f (%f)" % (mse_k, mse_kt, mse, mse_t))
    print("correlation with linear kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
    print("correlation with linear kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
    # release the keras graph between models to keep memory bounded
    keras.backend.clear_session()
# plot the mean squared error of the similarity approximation vs. embedding dims
colors = get_colors(10)
plt.figure();
plt.plot(e_dims, mse_kpca, '-o', markersize=3, c=colors[4], label='kPCA');
plt.plot(e_dims, mse_kpca_test, '-o', markersize=3, c=colors[2], label='kPCA (test)');
plt.plot(e_dims, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(e_dims, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=3);
plt.title('20 newsgroups (linear kernel)');
plt.plot([0, e_dims[-1]], [0, 0], 'k--', linewidth=0.5);
plt.xticks(e_dims, e_dims);
plt.xlabel('Number of Embedding Dimensions ($d$)')
# BUGFIX: raw string - '\h' in '$\hat{S}$' is an invalid string escape sequence
plt.ylabel(r'Mean Squared Error of $\hat{S}$')
print("e_dims=", e_dims)
print("mse_kpca=", mse_kpca)
print("mse_kpca_test=", mse_kpca_test)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_20news_lin_mse_edim.pdf', dpi=300)
# and the R^2 of the approximation
plt.figure();
plt.plot(e_dims, rsq_kpca, '-o', markersize=3, label='kPCA');
plt.plot(e_dims, rsq_simec, '-o', markersize=3, label='SimEc');
plt.plot([0, e_dims[-1]], [1, 1], 'k--', linewidth=0.5);
plt.legend(loc=0);
plt.title('$R^2$');
plt.xticks(e_dims, e_dims);
In [36]:
# check the effect of the number of similarity targets n on the SimEc
# approximation quality (kPCA with fixed e_dim serves as the reference)
mse_simec, mse_simec_test = [], []
# NOTE(review): this shadows the `targets` label array from the data-loading
# cell; harmless here since y/y_test were already extracted, but confusing.
targets = [100, 250, 500, 750, 1000, 1500, 2500, K_lin.shape[1]]
e_dim = 10
kpca = KernelPCA(n_components=e_dim, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k, _, _ = check_similarity_match(X_embed, K_lin)
mse_kt, _, _ = check_similarity_match(X_embed_test, K_lin_test)
for n in targets:
    print(n)
    simec = SimilarityEncoder(X.shape[1], e_dim, n, sparse_inputs=True, l2_reg_emb=0.00001,
                              l2_reg_out=0.00000001, s_ll_reg=0.5, S_ll=K_lin[:n, :n],
                              opt=keras.optimizers.Adamax(lr=0.005))
    simec.fit(X, K_lin[:, :n])
    X_embeds = simec.transform(X)
    X_embed_tests = simec.transform(X_test)
    mse, _, _ = check_similarity_match(X_embeds, K_lin)
    mse_simec.append(mse)
    mse_t, _, _ = check_similarity_match(X_embed_tests, K_lin_test)
    mse_simec_test.append(mse_t)
    print("mse kpca: %f (%f); mse simec: %f (%f)" % (mse_k, mse_kt, mse, mse_t))
    print("correlation with linear kPCA : %f" % check_embed_match(X_embed, X_embeds)[1])
    print("correlation with linear kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# plot MSE vs. number of targets; kPCA values are horizontal reference lines
colors = get_colors(10)
plt.figure();
plt.plot([0, targets[-1]], [mse_k, mse_k], '--', linewidth=0.5, c=colors[8], label='kPCA');
plt.plot([0, targets[-1]], [mse_kt, mse_kt], '--', linewidth=0.5, c=colors[6], label='kPCA (test)');
plt.plot(targets, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(targets, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('20 newsgroups (linear kernel)');
plt.xticks([100, 500, 1000, 1500, 2500, 4000], [100, 500, 1000, 1500, 2500, 4000]);
plt.xlabel('Number of Targets ($n$)')
# BUGFIX: raw string - '\h' in '$\hat{S}$' is an invalid string escape sequence
plt.ylabel(r'Mean Squared Error of $\hat{S}$')
print("targets=", targets)
print("mse_k=", mse_k)
print("mse_kt=", mse_kt)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_20news_lin_mse_ntargets.pdf', dpi=300)
In [37]:
# check the effect of missing target similarities: a fraction m of the kernel
# entries is replaced by the mask value -100, which the SimEc ignores in training
n_targets = 1000
np.random.seed(10)
mse_simec, mse_simec_test = [], []
kpca = KernelPCA(n_components=10, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
mse_k = check_similarity_match(X_embed, K_lin)[0]
mse_kt = check_similarity_match(X_embed_test, K_lin_test)[0]
missing_targets = [0., 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for m in missing_targets:
    print(m)
    # mark a random fraction m of the kernel entries as missing
    K_lin_noisy = K_lin.copy()
    K_lin_noisy[np.random.rand(*K_lin_noisy.shape) <= m] = -100
    simec = SimilarityEncoder(X.shape[1], 10, n_targets, mask_value=-100, sparse_inputs=True, l2_reg_emb=0.00001,
                              l2_reg_out=0.00000001, s_ll_reg=0.5, S_ll=K_lin_noisy[:n_targets, :n_targets],
                              opt=keras.optimizers.Adamax(lr=0.005))
    simec.fit(X, K_lin_noisy[:, :n_targets])
    X_embed = simec.transform(X)
    X_embed_test = simec.transform(X_test)
    # evaluate against the complete (non-masked) kernel
    mse = check_similarity_match(X_embed, K_lin)[0]
    mse_simec.append(mse)
    mse_t = check_similarity_match(X_embed_test, K_lin_test)[0]
    mse_simec_test.append(mse_t)
    print("mse kpca: %f (%f); mse simec: %f (%f)" % (mse_k, mse_kt, mse, mse_t))
    # release the keras graph between models to keep memory bounded
    keras.backend.clear_session()
# plot MSE vs. fraction of missing targets; kPCA values are reference lines
colors = get_colors(10)
plt.figure();
plt.plot([0, missing_targets[-1]], [mse_k, mse_k], '--', linewidth=0.5, c=colors[8], label='kPCA');
plt.plot([0, missing_targets[-1]], [mse_kt, mse_kt], '--', linewidth=0.5, c=colors[6], label='kPCA (test)');
plt.plot(missing_targets, mse_simec, '-o', markersize=3, c=colors[8], label='SimEc');
plt.plot(missing_targets, mse_simec_test, '-o', markersize=3, c=colors[6], label='SimEc (test)');
plt.legend(loc=0);
plt.title('20 newsgroups (linear kernel)');
plt.xticks(missing_targets, missing_targets);
plt.xlabel('Fraction of Missing Targets')
# BUGFIX: raw string - '\h' in '$\hat{S}$' is an invalid string escape sequence
plt.ylabel(r'Mean Squared Error of $\hat{S}$')
print("missing_targets=", missing_targets)
print("mse_k=", mse_k)
print("mse_kt=", mse_kt)
print("mse_simec=", mse_simec)
print("mse_simec_test=", mse_simec_test)
if savefigs: plt.savefig('fig_spectral_20news_lin_mse_missingt.pdf', dpi=300)
In [ ]: