Similarity Encoders with Keras

using the model definition from

In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_mldata, fetch_20newsgroups

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation

from nlputils.features import FeatureTransform, features2mat

from simec import SimilarityEncoder
from utils import center_K, check_embed_match, check_similarity_match
from utils_plotting import plot_mnist, plot_20news

%matplotlib inline
%load_ext autoreload
%autoreload 2

/home/franzi/anaconda2/envs/python36/lib/python3.6/site-packages/h5py/ FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

MNIST with Linear Kernel

In [2]:
# load digits
mnist = fetch_mldata('MNIST original', data_home='data')
X =  # normalize to 0-1
y = np.array(, dtype=int)
# subsample 10000 random data points
n_samples = 10000
n_test = 2000
rnd_idx = np.random.permutation(X.shape[0])[:n_samples]
X_test, y_test = X[rnd_idx[:n_test],:], y[rnd_idx[:n_test]]
X, y = X[rnd_idx[n_test:],:], y[rnd_idx[n_test:]]
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape

In [3]:
# centered linear kernel matrix
K_lin = center_K(, X.T))

In [4]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - linear Kernel PCA')
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))

error similarity match: msqe: 51.4531662441 ; r^2: 0.4438507851 ; rho: 0.6341602248

In [5]:
# on how many target similarities you want to train - faster and works equally well than training on all
n_targets = 1000  # K_lin.shape[1]
# initialize the model
simec = SimilarityEncoder(X.shape[1], 2, n_targets, s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets])
# train the model to get an embedding with which the target similarities
# can be linearly approximated, K_lin[:,:n_targets], epochs=25)
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, linear)')
# correlation with the embedding produced by the spectral method should be high
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# similarity match error should be similar to the one from kpca
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))

Epoch 1/25
8000/8000 [==============================] - 1s 155us/step - loss: 138.9871
Epoch 2/25
8000/8000 [==============================] - 1s 93us/step - loss: 134.7487
Epoch 3/25
8000/8000 [==============================] - 1s 88us/step - loss: 128.0694
Epoch 4/25
8000/8000 [==============================] - 1s 93us/step - loss: 121.4029
Epoch 5/25
8000/8000 [==============================] - 1s 93us/step - loss: 116.2597
Epoch 6/25
8000/8000 [==============================] - 1s 95us/step - loss: 112.6398
Epoch 7/25
8000/8000 [==============================] - 1s 93us/step - loss: 109.3631
Epoch 8/25
8000/8000 [==============================] - 1s 87us/step - loss: 105.8619
Epoch 9/25
8000/8000 [==============================] - 1s 89us/step - loss: 102.6419
Epoch 10/25
8000/8000 [==============================] - 1s 94us/step - loss: 100.4048
Epoch 11/25
8000/8000 [==============================] - 1s 92us/step - loss: 99.2771
Epoch 12/25
8000/8000 [==============================] - 1s 92us/step - loss: 98.8235
Epoch 13/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.6205
Epoch 14/25
8000/8000 [==============================] - 1s 93us/step - loss: 98.5062
Epoch 15/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.4333
Epoch 16/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.3796
Epoch 17/25
8000/8000 [==============================] - 1s 89us/step - loss: 98.3371
Epoch 18/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.3028
Epoch 19/25
8000/8000 [==============================] - 1s 96us/step - loss: 98.2718
Epoch 20/25
8000/8000 [==============================] - 1s 93us/step - loss: 98.2441
Epoch 21/25
8000/8000 [==============================] - 1s 88us/step - loss: 98.2178
Epoch 22/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.1937
Epoch 23/25
8000/8000 [==============================] - 1s 93us/step - loss: 98.1695
Epoch 24/25
8000/8000 [==============================] - 1s 88us/step - loss: 98.1459
Epoch 25/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.1206
correlation with lin kPCA       : 0.989194
correlation with lin kPCA (test): 0.989490
error similarity match: msqe: 57769.3758877199 ; r^2: 0.4418963431 ; rho: 0.6326533868

Non-linear MNIST embedding with isomap

In [6]:
# isomap
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
X_embed_test = isomap.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - isomap')

In [7]:
# non-linear SimEc to approximate isomap solution
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
n_targets = 1000
# initialize the model
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(20, 'tanh')], s_ll_reg=0.5, 
                          S_ll=K_geod[:n_targets,:n_targets], opt=keras.optimizers.Adamax(lr=0.01))
# train the model to get an embedding with which the target similarities
# can be linearly approximated, K_geod[:,:n_targets], epochs=25)
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 1 h.l.)')
print("correlation with isomap       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])

Epoch 1/25
8000/8000 [==============================] - 1s 144us/step - loss: 51178.2788
Epoch 2/25
8000/8000 [==============================] - 1s 123us/step - loss: 46771.8850
Epoch 3/25
8000/8000 [==============================] - 1s 124us/step - loss: 41210.9276
Epoch 4/25
8000/8000 [==============================] - 1s 122us/step - loss: 38588.5507
Epoch 5/25
8000/8000 [==============================] - 1s 122us/step - loss: 37866.1624
Epoch 6/25
8000/8000 [==============================] - 1s 121us/step - loss: 37444.9817
Epoch 7/25
8000/8000 [==============================] - 1s 123us/step - loss: 37145.0221
Epoch 8/25
8000/8000 [==============================] - 1s 122us/step - loss: 36876.5603
Epoch 9/25
8000/8000 [==============================] - 1s 122us/step - loss: 36615.4620
Epoch 10/25
8000/8000 [==============================] - 1s 121us/step - loss: 36335.6757
Epoch 11/25
8000/8000 [==============================] - 1s 123us/step - loss: 36026.0781
Epoch 12/25
8000/8000 [==============================] - 1s 134us/step - loss: 35676.8645
Epoch 13/25
8000/8000 [==============================] - 1s 122us/step - loss: 35261.1747
Epoch 14/25
8000/8000 [==============================] - 1s 136us/step - loss: 34778.4479
Epoch 15/25
8000/8000 [==============================] - 1s 129us/step - loss: 34220.4407
Epoch 16/25
8000/8000 [==============================] - 1s 124us/step - loss: 33603.7760
Epoch 17/25
8000/8000 [==============================] - 1s 136us/step - loss: 32931.2144
Epoch 18/25
8000/8000 [==============================] - 1s 121us/step - loss: 32257.5660
Epoch 19/25
8000/8000 [==============================] - 1s 125us/step - loss: 31622.0433
Epoch 20/25
8000/8000 [==============================] - 1s 131us/step - loss: 31084.5188
Epoch 21/25
8000/8000 [==============================] - 1s 135us/step - loss: 30660.2642
Epoch 22/25
8000/8000 [==============================] - 1s 125us/step - loss: 30352.2243
Epoch 23/25
8000/8000 [==============================] - 1s 125us/step - loss: 30165.2209
Epoch 24/25
8000/8000 [==============================] - 1s 125us/step - loss: 30061.4422
Epoch 25/25
8000/8000 [==============================] - 1s 125us/step - loss: 29994.4935
correlation with isomap       : 0.904213
correlation with isomap (test): 0.809611

20newsgroups embedding

In [8]:
## load the data and transform it into a tf-idf representation
categories = [
newsgroups_train = fetch_20newsgroups(subset='train', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
# store in dicts (if the text contains more than 3 words)
textdict = {i: t for i, t in enumerate( if len(t.split()) > 3}
textdict.update({i: t for i, t in enumerate(, len( if len(t.split()) > 3})
train_ids = [i for i in range(len( if i in textdict]
test_ids = [i for i in range(len(, len(textdict)) if i in textdict]
print("%i training and %i test samples" % (len(train_ids), len(test_ids)))
# transform into tf-idf features
ft = FeatureTransform(norm='max', weight=True, renorm='max')
docfeats = ft.texts2features(textdict, fit_ids=train_ids)
# organize in feature matrix
X, featurenames = features2mat(docfeats, train_ids)
X_test, _ = features2mat(docfeats, test_ids, featurenames)
print("%i features" % len(featurenames))
targets = np.hstack([,])
y = targets[train_ids]
y_test = targets[test_ids]
target_names = newsgroups_train.target_names
n_targets = 1000

3959 training and 2359 test samples
45813 features

In [9]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_20news(X_embed, y, target_names, X_embed_test, y_test,
            title='20newsgroups - linear Kernel PCA', legend=True)
# compute linear kernel and center
K_lin = center_K(
K_lin_test = center_K(
print("similarity approximation       : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_test, K_lin_test))

similarity approximation       : msqe: 0.0331868908 ; r^2: 0.2263374840 ; rho: 0.6310964767
similarity approximation (test): msqe: 0.0421617582 ; r^2: 0.1775899837 ; rho: 0.6165146584

In [10]:
# project to 2d with linear similarity encoder
# careful: our input is sparse!!!
simec = SimilarityEncoder(X.shape[1], 2, n_targets, sparse_inputs=True, opt=keras.optimizers.SGD(lr=50.))
# train the model to get an embedding with which the target similarities
# can be linearly approximated, K_lin[:,:n_targets], epochs=25)
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_20news(X_embeds, y, target_names, X_embed_tests, y_test,
            title='20 newsgroups - SimEc (lin. kernel, linear)', legend=True)
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation       : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, K_lin_test))

Epoch 1/25
3959/3959 [==============================] - 1s 140us/step - loss: 0.0366
Epoch 2/25
3959/3959 [==============================] - 0s 104us/step - loss: 0.0318
Epoch 3/25
3959/3959 [==============================] - 0s 102us/step - loss: 0.0315
Epoch 4/25
3959/3959 [==============================] - 0s 104us/step - loss: 0.0311
Epoch 5/25
3959/3959 [==============================] - 0s 110us/step - loss: 0.0311
Epoch 6/25
3959/3959 [==============================] - 0s 109us/step - loss: 0.0311
Epoch 7/25
3959/3959 [==============================] - 0s 103us/step - loss: 0.0310
Epoch 8/25
3959/3959 [==============================] - 0s 106us/step - loss: 0.0311
Epoch 9/25
3959/3959 [==============================] - 0s 101us/step - loss: 0.0310
Epoch 10/25
3959/3959 [==============================] - 0s 106us/step - loss: 0.0310
Epoch 11/25
3959/3959 [==============================] - 0s 102us/step - loss: 0.0310
Epoch 12/25
3959/3959 [==============================] - 0s 111us/step - loss: 0.0311
Epoch 13/25
3959/3959 [==============================] - 0s 100us/step - loss: 0.0310
Epoch 14/25
3959/3959 [==============================] - 0s 102us/step - loss: 0.0310
Epoch 15/25
3959/3959 [==============================] - 0s 98us/step - loss: 0.0310
Epoch 16/25
3959/3959 [==============================] - 0s 100us/step - loss: 0.0310
Epoch 17/25
3959/3959 [==============================] - 0s 97us/step - loss: 0.0310
Epoch 18/25
3959/3959 [==============================] - 0s 98us/step - loss: 0.0310
Epoch 19/25
3959/3959 [==============================] - 0s 104us/step - loss: 0.0310
Epoch 20/25
3959/3959 [==============================] - 0s 98us/step - loss: 0.0310
Epoch 21/25
3959/3959 [==============================] - 0s 100us/step - loss: 0.0310
Epoch 22/25
3959/3959 [==============================] - 0s 103us/step - loss: 0.0311
Epoch 23/25
3959/3959 [==============================] - 0s 103us/step - loss: 0.0310
Epoch 24/25
3959/3959 [==============================] - 0s 106us/step - loss: 0.0310
Epoch 25/25
3959/3959 [==============================] - 0s 116us/step - loss: 0.0310
correlation with lin kPCA       : 0.939694
correlation with lin kPCA (test): 0.957125
similarity approximation       : msqe: 0.1682884082 ; r^2: 0.2108829888 ; rho: 0.6228658243
similarity approximation (test): msqe: 0.1391057260 ; r^2: 0.1664079620 ; rho: 0.6075047460