Similarity Encoders with Keras

using the model definition from simec.py


In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
np.random.seed(28)
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_mldata, fetch_20newsgroups

import tensorflow as tf
tf.set_random_seed(28)
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation

# https://github.com/cod3licious/nlputils
from nlputils.features import FeatureTransform, features2mat

from simec import SimilarityEncoder
from utils import center_K, check_embed_match, check_similarity_match
from utils_plotting import plot_mnist, plot_20news

%matplotlib inline
%load_ext autoreload
%autoreload 2


/home/franzi/anaconda2/envs/python36/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

MNIST with Linear Kernel


In [2]:
# load digits
mnist = fetch_mldata('MNIST original', data_home='data')
X = mnist.data/255.  # normalize to 0-1
y = np.array(mnist.target, dtype=int)
# subsample 10000 random data points
np.random.seed(42)
n_samples = 10000
n_test = 2000
rnd_idx = np.random.permutation(X.shape[0])[:n_samples]
X_test, y_test = X[rnd_idx[:n_test],:], y[rnd_idx[:n_test]]
X, y = X[rnd_idx[n_test:],:], y[rnd_idx[n_test:]]
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape

In [3]:
# centered linear kernel matrix
K_lin = center_K(np.dot(X, X.T))

In [4]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - linear Kernel PCA')
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))


error similarity match: msqe: 51.4531662441 ; r^2: 0.4438507851 ; rho: 0.6341602248

In [5]:
# on how many target similarities you want to train - faster and works equally well than training on all
n_targets = 1000  # K_lin.shape[1]
# initialize the model
simec = SimilarityEncoder(X.shape[1], 2, n_targets, s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets])
# train the model to get an embedding with which the target similarities
# can be linearly approximated
simec.fit(X, K_lin[:,:n_targets], epochs=25)
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, linear)')
# correlation with the embedding produced by the spectral method should be high
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# similarity match error should be similar to the one from kpca
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))


Epoch 1/25
8000/8000 [==============================] - 1s 155us/step - loss: 138.9871
Epoch 2/25
8000/8000 [==============================] - 1s 93us/step - loss: 134.7487
Epoch 3/25
8000/8000 [==============================] - 1s 88us/step - loss: 128.0694
Epoch 4/25
8000/8000 [==============================] - 1s 93us/step - loss: 121.4029
Epoch 5/25
8000/8000 [==============================] - 1s 93us/step - loss: 116.2597
Epoch 6/25
8000/8000 [==============================] - 1s 95us/step - loss: 112.6398
Epoch 7/25
8000/8000 [==============================] - 1s 93us/step - loss: 109.3631
Epoch 8/25
8000/8000 [==============================] - 1s 87us/step - loss: 105.8619
Epoch 9/25
8000/8000 [==============================] - 1s 89us/step - loss: 102.6419
Epoch 10/25
8000/8000 [==============================] - 1s 94us/step - loss: 100.4048
Epoch 11/25
8000/8000 [==============================] - 1s 92us/step - loss: 99.2771
Epoch 12/25
8000/8000 [==============================] - 1s 92us/step - loss: 98.8235
Epoch 13/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.6205
Epoch 14/25
8000/8000 [==============================] - 1s 93us/step - loss: 98.5062
Epoch 15/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.4333
Epoch 16/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.3796
Epoch 17/25
8000/8000 [==============================] - 1s 89us/step - loss: 98.3371
Epoch 18/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.3028
Epoch 19/25
8000/8000 [==============================] - 1s 96us/step - loss: 98.2718
Epoch 20/25
8000/8000 [==============================] - 1s 93us/step - loss: 98.2441
Epoch 21/25
8000/8000 [==============================] - 1s 88us/step - loss: 98.2178
Epoch 22/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.1937
Epoch 23/25
8000/8000 [==============================] - 1s 93us/step - loss: 98.1695
Epoch 24/25
8000/8000 [==============================] - 1s 88us/step - loss: 98.1459
Epoch 25/25
8000/8000 [==============================] - 1s 90us/step - loss: 98.1206
correlation with lin kPCA       : 0.989194
correlation with lin kPCA (test): 0.989490
error similarity match: msqe: 57769.3758877199 ; r^2: 0.4418963431 ; rho: 0.6326533868

Non-linear MNIST embedding with isomap


In [6]:
# isomap
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
X_embed_test = isomap.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - isomap')



In [7]:
# non-linear SimEc to approximate isomap solution
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
n_targets = 1000
# initialize the model
simec = SimilarityEncoder(X.shape[1], 2, n_targets, hidden_layers=[(20, 'tanh')], s_ll_reg=0.5, 
                          S_ll=K_geod[:n_targets,:n_targets], opt=keras.optimizers.Adamax(lr=0.01))
# train the model to get an embedding with which the target similarities
# can be linearly approximated
simec.fit(X, K_geod[:,:n_targets], epochs=25)
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 1 h.l.)')
print("correlation with isomap       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])


Epoch 1/25
8000/8000 [==============================] - 1s 144us/step - loss: 51178.2788
Epoch 2/25
8000/8000 [==============================] - 1s 123us/step - loss: 46771.8850
Epoch 3/25
8000/8000 [==============================] - 1s 124us/step - loss: 41210.9276
Epoch 4/25
8000/8000 [==============================] - 1s 122us/step - loss: 38588.5507
Epoch 5/25
8000/8000 [==============================] - 1s 122us/step - loss: 37866.1624
Epoch 6/25
8000/8000 [==============================] - 1s 121us/step - loss: 37444.9817
Epoch 7/25
8000/8000 [==============================] - 1s 123us/step - loss: 37145.0221
Epoch 8/25
8000/8000 [==============================] - 1s 122us/step - loss: 36876.5603
Epoch 9/25
8000/8000 [==============================] - 1s 122us/step - loss: 36615.4620
Epoch 10/25
8000/8000 [==============================] - 1s 121us/step - loss: 36335.6757
Epoch 11/25
8000/8000 [==============================] - 1s 123us/step - loss: 36026.0781
Epoch 12/25
8000/8000 [==============================] - 1s 134us/step - loss: 35676.8645
Epoch 13/25
8000/8000 [==============================] - 1s 122us/step - loss: 35261.1747
Epoch 14/25
8000/8000 [==============================] - 1s 136us/step - loss: 34778.4479
Epoch 15/25
8000/8000 [==============================] - 1s 129us/step - loss: 34220.4407
Epoch 16/25
8000/8000 [==============================] - 1s 124us/step - loss: 33603.7760
Epoch 17/25
8000/8000 [==============================] - 1s 136us/step - loss: 32931.2144
Epoch 18/25
8000/8000 [==============================] - 1s 121us/step - loss: 32257.5660
Epoch 19/25
8000/8000 [==============================] - 1s 125us/step - loss: 31622.0433
Epoch 20/25
8000/8000 [==============================] - 1s 131us/step - loss: 31084.5188
Epoch 21/25
8000/8000 [==============================] - 1s 135us/step - loss: 30660.2642
Epoch 22/25
8000/8000 [==============================] - 1s 125us/step - loss: 30352.2243
Epoch 23/25
8000/8000 [==============================] - 1s 125us/step - loss: 30165.2209
Epoch 24/25
8000/8000 [==============================] - 1s 125us/step - loss: 30061.4422
Epoch 25/25
8000/8000 [==============================] - 1s 125us/step - loss: 29994.4935
correlation with isomap       : 0.904213
correlation with isomap (test): 0.809611

20newsgroups embedding


In [8]:
## load the data and transform it into a tf-idf representation
categories = [
    "comp.graphics",
    "rec.autos",
    "rec.sport.baseball",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns"
]
newsgroups_train = fetch_20newsgroups(subset='train', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
# store in dicts (if the text contains more than 3 words)
textdict = {i: t for i, t in enumerate(newsgroups_train.data) if len(t.split()) > 3}
textdict.update({i: t for i, t in enumerate(newsgroups_test.data, len(newsgroups_train.data)) if len(t.split()) > 3})
train_ids = [i for i in range(len(newsgroups_train.data)) if i in textdict]
test_ids = [i for i in range(len(newsgroups_train.data), len(textdict)) if i in textdict]
print("%i training and %i test samples" % (len(train_ids), len(test_ids)))
# transform into tf-idf features
ft = FeatureTransform(norm='max', weight=True, renorm='max')
docfeats = ft.texts2features(textdict, fit_ids=train_ids)
# organize in feature matrix
X, featurenames = features2mat(docfeats, train_ids)
X_test, _ = features2mat(docfeats, test_ids, featurenames)
print("%i features" % len(featurenames))
targets = np.hstack([newsgroups_train.target,newsgroups_test.target])
y = targets[train_ids]
y_test = targets[test_ids]
target_names = newsgroups_train.target_names
n_targets = 1000


3959 training and 2359 test samples
45813 features

In [9]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_20news(X_embed, y, target_names, X_embed_test, y_test,
            title='20newsgroups - linear Kernel PCA', legend=True)
# compute linear kernel and center
K_lin = center_K(X.dot(X.T).A)
K_lin_test = center_K(X_test.dot(X_test.T).A)
print("similarity approximation       : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_test, K_lin_test))


similarity approximation       : msqe: 0.0331868908 ; r^2: 0.2263374840 ; rho: 0.6310964767
similarity approximation (test): msqe: 0.0421617582 ; r^2: 0.1775899837 ; rho: 0.6165146584

In [10]:
# project to 2d with linear similarity encoder
# careful: our input is sparse!!!
simec = SimilarityEncoder(X.shape[1], 2, n_targets, sparse_inputs=True, opt=keras.optimizers.SGD(lr=50.))
# train the model to get an embedding with which the target similarities
# can be linearly approximated
simec.fit(X, K_lin[:,:n_targets], epochs=25)
# get the embeddings
X_embeds = simec.transform(X)
X_embed_tests = simec.transform(X_test)
plot_20news(X_embeds, y, target_names, X_embed_tests, y_test,
            title='20 newsgroups - SimEc (lin. kernel, linear)', legend=True)
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
print("similarity approximation       : msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
print("similarity approximation (test): msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed_tests, K_lin_test))


Epoch 1/25
3959/3959 [==============================] - 1s 140us/step - loss: 0.0366
Epoch 2/25
3959/3959 [==============================] - 0s 104us/step - loss: 0.0318
Epoch 3/25
3959/3959 [==============================] - 0s 102us/step - loss: 0.0315
Epoch 4/25
3959/3959 [==============================] - 0s 104us/step - loss: 0.0311
Epoch 5/25
3959/3959 [==============================] - 0s 110us/step - loss: 0.0311
Epoch 6/25
3959/3959 [==============================] - 0s 109us/step - loss: 0.0311
Epoch 7/25
3959/3959 [==============================] - 0s 103us/step - loss: 0.0310
Epoch 8/25
3959/3959 [==============================] - 0s 106us/step - loss: 0.0311
Epoch 9/25
3959/3959 [==============================] - 0s 101us/step - loss: 0.0310
Epoch 10/25
3959/3959 [==============================] - 0s 106us/step - loss: 0.0310
Epoch 11/25
3959/3959 [==============================] - 0s 102us/step - loss: 0.0310
Epoch 12/25
3959/3959 [==============================] - 0s 111us/step - loss: 0.0311
Epoch 13/25
3959/3959 [==============================] - 0s 100us/step - loss: 0.0310
Epoch 14/25
3959/3959 [==============================] - 0s 102us/step - loss: 0.0310
Epoch 15/25
3959/3959 [==============================] - 0s 98us/step - loss: 0.0310
Epoch 16/25
3959/3959 [==============================] - 0s 100us/step - loss: 0.0310
Epoch 17/25
3959/3959 [==============================] - 0s 97us/step - loss: 0.0310
Epoch 18/25
3959/3959 [==============================] - 0s 98us/step - loss: 0.0310
Epoch 19/25
3959/3959 [==============================] - 0s 104us/step - loss: 0.0310
Epoch 20/25
3959/3959 [==============================] - 0s 98us/step - loss: 0.0310
Epoch 21/25
3959/3959 [==============================] - 0s 100us/step - loss: 0.0310
Epoch 22/25
3959/3959 [==============================] - 0s 103us/step - loss: 0.0311
Epoch 23/25
3959/3959 [==============================] - 0s 103us/step - loss: 0.0310
Epoch 24/25
3959/3959 [==============================] - 0s 106us/step - loss: 0.0310
Epoch 25/25
3959/3959 [==============================] - 0s 116us/step - loss: 0.0310
correlation with lin kPCA       : 0.939694
correlation with lin kPCA (test): 0.957125
similarity approximation       : msqe: 0.1682884082 ; r^2: 0.2108829888 ; rho: 0.6228658243
similarity approximation (test): msqe: 0.1391057260 ; r^2: 0.1664079620 ; rho: 0.6075047460