Similarity Encoders with Keras

In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_mldata, fetch_20newsgroups

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation

from nlputils.features import FeatureTransform, features2mat

from simec import LastLayerReg
from utils import center_K, check_embed_match, check_similarity_match
from utils_plotting import plot_mnist, plot_20news

%matplotlib inline
%load_ext autoreload
%autoreload 2

/home/franzi/anaconda2/envs/python36/lib/python3.6/site-packages/h5py/ FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

MNIST with Linear Kernel

In [2]:
# load digits
mnist = fetch_mldata('MNIST original', data_home='data')
X =  # normalize to 0-1
y = np.array(, dtype=int)
# subsample 10000 random data points
n_samples = 10000
n_test = 2000
rnd_idx = np.random.permutation(X.shape[0])[:n_samples]
X_test, y_test = X[rnd_idx[:n_test],:], y[rnd_idx[:n_test]]
X, y = X[rnd_idx[n_test:],:], y[rnd_idx[n_test:]]
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape

In [3]:
# centered linear kernel matrix
K_lin = center_K(, X.T))

In [4]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - linear Kernel PCA')
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))

error similarity match: msqe: 51.4531662441 ; r^2: 0.4438507851 ; rho: 0.6341602248

In [5]:
# on how many target similarities you want to train - faster and works equally well than training on all
n_targets = 1000  # K_lin.shape[1]
# initialize the model
model = Sequential()
# add the linear layer that maps to your embedding
model.add(Dense(2, input_shape=(X.shape[1],), kernel_regularizer=keras.regularizers.l2(0.01)))
# add another linear layer to get the linear approximation of the target similarities
model.add(Dense(n_targets, kernel_regularizer=LastLayerReg(s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets])))
# compile the model to minimize the MSE
opt = keras.optimizers.Adamax(lr=0.005)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated, K_lin[:,:n_targets], epochs=25)
# after training is complete, we loose the last layer (+ activation) to only get the embedding
model2 = Sequential(model.layers[:-2])
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, linear)')
# correlation with the embedding produced by the spectral method should be high
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# similarity match error should be similar to the one from kpca
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))

Epoch 1/25
8000/8000 [==============================] - 1s 169us/step - loss: 117.2756
Epoch 2/25
8000/8000 [==============================] - 1s 99us/step - loss: 99.2442
Epoch 3/25
8000/8000 [==============================] - 1s 96us/step - loss: 98.4770
Epoch 4/25
8000/8000 [==============================] - 1s 99us/step - loss: 97.9257
Epoch 5/25
8000/8000 [==============================] - 1s 98us/step - loss: 97.2574
Epoch 6/25
8000/8000 [==============================] - 1s 95us/step - loss: 96.3561
Epoch 7/25
8000/8000 [==============================] - 1s 97us/step - loss: 95.0996
Epoch 8/25
8000/8000 [==============================] - 1s 101us/step - loss: 93.2983
Epoch 9/25
8000/8000 [==============================] - 1s 101us/step - loss: 90.7463
Epoch 10/25
8000/8000 [==============================] - 1s 94us/step - loss: 87.4388
Epoch 11/25
8000/8000 [==============================] - 1s 96us/step - loss: 83.8674
Epoch 12/25
8000/8000 [==============================] - 1s 96us/step - loss: 81.0287
Epoch 13/25
8000/8000 [==============================] - 1s 97us/step - loss: 79.5132
Epoch 14/25
8000/8000 [==============================] - 1s 96us/step - loss: 78.9788
Epoch 15/25
8000/8000 [==============================] - 1s 94us/step - loss: 78.8436
Epoch 16/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.8110
Epoch 17/25
8000/8000 [==============================] - 1s 95us/step - loss: 78.8038
Epoch 18/25
8000/8000 [==============================] - 1s 104us/step - loss: 78.8016
Epoch 19/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.7990
Epoch 20/25
8000/8000 [==============================] - 1s 96us/step - loss: 78.7984
Epoch 21/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.7973
Epoch 22/25
8000/8000 [==============================] - 1s 109us/step - loss: 78.7977
Epoch 23/25
8000/8000 [==============================] - 1s 98us/step - loss: 78.7953
Epoch 24/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.7977
Epoch 25/25
8000/8000 [==============================] - 1s 96us/step - loss: 78.7962
correlation with lin kPCA       : 0.980925
correlation with lin kPCA (test): 0.982280
error similarity match: msqe: 51.6534822241 ; r^2: 0.4416858158 ; rho: 0.6324372691

Non-linear MNIST embedding with isomap

In [6]:
# isomap
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
X_embed_test = isomap.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - isomap')

In [7]:
# non-linear SimEc to approximate isomap solution
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
n_targets = 1000
# initialize the model
model = Sequential()
# optionally add some non-linear layers to the feed forward NN
model.add(Dense(20, input_shape=(X.shape[1],), kernel_regularizer=keras.regularizers.l2(0.01)))
# add the linear layer that maps to your embedding
# in this case we want to visualize our data so the embedding dim is 2
model.add(Dense(2, kernel_regularizer=keras.regularizers.l2(0.01)))
# add another linear layer to get the linear approximation of the target similarities
# compile the model to minimize the MSE
opt = keras.optimizers.Adamax(lr=0.005)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated, K_geod[:,:n_targets], epochs=30)
# after training is complete, we loose the last layer (+ activation) to only get the embedding
model2 = Sequential(model.layers[:-2])
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 1 h.l.)')
print("correlation with isomap       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])

Epoch 1/30
8000/8000 [==============================] - 1s 86us/step - loss: 34893.1082
Epoch 2/30
8000/8000 [==============================] - 1s 70us/step - loss: 31729.7535
Epoch 3/30
8000/8000 [==============================] - 1s 71us/step - loss: 29316.6960
Epoch 4/30
8000/8000 [==============================] - 1s 69us/step - loss: 27526.3431
Epoch 5/30
8000/8000 [==============================] - 1s 69us/step - loss: 25319.7543
Epoch 6/30
8000/8000 [==============================] - 1s 67us/step - loss: 23184.3603
Epoch 7/30
8000/8000 [==============================] - 1s 69us/step - loss: 22018.5334
Epoch 8/30
8000/8000 [==============================] - 1s 69us/step - loss: 21426.3431
Epoch 9/30
8000/8000 [==============================] - 1s 69us/step - loss: 21077.3772
Epoch 10/30
8000/8000 [==============================] - 1s 72us/step - loss: 20854.1272
Epoch 11/30
8000/8000 [==============================] - 1s 65us/step - loss: 20707.7457
Epoch 12/30
8000/8000 [==============================] - 1s 68us/step - loss: 20604.4523
Epoch 13/30
8000/8000 [==============================] - 1s 72us/step - loss: 20528.2652
Epoch 14/30
8000/8000 [==============================] - 1s 71us/step - loss: 20462.0827
Epoch 15/30
8000/8000 [==============================] - 1s 64us/step - loss: 20409.4992
Epoch 16/30
8000/8000 [==============================] - 1s 65us/step - loss: 20366.3701
Epoch 17/30
8000/8000 [==============================] - 1s 73us/step - loss: 20332.8203
Epoch 18/30
8000/8000 [==============================] - 1s 71us/step - loss: 20303.3159
Epoch 19/30
8000/8000 [==============================] - 1s 72us/step - loss: 20280.5825
Epoch 20/30
8000/8000 [==============================] - 1s 71us/step - loss: 20254.1126
Epoch 21/30
8000/8000 [==============================] - 1s 64us/step - loss: 20241.2899
Epoch 22/30
8000/8000 [==============================] - 1s 73us/step - loss: 20222.6653
Epoch 23/30
8000/8000 [==============================] - 1s 72us/step - loss: 20210.8417
Epoch 24/30
8000/8000 [==============================] - 1s 68us/step - loss: 20197.6768
Epoch 25/30
8000/8000 [==============================] - 1s 68us/step - loss: 20184.4013
Epoch 26/30
8000/8000 [==============================] - 1s 70us/step - loss: 20172.1834
Epoch 27/30
8000/8000 [==============================] - 1s 68us/step - loss: 20162.7518
Epoch 28/30
8000/8000 [==============================] - 1s 71us/step - loss: 20154.7186
Epoch 29/30
8000/8000 [==============================] - 1s 69us/step - loss: 20144.8681
Epoch 30/30
8000/8000 [==============================] - 1s 74us/step - loss: 20134.1736
correlation with isomap       : 0.901432
correlation with isomap (test): 0.831927

20newsgroups embedding

In [8]:
## load the data and transform it into a tf-idf representation
categories = [
newsgroups_train = fetch_20newsgroups(subset='train', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
# store in dicts (if the text contains more than 3 words)
textdict = {i: t for i, t in enumerate( if len(t.split()) > 3}
textdict.update({i: t for i, t in enumerate(, len( if len(t.split()) > 3})
train_ids = [i for i in range(len( if i in textdict]
test_ids = [i for i in range(len(, len(textdict)) if i in textdict]
print("%i training and %i test samples" % (len(train_ids), len(test_ids)))
# transform into tf-idf features
ft = FeatureTransform(norm='max', weight=True, renorm='max')
docfeats = ft.texts2features(textdict, fit_ids=train_ids)
# organize in feature matrix
X, featurenames = features2mat(docfeats, train_ids)
X_test, _ = features2mat(docfeats, test_ids, featurenames)
print("%i features" % len(featurenames))
targets = np.hstack([,])
y = targets[train_ids]
y_test = targets[test_ids]
target_names = newsgroups_train.target_names

3959 training and 2359 test samples
45813 features

In [9]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_20news(X_embed, y, target_names, X_embed_test, y_test,
            title='20newsgroups - linear Kernel PCA', legend=True)

In [10]:
# compute linear kernel and center
K_lin = center_K(
# project to 2d with linear similarity encoder
# careful: our input is sparse!!!
inputs = Input(shape=(X.shape[1],), sparse=True)
# layer for the 2d embedding
embedding = Dense(2, activation='linear')(inputs)
# layer for the predicted similarities
outputs = Dense(K_lin.shape[1], activation='linear')(embedding)
# put it all into a model
model = Model(inputs=inputs, outputs=outputs)
# compile the model to minimize the MSE
opt = keras.optimizers.SGD(lr=50.)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated, K_lin, epochs=20)
# after training is complete, we loose the last layer (+ activation) to only get the embedding
model2 = Model(inputs=inputs, outputs=embedding)
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_20news(X_embeds, y, target_names, X_embed_tests, y_test,
            title='20 newsgroups - SimEc (lin. kernel, linear)')
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])

Epoch 1/20
3959/3959 [==============================] - 1s 135us/step - loss: 0.0428
Epoch 2/20
3959/3959 [==============================] - 0s 109us/step - loss: 0.0356
Epoch 3/20
3959/3959 [==============================] - 0s 109us/step - loss: 0.0343
Epoch 4/20
3959/3959 [==============================] - 0s 110us/step - loss: 0.0343
Epoch 5/20
3959/3959 [==============================] - 0s 112us/step - loss: 0.0342
Epoch 6/20
3959/3959 [==============================] - 0s 118us/step - loss: 0.0338
Epoch 7/20
3959/3959 [==============================] - 0s 111us/step - loss: 0.0335
Epoch 8/20
3959/3959 [==============================] - 0s 112us/step - loss: 0.0334
Epoch 9/20
3959/3959 [==============================] - 0s 110us/step - loss: 0.0334
Epoch 10/20
3959/3959 [==============================] - 0s 116us/step - loss: 0.0333
Epoch 11/20
3959/3959 [==============================] - 0s 113us/step - loss: 0.0333
Epoch 12/20
3959/3959 [==============================] - 0s 121us/step - loss: 0.0333
Epoch 13/20
3959/3959 [==============================] - 0s 115us/step - loss: 0.0333
Epoch 14/20
3959/3959 [==============================] - 0s 114us/step - loss: 0.0333
Epoch 15/20
3959/3959 [==============================] - 0s 116us/step - loss: 0.0333
Epoch 16/20
3959/3959 [==============================] - 0s 112us/step - loss: 0.0333
Epoch 17/20
3959/3959 [==============================] - 0s 117us/step - loss: 0.0333
Epoch 18/20
3959/3959 [==============================] - 0s 121us/step - loss: 0.0333
Epoch 19/20
3959/3959 [==============================] - 0s 117us/step - loss: 0.0333
Epoch 20/20
3959/3959 [==============================] - 0s 115us/step - loss: 0.0333
correlation with lin kPCA       : 0.993308
correlation with lin kPCA (test): 0.995331

In [ ]: