Similarity Encoders with Keras


In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
np.random.seed(28)
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_mldata, fetch_20newsgroups

import tensorflow as tf
tf.set_random_seed(28)
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation

# https://github.com/cod3licious/nlputils
from nlputils.features import FeatureTransform, features2mat

from simec import LastLayerReg
from utils import center_K, check_embed_match, check_similarity_match
from utils_plotting import plot_mnist, plot_20news

%matplotlib inline
%load_ext autoreload
%autoreload 2


/home/franzi/anaconda2/envs/python36/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

MNIST with Linear Kernel


In [2]:
# load digits
mnist = fetch_mldata('MNIST original', data_home='data')
X = mnist.data/255.  # normalize to 0-1
y = np.array(mnist.target, dtype=int)
# subsample 10000 random data points
np.random.seed(42)
n_samples = 10000
n_test = 2000
rnd_idx = np.random.permutation(X.shape[0])[:n_samples]
X_test, y_test = X[rnd_idx[:n_test],:], y[rnd_idx[:n_test]]
X, y = X[rnd_idx[n_test:],:], y[rnd_idx[n_test:]]
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape

In [3]:
# centered linear kernel matrix
K_lin = center_K(np.dot(X, X.T))

In [4]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - linear Kernel PCA')
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embed, K_lin))


error similarity match: msqe: 51.4531662441 ; r^2: 0.4438507851 ; rho: 0.6341602248

In [5]:
# on how many target similarities you want to train - faster and works equally well than training on all
n_targets = 1000  # K_lin.shape[1]
# initialize the model
model = Sequential()
# add the linear layer that maps to your embedding
model.add(Dense(2, input_shape=(X.shape[1],), kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(Activation('linear'))
# add another linear layer to get the linear approximation of the target similarities
model.add(Dense(n_targets, kernel_regularizer=LastLayerReg(s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets])))
model.add(Activation('linear'))
# compile the model to minimize the MSE
opt = keras.optimizers.Adamax(lr=0.005)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated
model.fit(X, K_lin[:,:n_targets], epochs=25)
# after training is complete, we loose the last layer (+ activation) to only get the embedding
model2 = Sequential(model.layers[:-2])
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, linear)')
# correlation with the embedding produced by the spectral method should be high
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# similarity match error should be similar to the one from kpca
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))


Epoch 1/25
8000/8000 [==============================] - 1s 169us/step - loss: 117.2756
Epoch 2/25
8000/8000 [==============================] - 1s 99us/step - loss: 99.2442
Epoch 3/25
8000/8000 [==============================] - 1s 96us/step - loss: 98.4770
Epoch 4/25
8000/8000 [==============================] - 1s 99us/step - loss: 97.9257
Epoch 5/25
8000/8000 [==============================] - 1s 98us/step - loss: 97.2574
Epoch 6/25
8000/8000 [==============================] - 1s 95us/step - loss: 96.3561
Epoch 7/25
8000/8000 [==============================] - 1s 97us/step - loss: 95.0996
Epoch 8/25
8000/8000 [==============================] - 1s 101us/step - loss: 93.2983
Epoch 9/25
8000/8000 [==============================] - 1s 101us/step - loss: 90.7463
Epoch 10/25
8000/8000 [==============================] - 1s 94us/step - loss: 87.4388
Epoch 11/25
8000/8000 [==============================] - 1s 96us/step - loss: 83.8674
Epoch 12/25
8000/8000 [==============================] - 1s 96us/step - loss: 81.0287
Epoch 13/25
8000/8000 [==============================] - 1s 97us/step - loss: 79.5132
Epoch 14/25
8000/8000 [==============================] - 1s 96us/step - loss: 78.9788
Epoch 15/25
8000/8000 [==============================] - 1s 94us/step - loss: 78.8436
Epoch 16/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.8110
Epoch 17/25
8000/8000 [==============================] - 1s 95us/step - loss: 78.8038
Epoch 18/25
8000/8000 [==============================] - 1s 104us/step - loss: 78.8016
Epoch 19/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.7990
Epoch 20/25
8000/8000 [==============================] - 1s 96us/step - loss: 78.7984
Epoch 21/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.7973
Epoch 22/25
8000/8000 [==============================] - 1s 109us/step - loss: 78.7977
Epoch 23/25
8000/8000 [==============================] - 1s 98us/step - loss: 78.7953
Epoch 24/25
8000/8000 [==============================] - 1s 97us/step - loss: 78.7977
Epoch 25/25
8000/8000 [==============================] - 1s 96us/step - loss: 78.7962
correlation with lin kPCA       : 0.980925
correlation with lin kPCA (test): 0.982280
error similarity match: msqe: 51.6534822241 ; r^2: 0.4416858158 ; rho: 0.6324372691

Non-linear MNIST embedding with isomap


In [6]:
# isomap
isomap = Isomap(n_neighbors=10, n_components=2)
X_embed = isomap.fit_transform(X)
X_embed_test = isomap.transform(X_test)
plot_mnist(X_embed, y, X_embed_test, y_test, title='MNIST - isomap')



In [7]:
# non-linear SimEc to approximate isomap solution
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
n_targets = 1000
# initialize the model
model = Sequential()
# optionally add some non-linear layers to the feed forward NN
model.add(Dense(20, input_shape=(X.shape[1],), kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(Activation('tanh'))
# add the linear layer that maps to your embedding
# in this case we want to visualize our data so the embedding dim is 2
model.add(Dense(2, kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(Activation('linear'))
# add another linear layer to get the linear approximation of the target similarities
model.add(Dense(n_targets))
model.add(Activation('linear'))
# compile the model to minimize the MSE
opt = keras.optimizers.Adamax(lr=0.005)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated
model.fit(X, K_geod[:,:n_targets], epochs=30)
# after training is complete, we loose the last layer (+ activation) to only get the embedding
model2 = Sequential(model.layers[:-2])
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 1 h.l.)')
print("correlation with isomap       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])


Epoch 1/30
8000/8000 [==============================] - 1s 86us/step - loss: 34893.1082
Epoch 2/30
8000/8000 [==============================] - 1s 70us/step - loss: 31729.7535
Epoch 3/30
8000/8000 [==============================] - 1s 71us/step - loss: 29316.6960
Epoch 4/30
8000/8000 [==============================] - 1s 69us/step - loss: 27526.3431
Epoch 5/30
8000/8000 [==============================] - 1s 69us/step - loss: 25319.7543
Epoch 6/30
8000/8000 [==============================] - 1s 67us/step - loss: 23184.3603
Epoch 7/30
8000/8000 [==============================] - 1s 69us/step - loss: 22018.5334
Epoch 8/30
8000/8000 [==============================] - 1s 69us/step - loss: 21426.3431
Epoch 9/30
8000/8000 [==============================] - 1s 69us/step - loss: 21077.3772
Epoch 10/30
8000/8000 [==============================] - 1s 72us/step - loss: 20854.1272
Epoch 11/30
8000/8000 [==============================] - 1s 65us/step - loss: 20707.7457
Epoch 12/30
8000/8000 [==============================] - 1s 68us/step - loss: 20604.4523
Epoch 13/30
8000/8000 [==============================] - 1s 72us/step - loss: 20528.2652
Epoch 14/30
8000/8000 [==============================] - 1s 71us/step - loss: 20462.0827
Epoch 15/30
8000/8000 [==============================] - 1s 64us/step - loss: 20409.4992
Epoch 16/30
8000/8000 [==============================] - 1s 65us/step - loss: 20366.3701
Epoch 17/30
8000/8000 [==============================] - 1s 73us/step - loss: 20332.8203
Epoch 18/30
8000/8000 [==============================] - 1s 71us/step - loss: 20303.3159
Epoch 19/30
8000/8000 [==============================] - 1s 72us/step - loss: 20280.5825
Epoch 20/30
8000/8000 [==============================] - 1s 71us/step - loss: 20254.1126
Epoch 21/30
8000/8000 [==============================] - 1s 64us/step - loss: 20241.2899
Epoch 22/30
8000/8000 [==============================] - 1s 73us/step - loss: 20222.6653
Epoch 23/30
8000/8000 [==============================] - 1s 72us/step - loss: 20210.8417
Epoch 24/30
8000/8000 [==============================] - 1s 68us/step - loss: 20197.6768
Epoch 25/30
8000/8000 [==============================] - 1s 68us/step - loss: 20184.4013
Epoch 26/30
8000/8000 [==============================] - 1s 70us/step - loss: 20172.1834
Epoch 27/30
8000/8000 [==============================] - 1s 68us/step - loss: 20162.7518
Epoch 28/30
8000/8000 [==============================] - 1s 71us/step - loss: 20154.7186
Epoch 29/30
8000/8000 [==============================] - 1s 69us/step - loss: 20144.8681
Epoch 30/30
8000/8000 [==============================] - 1s 74us/step - loss: 20134.1736
correlation with isomap       : 0.901432
correlation with isomap (test): 0.831927

20newsgroups embedding


In [8]:
## load the data and transform it into a tf-idf representation
categories = [
    "comp.graphics",
    "rec.autos",
    "rec.sport.baseball",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns"
]
newsgroups_train = fetch_20newsgroups(subset='train', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
# store in dicts (if the text contains more than 3 words)
textdict = {i: t for i, t in enumerate(newsgroups_train.data) if len(t.split()) > 3}
textdict.update({i: t for i, t in enumerate(newsgroups_test.data, len(newsgroups_train.data)) if len(t.split()) > 3})
train_ids = [i for i in range(len(newsgroups_train.data)) if i in textdict]
test_ids = [i for i in range(len(newsgroups_train.data), len(textdict)) if i in textdict]
print("%i training and %i test samples" % (len(train_ids), len(test_ids)))
# transform into tf-idf features
ft = FeatureTransform(norm='max', weight=True, renorm='max')
docfeats = ft.texts2features(textdict, fit_ids=train_ids)
# organize in feature matrix
X, featurenames = features2mat(docfeats, train_ids)
X_test, _ = features2mat(docfeats, test_ids, featurenames)
print("%i features" % len(featurenames))
targets = np.hstack([newsgroups_train.target,newsgroups_test.target])
y = targets[train_ids]
y_test = targets[test_ids]
target_names = newsgroups_train.target_names


3959 training and 2359 test samples
45813 features

In [9]:
# linear kPCA
kpca = KernelPCA(n_components=2, kernel='linear')
X_embed = kpca.fit_transform(X)
X_embed_test = kpca.transform(X_test)
plot_20news(X_embed, y, target_names, X_embed_test, y_test,
            title='20newsgroups - linear Kernel PCA', legend=True)



In [10]:
# compute linear kernel and center
K_lin = center_K(X.dot(X.T).A)
# project to 2d with linear similarity encoder
# careful: our input is sparse!!!
inputs = Input(shape=(X.shape[1],), sparse=True)
# layer for the 2d embedding
embedding = Dense(2, activation='linear')(inputs)
# layer for the predicted similarities
outputs = Dense(K_lin.shape[1], activation='linear')(embedding)
# put it all into a model
model = Model(inputs=inputs, outputs=outputs)
# compile the model to minimize the MSE
opt = keras.optimizers.SGD(lr=50.)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated
model.fit(X, K_lin, epochs=20)
# after training is complete, we loose the last layer (+ activation) to only get the embedding
model2 = Model(inputs=inputs, outputs=embedding)
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_20news(X_embeds, y, target_names, X_embed_tests, y_test,
            title='20 newsgroups - SimEc (lin. kernel, linear)')
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])


Epoch 1/20
3959/3959 [==============================] - 1s 135us/step - loss: 0.0428
Epoch 2/20
3959/3959 [==============================] - 0s 109us/step - loss: 0.0356
Epoch 3/20
3959/3959 [==============================] - 0s 109us/step - loss: 0.0343
Epoch 4/20
3959/3959 [==============================] - 0s 110us/step - loss: 0.0343
Epoch 5/20
3959/3959 [==============================] - 0s 112us/step - loss: 0.0342
Epoch 6/20
3959/3959 [==============================] - 0s 118us/step - loss: 0.0338
Epoch 7/20
3959/3959 [==============================] - 0s 111us/step - loss: 0.0335
Epoch 8/20
3959/3959 [==============================] - 0s 112us/step - loss: 0.0334
Epoch 9/20
3959/3959 [==============================] - 0s 110us/step - loss: 0.0334
Epoch 10/20
3959/3959 [==============================] - 0s 116us/step - loss: 0.0333
Epoch 11/20
3959/3959 [==============================] - 0s 113us/step - loss: 0.0333
Epoch 12/20
3959/3959 [==============================] - 0s 121us/step - loss: 0.0333
Epoch 13/20
3959/3959 [==============================] - 0s 115us/step - loss: 0.0333
Epoch 14/20
3959/3959 [==============================] - 0s 114us/step - loss: 0.0333
Epoch 15/20
3959/3959 [==============================] - 0s 116us/step - loss: 0.0333
Epoch 16/20
3959/3959 [==============================] - 0s 112us/step - loss: 0.0333
Epoch 17/20
3959/3959 [==============================] - 0s 117us/step - loss: 0.0333
Epoch 18/20
3959/3959 [==============================] - 0s 121us/step - loss: 0.0333
Epoch 19/20
3959/3959 [==============================] - 0s 117us/step - loss: 0.0333
Epoch 20/20
3959/3959 [==============================] - 0s 115us/step - loss: 0.0333
correlation with lin kPCA       : 0.993308
correlation with lin kPCA (test): 0.995331

In [ ]: