In [1]:
from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import range
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_mldata, fetch_20newsgroups
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation
from nlputils.features import FeatureTransform, features2mat
from simec import LastLayerReg
from utils import center_K, check_embed_match, check_similarity_match
from utils_plotting import plot_mnist, plot_20news
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
# load digits
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22,
# so this cell only runs with an older scikit-learn release.
mnist = fetch_mldata('MNIST original', data_home='data')
# NOTE(review): the right-hand sides below were restored from context
# (pixel data scaled by 255, labels taken from the fetched bunch) - confirm.
X = mnist.data / 255.  # normalize to 0-1
y = np.array(mnist.target, dtype=int)
# subsample 10000 random data points, the first n_test of which form the test set
n_samples = 10000
n_test = 2000
# seeded local generator so the same subsample/split is drawn on every run
# without touching numpy's global random state
rng = np.random.RandomState(42)
rnd_idx = rng.permutation(X.shape[0])[:n_samples]
test_idx, train_idx = rnd_idx[:n_test], rnd_idx[n_test:]
X_test, y_test = X[test_idx, :], y[test_idx]
X, y = X[train_idx, :], y[train_idx]
# center the features (no variance scaling); the mean is estimated on the
# training split only and then applied to the test split
ss = StandardScaler(with_std=False)
X = ss.fit_transform(X)
X_test = ss.transform(X_test)
n_train, n_features = X.shape
In [3]:
# centered linear kernel matrix of the training data (n_train x n_train)
# NOTE(review): the first operand was restored as np.dot(X, X.T) from the
# surviving ", X.T))" fragment - confirm.
K_lin = center_K(np.dot(X, X.T))
In [4]:
# Baseline embedding: 2d kernel PCA with a linear kernel, fitted on the
# training data and then applied to the held-out test points
lin_kpca = KernelPCA(kernel='linear', n_components=2)
X_embed = lin_kpca.fit_transform(X)
X_embed_test = lin_kpca.transform(X_test)
plot_mnist(
    X_embed, y, X_embed_test, y_test,
    title='MNIST - linear Kernel PCA',
)
# report how well the embedding matches the target similarities in K_lin
# (the three returned values are printed as msqe, r^2 and rho)
sim_match = check_similarity_match(X_embed, K_lin)
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % sim_match)
In [5]:
# number of target similarities to train on - using a subset is faster and
# works as well as training on all of them
n_targets = 1000  # K_lin.shape[1]
# initialize the model
model = Sequential()
# add the linear layer that maps to your embedding
model.add(Dense(2, input_shape=(X.shape[1],), kernel_regularizer=keras.regularizers.l2(0.01)))
# add another linear layer to get the linear approximation of the target similarities
model.add(Dense(n_targets, kernel_regularizer=LastLayerReg(s_ll_reg=0.5, S_ll=K_lin[:n_targets,:n_targets])))
# compile the model to minimize the MSE
opt = keras.optimizers.Adamax(lr=0.005)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated
# NOTE(review): the "model.fit(X" prefix of this call was restored from the
# surviving argument list - confirm.
model.fit(X, K_lin[:,:n_targets], epochs=25)
# after training is complete, we lose only the last (similarity) layer to get the embedding.
# The model consists of exactly two Dense layers, so slicing with [:-2] would drop the
# embedding layer as well; instead expose the output of the second to last layer directly.
model2 = Model(inputs=model.input, outputs=model.layers[-2].output)
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (lin. kernel, linear)')
# correlation with the embedding produced by the spectral method should be high
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
# similarity match error should be similar to the one from kpca
print("error similarity match: msqe: %.10f ; r^2: %.10f ; rho: %.10f" % check_similarity_match(X_embeds, K_lin))
In [6]:
# Isomap baseline: 2d embedding built from a 10-nearest-neighbor graph;
# the fitted `isomap` object is kept because a later cell reads its dist_matrix_
isomap = Isomap(n_components=2, n_neighbors=10)
X_embed, X_embed_test = isomap.fit_transform(X), isomap.transform(X_test)
plot_mnist(
    X_embed, y, X_embed_test, y_test,
    title='MNIST - isomap',
)
In [7]:
# SimEc to approximate the isomap solution: the targets are the centered
# similarities derived from the squared geodesic distances
K_geod = center_K(-0.5*(isomap.dist_matrix_**2))
n_targets = 1000
# initialize the model
model = Sequential()
# hidden layer of the feed forward NN
# NOTE(review): no activation is set, so this layer is linear; pass e.g.
# activation='tanh' here if a non-linear mapping is intended.
model.add(Dense(20, input_shape=(X.shape[1],), kernel_regularizer=keras.regularizers.l2(0.01)))
# add the linear layer that maps to your embedding
# in this case we want to visualize our data so the embedding dim is 2
model.add(Dense(2, kernel_regularizer=keras.regularizers.l2(0.01)))
# add another linear layer to get the linear approximation of the target similarities
# NOTE(review): this layer was missing although the model is trained on
# n_targets outputs; the regularizer settings mirror the linear-kernel
# model and are an assumption - confirm.
model.add(Dense(n_targets, kernel_regularizer=LastLayerReg(s_ll_reg=0.5, S_ll=K_geod[:n_targets,:n_targets])))
# compile the model to minimize the MSE
opt = keras.optimizers.Adamax(lr=0.005)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated
# NOTE(review): the "model.fit(X" prefix of this call was restored from the
# surviving argument list - confirm.
model.fit(X, K_geod[:,:n_targets], epochs=30)
# after training is complete, we lose only the last (similarity) layer to get the embedding;
# slicing with [:-2] would also cut off the 2d embedding layer, so expose the
# output of the second to last layer directly
model2 = Model(inputs=model.input, outputs=model.layers[-2].output)
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_mnist(X_embeds, y, X_embed_tests, y_test, title='MNIST - SimEc (isomap, 1 h.l.)')
print("correlation with isomap       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with isomap (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
In [8]:
## load the data and transform it into a tf-idf representation
# NOTE(review): the entries of this list were lost; the selection below is an
# assumption and must be confirmed (categories=None would load all 20 groups).
categories = [
    "comp.graphics",
    "rec.autos",
    "rec.sport.baseball",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
]
newsgroups_train = fetch_20newsgroups(subset='train', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=(
    'headers', 'footers', 'quotes'), data_home='data', categories=categories, random_state=42)
n_train_docs = len(newsgroups_train.data)
n_test_docs = len(newsgroups_test.data)
# store in dicts (if the text contains more than 3 words);
# test documents get ids that continue after the training documents
textdict = {i: t for i, t in enumerate(newsgroups_train.data) if len(t.split()) > 3}
textdict.update({i: t for i, t in enumerate(newsgroups_test.data, n_train_docs) if len(t.split()) > 3})
train_ids = [i for i in range(n_train_docs) if i in textdict]
# the ids have gaps where short texts were filtered out, so the upper bound has to be
# the total number of documents - len(textdict) would cut off the last test documents
test_ids = [i for i in range(n_train_docs, n_train_docs + n_test_docs) if i in textdict]
print("%i training and %i test samples" % (len(train_ids), len(test_ids)))
# transform into tf-idf features
ft = FeatureTransform(norm='max', weight=True, renorm='max')
docfeats = ft.texts2features(textdict, fit_ids=train_ids)
# organize in feature matrix
X, featurenames = features2mat(docfeats, train_ids)
X_test, _ = features2mat(docfeats, test_ids, featurenames)
print("%i features" % len(featurenames))
# labels in the same id order as textdict: training targets first, then test targets
targets = np.hstack([newsgroups_train.target, newsgroups_test.target])
y = targets[train_ids]
y_test = targets[test_ids]
target_names = newsgroups_train.target_names
In [9]:
# Baseline for the text data: 2d kernel PCA with a linear kernel, fitted on
# the training documents and applied to the test documents
kpca_news = KernelPCA(kernel='linear', n_components=2)
X_embed = kpca_news.fit_transform(X)
X_embed_test = kpca_news.transform(X_test)
plot_20news(
    X_embed, y, target_names, X_embed_test, y_test,
    title='20newsgroups - linear Kernel PCA',
    legend=True,
)
In [10]:
# compute linear kernel and center
# NOTE(review): the argument of center_K was restored as the dense X X^T
# of the (presumably scipy sparse) feature matrix - confirm.
K_lin = X.dot(X.T)
if hasattr(K_lin, 'toarray'):
    # sparse product: densify before centering
    K_lin = K_lin.toarray()
K_lin = center_K(np.asarray(K_lin))
# project to 2d with linear similarity encoder
# careful: our input is sparse!!!
inputs = Input(shape=(X.shape[1],), sparse=True)
# layer for the 2d embedding
embedding = Dense(2, activation='linear')(inputs)
# layer for the predicted similarities
outputs = Dense(K_lin.shape[1], activation='linear')(embedding)
# put it all into a model
model = Model(inputs=inputs, outputs=outputs)
# compile the model to minimize the MSE
opt = keras.optimizers.SGD(lr=50.)
model.compile(optimizer=opt, loss='mse')
# train the model to get an embedding with which the target similarities
# can be linearly approximated
# NOTE(review): the "model.fit(X" prefix of this call was restored from the
# surviving argument list - confirm.
model.fit(X, K_lin, epochs=20)
# after training is complete, we lose the last layer to only get the embedding
model2 = Model(inputs=inputs, outputs=embedding)
# the (linear) activations of the second to last layer are our embedding
X_embeds = model2.predict(X)
X_embed_tests = model2.predict(X_test)
plot_20news(X_embeds, y, target_names, X_embed_tests, y_test,
            title='20 newsgroups - SimEc (lin. kernel, linear)')
print("correlation with lin kPCA       : %f" % check_embed_match(X_embed, X_embeds)[1])
print("correlation with lin kPCA (test): %f" % check_embed_match(X_embed_test, X_embed_tests)[1])
In [ ]: