Understanding Latent Neural Spaces

https://www.sfdatainstitute.org/

Experiments

  1. Add one airport to a manual embedding
  2. Add one airport description here and let it train into the embedding

More notebooks

  • TODO: AE notebooks, including advanced ones

In [1]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)  # silence TF 1.x info/warning logs
print(tf.__version__)


1.13.0-rc1

Challenge: You have a set of airports and want to bring them into a numerical representation so that they can be processed with neural networks. How do you do that?
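
The most naive numerical representation is a one-hot vector per airport. The sketch below (not part of the original notebook) shows the idea: it is valid network input, but every airport is equally distant from every other one, so no similarity structure is captured. The learned embeddings below address exactly that.

In [0]:
import numpy as np

# Sketch: one-hot encoding as the naive baseline representation.
codes = ['HAM', 'TXL', 'FRA']      # a few example airports
one_hot = np.eye(len(codes))       # identity matrix: one row per airport
for code, vector in zip(codes, one_hot):
    print(code, vector)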


In [0]:
# https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic

# value format: [description, annual passengers in millions]
airports = {
 'HAM': ["germany europe regional", 18],
 'TXL': ["germany europe regional", 21],
 'FRA': ["germany europe hub", 70],
 'MUC': ["germany europe hub", 46],
 'CPH': ["denmark capital scandinavia europe hub", 29],
 'ARN': ["sweden capital scandinavia europe regional", 27],
 'BGO': ["norway scandinavia europe regional", 6],
 'OSL': ["norway capital scandinavia europe regional", 29],
 'LHR': ["gb capital europe hub", 80],
 'CDG': ["france capital europe hub", 72],
 'SFO': ["usa california regional", 58],
 'IAD': ["usa capital regional", 21],
 'AUS': ["usa texas regional", 16],
 'EWR': ["usa new_jersey hub", 46],
 'JFK': ["usa new_york hub", 62],
 'ATL': ["usa georgia hub", 110],
 'STL': ["usa missouri regional", 16],
 'LAX': ["usa california hub", 88]
}

In [0]:
airport_names = list(airports.keys())
airport_numbers = list(range(len(airports)))
airport_to_number = dict(zip(airport_names, airport_numbers))
number_to_airport = dict(zip(airport_numbers, airport_names))
airport_descriptions = [value[0] for value in airports.values()]
airport_passengers = [value[1] for value in airports.values()]
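
As a quick sanity check (a sketch, relying on the insertion order of the airports dict), the lookup tables map codes to indices and back:

In [0]:
# Sketch: round-trip through the lookup tables.
assert airport_to_number['HAM'] == 0   # HAM is the first entry
assert number_to_airport[0] == 'HAM'
print(airport_to_number['FRA'])        # 2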

Encode the texts as multi-hot frequency vectors


In [4]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(airport_descriptions)
description_matrix = tokenizer.texts_to_matrix(airport_descriptions, mode='freq')

airport_count, word_count = description_matrix.shape
dictionary_size = word_count
airport_count, word_count


Out[4]:
(18, 20)
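
Each description becomes a row of 20 values: a reserved column at index 0 (a Keras convention) plus 19 vocabulary words, where each entry holds the word's relative frequency within that description. Note that the Tokenizer's default filters split tokens like new_york at the underscore, which is why the vocabulary ends up with 19 words. A quick inspection (a sketch, not part of the original run):

In [0]:
# Sketch: look at the learned vocabulary and one encoded description.
print(tokenizer.word_index)        # word -> index (1-based)
print(airport_descriptions[0])     # 'germany europe regional'
print(description_matrix[0])       # its frequency row; entries sum to 1.0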

In [0]:
x = airport_numbers       # input: one integer index per airport
Y = description_matrix    # target: the word-frequency vector of its description

2D embeddings

The model below maps each airport index through a 2-dimensional embedding layer and is trained to reconstruct the word frequencies of that airport's description. Once trained, the embedding layer yields a 2D coordinate for every airport, which we can plot directly.


In [6]:
import matplotlib.pyplot as plt

from tensorflow.keras.layers import GlobalAveragePooling1D, Dense, Embedding
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.initializers import glorot_normal
seed = 3

input_dim = len(airports)
embedding_dim = 2

model = Sequential()

# Look up each airport index in a trainable embedding table.
model.add(Embedding(name='embedding',
                    input_dim=input_dim, 
                    output_dim=embedding_dim, 
                    input_length=1,
                    embeddings_initializer=glorot_normal(seed=seed)))

# Collapse the (batch, 1, embedding_dim) output to (batch, embedding_dim).
model.add(GlobalAveragePooling1D())

model.add(Dense(units=50, activation='relu', bias_initializer='zeros',
                kernel_initializer=glorot_normal(seed=seed)))

# Reconstruct the word-frequency vector of the airport's description.
model.add(Dense(units=dictionary_size, name='output', activation='softmax',
                bias_initializer='zeros',
                kernel_initializer=glorot_normal(seed=seed)))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

EPOCHS=1000
BATCH_SIZE=2

%time history = model.fit(x, Y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)


plt.yscale('log')
plt.plot(history.history['loss'])


CPU times: user 15.8 s, sys: 712 ms, total: 16.5 s
Wall time: 12.2 s

In [7]:
loss, accuracy = model.evaluate(x, Y)
loss, accuracy


18/18 [==============================] - 0s 2ms/sample - loss: 0.1053 - acc: 0.8194
Out[7]:
(0.10528630018234253, 0.8194444)
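
To see what the trained model reconstructs (a sketch, not part of the original run), feed in one airport index and inspect the top-scoring words of the predicted description:

In [0]:
import numpy as np

# Sketch: which description words does the model predict for FRA?
index_to_word = {i: w for w, i in tokenizer.word_index.items()}
prediction = model.predict(np.array([airport_to_number['FRA']]))[0]
top = np.argsort(prediction)[::-1][:3]
print([(index_to_word.get(i), round(float(prediction[i]), 3)) for i in top])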

In [8]:
embedding_layer = model.get_layer('embedding')
embedding_model = Model(inputs=model.input, outputs=embedding_layer.output)
embeddings_2d = embedding_model.predict(airport_numbers).reshape(-1, 2)

# for printing only
# plt.figure(dpi=600)
plt.axis('off')
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
for name, x_pos, y_pos in zip(airport_names, embeddings_2d[:, 0], embeddings_2d[:, 1]):
  print(name, (x_pos, y_pos))
  plt.annotate(name, (x_pos, y_pos))


HAM (-0.13915496, -1.1113863)
TXL (-0.13627577, -0.9834647)
FRA (-0.4516137, 0.1212538)
MUC (-0.45013693, 0.12262177)
CPH (0.46797395, 1.5072246)
ARN (0.53808963, -0.1485416)
BGO (0.23179582, -0.25896809)
OSL (0.29200092, 0.019353922)
LHR (-0.27203402, 0.73110664)
CDG (-0.93478554, 1.6889447)
SFO (1.7859012, -0.75303876)
IAD (1.4758574, -0.0042744256)
AUS (-1.7030097, -2.1409686)
EWR (-1.4294101, 0.47971565)
JFK (-1.9295005, 1.3067832)
ATL (1.8216516, 1.2266517)
STL (-1.335406, -0.9980086)
LAX (-1.2161198, -0.059176665)
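
A quick sanity check on the 2D space (a sketch using the embeddings_2d array from above): compute pairwise Euclidean distances and report each airport's nearest neighbour. FRA and MUC, which share the description "germany europe hub", land almost on top of each other, as the printed coordinates already show.

In [0]:
import numpy as np

# Sketch: nearest neighbour of each airport in the learned 2D space.
diffs = embeddings_2d[:, None, :] - embeddings_2d[None, :, :]
distances = np.linalg.norm(diffs, axis=-1)
np.fill_diagonal(distances, np.inf)   # exclude self-matches
for i, name in enumerate(airport_names):
    print(name, '->', airport_names[int(np.argmin(distances[i]))])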

1D embeddings

The same setup with embedding_dim = 1: every airport is now forced onto a single line, which makes the learned ordering easy to inspect.


In [9]:
seed = 3

input_dim = len(airports)
embedding_dim = 1

model = Sequential()

model.add(Embedding(name='embedding',
                    input_dim=input_dim, 
                    output_dim=embedding_dim, 
                    input_length=1,
                    embeddings_initializer=glorot_normal(seed=seed)))

model.add(GlobalAveragePooling1D())

model.add(Dense(units=50, activation='relu', bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))

model.add(Dense(units=dictionary_size, name='output', activation='softmax', bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

EPOCHS=1500
BATCH_SIZE=2

%time history = model.fit(x, Y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)


plt.yscale('log')
plt.plot(history.history['loss'])


CPU times: user 23.5 s, sys: 974 ms, total: 24.5 s
Wall time: 17.9 s
Out[9]:
[<matplotlib.lines.Line2D at 0x7fbfcd8fa400>]

In [16]:
import numpy as np

embedding_layer = model.get_layer('embedding')
embedding_model = Model(inputs=model.input, outputs=embedding_layer.output)
embeddings_1d = embedding_model.predict(airport_numbers).reshape(-1)

# for printing only
# plt.figure(figsize=(20,5))
# plt.figure(dpi=600)
plt.axis('off')
plt.scatter(embeddings_1d, np.zeros(len(embeddings_1d)))
for name, x_pos in zip(airport_names, embeddings_1d):
  print(name, x_pos)
  plt.annotate(name, (x_pos, 0), rotation=80)


HAM 0.23612742
TXL 0.23696959
FRA 0.501017
MUC 0.50058854
CPH 0.6619179
ARN -0.33543202
BGO -0.070580386
OSL -0.17660478
LHR 3.2895024
CDG 3.798196
SFO -0.98354906
IAD -0.74012494
AUS -1.3230947
EWR -2.931522
JFK -3.5054133
ATL 1.7543058
STL -1.5092196
LAX 1.3219032
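
With a single number per airport, the ordering along the line can be read off directly (a sketch using the embeddings_1d array from above):

In [0]:
# Sketch: airports sorted by their position on the 1D embedding axis.
for position, name in sorted(zip(embeddings_1d, airport_names)):
    print(round(float(position), 3), name)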
