Basic assumption: airlines flying similar routes are similar
In [1]:
!curl -O https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat
In [0]:
# pd.read_csv?
In [3]:
import pandas as pd
df = pd.read_csv('routes.dat', quotechar="'", sep=',', encoding='utf-8', header=None, na_values='\\N',
                 names=['Airline', 'Airline ID', 'Source airport', 'Source airport ID',
                        'Destination airport', 'Destination airport ID', 'Codeshare', 'Stops', 'Equipment'])
# https://openflights.org/data.html#route
# Airline 2-letter (IATA) or 3-letter (ICAO) code of the airline.
# Airline ID Unique OpenFlights identifier for airline (see Airline).
# Source airport 3-letter (IATA) or 4-letter (ICAO) code of the source airport.
# Source airport ID Unique OpenFlights identifier for source airport (see Airport)
# Destination airport 3-letter (IATA) or 4-letter (ICAO) code of the destination airport.
# Destination airport ID Unique OpenFlights identifier for destination airport (see Airport)
# Codeshare "Y" if this flight is a codeshare (that is, not operated by Airline, but another carrier), empty otherwise.
# Stops Number of stops on this flight ("0" for direct)
# Equipment 3-letter codes for plane type(s) generally used on this flight, separated by spaces
# df[df['Stops'] == 1] yields only a dozen or so routes, so we drop that column as well
df.drop(['Airline ID', 'Source airport ID', 'Destination airport ID', 'Codeshare', 'Equipment', 'Stops'], axis='columns', inplace=True)
len(df)
Out[3]:
In [4]:
df.head()
Out[4]:
In [5]:
sources = df['Source airport'].unique()
len(sources)
Out[5]:
In [6]:
destinations = df['Destination airport'].unique()
len(destinations)
Out[6]:
In [7]:
airlines = df['Airline'].unique()
len(airlines)
Out[7]:
In [8]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)
In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
airline_tokenizer = Tokenizer()
airline_tokenizer.fit_on_texts(df['Airline'])
import numpy as np
encoded_airlines = np.array(airline_tokenizer.texts_to_sequences(df['Airline'])).reshape(-1)
encoded_airlines
Out[9]:
In [10]:
len(encoded_airlines)
Out[10]:
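The Tokenizer lowercases the airline codes and assigns indices starting at 1 (index 0 is reserved). A quick sanity check of the mapping, using Lufthansa's IATA code LH as an example:
In [0]:
# the tokenizer stores its keys lowercased, so look up 'lh', not 'LH'
airline_tokenizer.word_index['lh']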
In [11]:
routes = df[['Source airport', 'Destination airport']].apply(lambda x: ' '.join(x), axis=1)
routes.head()
Out[11]:
In [0]:
routes_tokenizer = Tokenizer()
routes_tokenizer.fit_on_texts(routes)
encoded_routes = np.array(routes_tokenizer.texts_to_sequences(routes))
In [13]:
# should be a bit more than 3,400, since source and destination airports come from the same set
# (the +1 is for the index 0 that Keras reserves)
output_dim = len(routes_tokenizer.word_index) + 1
output_dim
Out[13]:
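Since both columns were tokenized together, an airport receives one shared index whether it appears as a source or a destination; a quick spot check (assuming FRA occurs in the data):
In [0]:
# one shared index per airport code, regardless of column
routes_tokenizer.word_index['fra']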
In [14]:
encoded_routes[0]
Out[14]:
In [15]:
len(encoded_routes)
Out[15]:
In [0]:
from tensorflow.keras.utils import to_categorical
# x: the sequence of airlines, each encoded as a unique number
x = encoded_airlines
# Y: the sequence of (source, destination) pairs, each airport encoded as a unique number,
# one-hot encoded to shape (number of routes, 2, output_dim)
Y = to_categorical(encoded_routes)
# for now just the source
# Y = to_categorical(encoded_routes[:, 0])
In [17]:
Y[0]
Out[17]:
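Each target is thus a pair of one-hot vectors, one per airport slot, so Y should have shape (number of routes, 2, output_dim) while x stays a flat vector of airline indices:
In [0]:
# x: (num_routes,) airline indices; Y: (num_routes, 2, output_dim) one-hot targets
x.shape, Y.shape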
In [18]:
%%time
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Flatten, GlobalAveragePooling1D, Dense, LSTM, GRU, SimpleRNN, Bidirectional, Embedding, RepeatVector
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.initializers import glorot_normal
seed = 3
input_dim = len(airlines) + 1
embedding_dim = 2
model = Sequential()
model.add(Embedding(name='embedding',
                    input_dim=input_dim,
                    output_dim=embedding_dim,
                    input_length=1,
                    embeddings_initializer=glorot_normal(seed=seed)))
# https://stackoverflow.com/questions/49295311/what-is-the-difference-between-flatten-and-globalaveragepooling2d-in-keras
# averages over all (global) embedding values
# model.add(GlobalAveragePooling1D())
model.add(Flatten())
model.add(Dense(units=50, activation='relu', bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
# repeat the latent vector twice, one timestep each for source and destination
model.add(RepeatVector(2))
model.add(SimpleRNN(units=50, return_sequences=True, bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
model.add(Dense(units=output_dim, name='output', activation='softmax', bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
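The network reads like a small encoder/decoder: the airline is squeezed through the 2-dimensional embedding, a dense layer expands it again, and RepeatVector(2) feeds the result to the RNN as two timesteps, one per predicted airport slot (source, then destination), each ending in a softmax over all airports. A quick shape check, assuming the model compiled as above:
In [0]:
# one airline in, one softmax distribution over airports per slot out
model.predict(np.array([x[0]])).shape  # expected: (1, 2, output_dim)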
In [19]:
model.predict(np.array([x[0]]))
Out[19]:
In [20]:
Y[0]
Out[20]:
In [21]:
%%time
EPOCHS=25
BATCH_SIZE=10
history = model.fit(x, Y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
In [22]:
loss, accuracy = model.evaluate(x, Y, batch_size=BATCH_SIZE)
loss, accuracy
Out[22]:
In [0]:
# plt.yscale('log')
plt.plot(history.history['loss'])
In [0]:
# plt.yscale('log')
plt.plot(history.history['acc'])
In [0]:
samples = pd.DataFrame(encoded_airlines).sample(n=200).values.reshape(-1)
In [0]:
# https://en.wikipedia.org/wiki/List_of_airline_codes
# https://en.wikipedia.org/wiki/List_of_largest_airlines_in_North_America
# https://en.wikipedia.org/wiki/List_of_largest_airlines_in_Europe
europe_airlines = ['LH', 'BA', 'SK', 'KL', 'AF', 'FR', 'SU', 'EW', 'TP', 'BT', 'U2']
us_airlines = ['AA', 'US', 'UA', 'WN', 'DL', 'AS', 'HA']
In [0]:
samples = [airline_tokenizer.word_index[airline_code.lower()] for airline_code in europe_airlines + us_airlines]
In [0]:
embedding_layer = model.get_layer('embedding')
embedding_model = Model(inputs=model.input, outputs=embedding_layer.output)
embeddings_2d = embedding_model.predict(samples).reshape(-1, 2)
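Equivalently, the learned vectors can be read directly from the layer's weight matrix instead of running a predict pass; a minimal sketch:
In [0]:
# the embedding matrix has shape (input_dim, embedding_dim);
# row i holds the vector for the airline with tokenizer index i
embedding_matrix = embedding_layer.get_weights()[0]
embedding_matrix[samples]  # the same vectors as embeddings_2d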
In [0]:
# for printing only
# plt.figure(dpi=600)
plt.axis('off')
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
for index, x_pos, y_pos in zip(samples, embeddings_2d[:, 0], embeddings_2d[:, 1]):
    name = airline_tokenizer.index_word[index].upper()
    # print(name, (x_pos, y_pos))
    plt.annotate(name, (x_pos, y_pos))
In [0]:
%%time
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Flatten, GlobalAveragePooling1D, Dense, LSTM, GRU, SimpleRNN, Bidirectional, Embedding, RepeatVector
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.initializers import glorot_normal
seed = 7
input_dim = len(airlines) + 1
embedding_dim = 1
model = Sequential()
model.add(Embedding(name='embedding',
                    input_dim=input_dim,
                    output_dim=embedding_dim,
                    input_length=1,
                    embeddings_initializer=glorot_normal(seed=seed)))
# model.add(GlobalAveragePooling1D())
model.add(Flatten())
model.add(Dense(units=50, activation='relu', bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
model.add(RepeatVector(2))
model.add(SimpleRNN(units=50, return_sequences=True, bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
model.add(Dense(units=output_dim, name='output', activation='softmax', bias_initializer='zeros', kernel_initializer=glorot_normal(seed=seed)))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
In [0]:
%%time
EPOCHS=20
BATCH_SIZE=10
history = model.fit(x, Y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
plt.yscale('log')
plt.plot(history.history['loss'])
plt.plot(history.history['acc'])
In [0]:
# we expect this to be substantially worse than the 2D version, as the bottleneck is now much narrower
loss, accuracy = model.evaluate(x, Y, batch_size=BATCH_SIZE)
loss, accuracy
Out[0]:
In [0]:
import numpy as np
embedding_layer = model.get_layer('embedding')
embedding_model = Model(inputs=model.input, outputs=embedding_layer.output)
embeddings_1d = embedding_model.predict(samples).reshape(-1)
# for printing only
# plt.figure(figsize=(20,5))
# plt.figure(dpi=600)
plt.axis('off')
plt.scatter(embeddings_1d, np.zeros(len(embeddings_1d)))
for index, x_pos in zip(samples, embeddings_1d):
    name = airline_tokenizer.index_word[index].upper()
    # print(name, x_pos)
    plt.annotate(name, (x_pos, 0), rotation=80)
In [0]:
from sklearn.decomposition import PCA
X = embeddings_2d
pca = PCA(n_components=2)
pca.fit(X)
Out[0]:
In [0]:
pca.explained_variance_ratio_
Out[0]:
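The ratios give the share of the embedding's variance captured by each principal component; if the first component dominates, a 1D projection loses little information. A quick check of the cumulative share:
In [0]:
# cumulative share of variance explained by the leading components
np.cumsum(pca.explained_variance_ratio_)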
In [0]:
X_transformed = pca.transform(X)
# for printing only
# plt.figure(dpi=600)
plt.axis('off')
plt.scatter(X_transformed[:, 0], X_transformed[:, 1])
for index, x_pos, y_pos in zip(samples, X_transformed[:, 0], X_transformed[:, 1]):
    name = airline_tokenizer.index_word[index].upper()
    # print(name, (x_pos, y_pos))
    plt.annotate(name, (x_pos, y_pos))
In [0]:
pca = PCA(n_components=1)
pca.fit(X)
X_transformed = pca.transform(X)
# for printing only
# plt.figure(figsize=(20,5))
# plt.figure(dpi=600)
plt.axis('off')
plt.scatter(X_transformed, np.zeros(len(X_transformed)))
for index, x_pos in zip(samples, X_transformed):
    name = airline_tokenizer.index_word[index].upper()
    # print(name, x_pos)
    plt.annotate(name, (x_pos, 0), rotation=80)
In [0]:
# https://en.wikipedia.org/wiki/List_of_airline_codes
# https://en.wikipedia.org/wiki/List_of_largest_airlines_in_North_America
# https://www.tvlon.com/resources/airlinecodes.htm
# https://en.wikipedia.org/wiki/List_of_largest_airlines_in_Europe
# annual passengers, in millions
airline_size = {
    'LH': 130, 'BA': 105, 'SK': 30, 'KL': 101, 'AF': 101, 'FR': 129,
    'SU': 56, 'EW': 24, 'TP': 16, 'BT': 4, 'U2': 88,
    'AA': 204, 'US': 204, 'UA': 158, 'WN': 164, 'DL': 192, 'AS': 46, 'HA': 12
}
sample_names = [airline_tokenizer.index_word[sample].upper() for sample in samples]
# scale from millions to absolute passenger counts
sample_sizes = [airline_size[name] * 1e6 for name in sample_names]
In [0]:
# for printing only
# plt.figure(figsize=(20,5))
# plt.figure(dpi=600)
# plt.axis('off')
plt.scatter(embeddings_1d, sample_sizes)
for name, x_pos, y_pos in zip(sample_names, embeddings_1d, sample_sizes):
    plt.annotate(name, (x_pos, y_pos))
In [0]:
from sklearn.preprocessing import StandardScaler
# scale both features to zero mean and unit variance so DBSCAN's
# Euclidean distance weights them equally
embeddings_1d_scaled = StandardScaler().fit_transform(embeddings_1d.reshape(-1, 1))
sizes_for_samples_scaled = StandardScaler().fit_transform(np.array(sample_sizes).reshape(-1, 1))
# column-stack the two scaled features into a (n_samples, 2) matrix
X = np.dstack((embeddings_1d_scaled.reshape(-1), sizes_for_samples_scaled.reshape(-1)))[0]
X_scaled = StandardScaler().fit_transform(X)
X_scaled
Out[0]:
In [0]:
from sklearn.cluster import DBSCAN
clf = DBSCAN(eps=0.75, min_samples=2)
clf.fit(X_scaled)
# DBSCAN assigns each sample a cluster id; -1 marks noise
clusters = clf.labels_.astype(np.int)
clusters
Out[0]:
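To see which carriers ended up together, the labels can be mapped back to the airline codes; a minimal sketch using sample_names and clusters from above:
In [0]:
# group airline codes by cluster id (-1 is noise)
for cluster_id in sorted(set(clusters)):
    members = [name for name, c in zip(sample_names, clusters) if c == cluster_id]
    print(cluster_id, members)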
In [0]:
import matplotlib.pyplot as plt
from itertools import cycle, islice
# append black as the last color, so that the noise label -1 indexes into it
colors = np.append(np.array(list(islice(cycle(['#AAAAFF', '#ff7f00', '#4daf4a',
                                               '#f781bf', '#a65628', '#984ea3',
                                               '#999999', '#e41a1c', '#dede00']),
                                        int(max(clusters) + 1)))), ['#000000'])
# plt.figure(dpi=600)
plt.xlabel('Similarity by typical routes')
plt.ylabel('Passengers')
plt.scatter(embeddings_1d, sample_sizes, color=colors[clusters], s=400)
for name, x_pos, y_pos in zip(sample_names, embeddings_1d, sample_sizes):
    plt.annotate(name, (x_pos, y_pos), fontsize=36)
In [0]:
X = StandardScaler().fit_transform(embeddings_2d)
clf = DBSCAN(eps=0.75, min_samples=2)
clf.fit(X)
clusters = clf.labels_.astype(np.int)
clusters
Out[0]:
In [0]:
# for printing only
# plt.figure(dpi=600)
# plt.axis('off')
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], color=colors[clusters], s=400)
# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=400)
for index, x_pos, y_pos in zip(samples, embeddings_2d[:, 0], embeddings_2d[:, 1]):
    name = airline_tokenizer.index_word[index].upper()
    # print(name, (x_pos, y_pos))
    plt.annotate(name, (x_pos, y_pos), fontsize=36)