In [1]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)
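Note: `tf.logging` exists only in TensorFlow 1.x, which is what this notebook targets. If you happen to run it under TensorFlow 2.x (an assumption, not the setup used here), a minimal sketch of the equivalent:
In [ ]:
# sketch: silence TF logging on both 1.x and 2.x (tf.logging was removed in 2.x)
if hasattr(tf, 'logging'):
    tf.logging.set_verbosity(tf.logging.ERROR)  # TF 1.x
else:
    tf.get_logger().setLevel('ERROR')           # TF 2.x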
In [2]:
!curl -O https://raw.githubusercontent.com/DJCordhose/deep-learning-crash-course-notebooks/master/data/insurance-customers-1500.csv
In [0]:
import pandas as pd
df = pd.read_csv('./insurance-customers-1500.csv', sep=';')
In [4]:
df.describe()
Out[4]:
In [5]:
df.head()
Out[5]:
In [6]:
import seaborn as sns
sample_df = df.sample(n=100, random_state=42)
sns.pairplot(sample_df,
             hue="group", palette={0: '#AA4444', 1: '#006000', 2: '#EEEE44'},
             # kind='reg',
             size=5,
             diag_kind='kde',
             vars=['age', 'speed', 'miles'])
Out[6]:
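Aside: seaborn 0.9 renamed the `size` argument of `pairplot` to `height`. If your seaborn is that new (an assumption), the same plot would look like this sketch:
In [ ]:
# same pairplot for seaborn >= 0.9, where size= became height=
sns.pairplot(sample_df,
             hue="group", palette={0: '#AA4444', 1: '#006000', 2: '#EEEE44'},
             height=5,
             diag_kind='kde',
             vars=['age', 'speed', 'miles'])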
In [7]:
from tensorflow import keras
from tensorflow.keras.layers import Input, Flatten, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential, Model
encoding_dim = 32
# https://keras.io/getting-started/functional-api-guide/
input_data = Input(shape=(4,))
encoded = Dense(units=encoding_dim, activation='relu', name="encoder")(input_data)
decoded = Dense(units=4, activation='linear', name="decoder")(encoded)
autoencoder = Model(inputs=input_data, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()
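The parameter counts in the summary can be checked by hand: a `Dense` layer has inputs × units weights plus units biases, so the encoder contributes 4 · 32 + 32 = 160 parameters and the decoder 32 · 4 + 4 = 132, giving 292 in total. A quick sketch to verify:
In [ ]:
# dense layer parameters = inputs * units (weights) + units (biases)
encoder_params = 4 * encoding_dim + encoding_dim  # 160
decoder_params = encoding_dim * 4 + 4             # 132
assert autoencoder.count_params() == encoder_params + decoder_params  # 292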
In [8]:
X = df
BATCH_SIZE = 1 # a larger batch size would likely need more epochs to reach the same loss
%time history = autoencoder.fit(X, X, epochs=10, batch_size=BATCH_SIZE, verbose=1)
In [9]:
import matplotlib.pyplot as plt
plt.yscale('log')
plt.plot(history.history['loss'])
Out[9]:
In [10]:
loss = autoencoder.evaluate(X, X, batch_size=BATCH_SIZE)
loss
Out[10]:
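The number `evaluate` reports is plain mean squared error between inputs and reconstructions. A minimal sketch to confirm it with numpy:
In [ ]:
import numpy as np
# MSE = mean over all samples and all 4 features of (x - x_hat)^2
reconstructions = autoencoder.predict(X)
manual_mse = np.mean(np.square(X.values - reconstructions))
manual_mse  # should roughly match the evaluate() result above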
In [11]:
samples = df.sample(10).reset_index(drop=True)
samples
Out[11]:
In [12]:
predictions = pd.DataFrame(autoencoder.predict(samples), columns=["speed", "age", "miles", "group"])
predictions
Out[12]:
In [13]:
samples.subtract(predictions)
Out[13]:
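The residuals above are per feature; collapsing them into one reconstruction error per row makes it easier to see which samples the autoencoder struggles with. A minimal sketch:
In [ ]:
# mean squared residual per row -> a single reconstruction error per sample
residuals = samples.subtract(predictions)
(residuals ** 2).mean(axis=1)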
In [14]:
from tensorflow.keras.initializers import glorot_normal
encoding_dim = 2
seed = 13 # make training results more deterministic
input_data = Input(shape=(4,))
# note the tanh activation in the bottleneck
encoded = Dense(units=encoding_dim, activation='tanh', name="encoder", kernel_initializer=glorot_normal(seed=seed))(input_data)
decoded = Dense(units=4, activation='linear', name="decoder", kernel_initializer=glorot_normal(seed=seed))(encoded)
autoencoder = Model(inputs=input_data, outputs=decoded)
# the loss does not get much better than ~210 (bad), so we might as well get there a bit faster (10 epochs at lr=0.01 instead of 50 with the default learning rate)
adam = keras.optimizers.Adam(lr=0.01)
# adam = keras.optimizers.Adam()
autoencoder.compile(optimizer=adam, loss='mse')
X = df
BATCH_SIZE = 1
%time history = autoencoder.fit(X, X, epochs=10, batch_size=BATCH_SIZE, verbose=1)
plt.yscale('log')
plt.plot(history.history['loss'])
Out[14]:
In [15]:
autoencoder.evaluate(X, X, batch_size=BATCH_SIZE)
Out[15]:
In [16]:
# the network has collapsed: it effectively just predicts the average over all samples :D
predictions = pd.DataFrame(autoencoder.predict(samples), columns=["speed", "age", "miles", "group"])
predictions
Out[16]:
In [17]:
samples.describe()
Out[17]:
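To see the collapse, compare the predictions with the column means of the full dataset: when the saturated tanh bottleneck carries almost no information, the best the decoder can do is a near-constant output close to those means. A sketch:
In [ ]:
# a bottleneck carrying no information leaves the mean as the best constant guess
pd.concat([df.mean().rename('data mean'),
           predictions.mean().rename('prediction mean')], axis=1)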
In [18]:
from tensorflow.keras.initializers import glorot_normal
encoding_dim = 2
seed = 13 # whether it trains at all still depends on the initialization
input_data = Input(shape=(4,))
# note the relu activation in the bottleneck this time
encoded = Dense(units=encoding_dim, activation='relu', name="encoder", kernel_initializer=glorot_normal(seed=seed))(input_data)
decoded = Dense(units=4, activation='linear', name="decoder", kernel_initializer=glorot_normal(seed=seed))(encoded)
autoencoder = Model(inputs=input_data, outputs=decoded)
# adam = keras.optimizers.Adam(lr=0.01)
adam = keras.optimizers.Adam()
autoencoder.compile(optimizer=adam, loss='mse')
X = df
BATCH_SIZE = 1
%time history = autoencoder.fit(X, X, epochs=10, batch_size=BATCH_SIZE, verbose=1)
plt.yscale('log')
plt.plot(history.history['loss'])
Out[18]:
In [19]:
samples
Out[19]:
In [20]:
predictions = pd.DataFrame(autoencoder.predict(samples), columns=["speed", "age", "miles", "group"])
predictions
Out[20]:
In [21]:
samples.subtract(predictions)
Out[21]:
In [28]:
df = pd.read_csv('./insurance-customers-1500.csv', sep=';')
normalized_df = (df - df.mean()) / df.std()
normalized_df.head()
Out[28]:
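This is a plain z-score normalization: subtract each column's mean and divide by its standard deviation. If scikit-learn is available (an assumption, it is not used elsewhere in this notebook), `StandardScaler` does the same thing, except that it divides by the population standard deviation (ddof=0) while pandas defaults to the sample one (ddof=1):
In [ ]:
# equivalent z-score normalization via scikit-learn (assumes sklearn is installed)
from sklearn.preprocessing import StandardScaler
scaled = StandardScaler().fit_transform(df)  # note: uses ddof=0 internally
pd.DataFrame(scaled, columns=df.columns).head()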
In [29]:
X = normalized_df
encoding_dim = 2
seed = 13 # make results deterministic
input_data = Input(shape=(4,))
encoded = Dense(units=encoding_dim, activation='relu', name="encoder", kernel_initializer=glorot_normal(seed=seed))(input_data)
decoded = Dense(units=4, activation='linear', name="decoder", kernel_initializer=glorot_normal(seed=seed))(encoded)
autoencoder = Model(inputs=input_data, outputs=decoded)
# adam = keras.optimizers.Adam(lr=0.01)
adam = keras.optimizers.Adam()
autoencoder.compile(optimizer=adam, loss='mse')
BATCH_SIZE = 1
%time history = autoencoder.fit(X, X, epochs=10, batch_size=BATCH_SIZE, verbose=1)
plt.yscale('log')
plt.plot(history.history['loss'])
Out[29]:
In [0]:
encoder = Model(inputs=input_data, outputs=encoded)
latent_representation = encoder.predict(X)
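The encoder reuses the trained layers by mapping the original input to the bottleneck activation. The decoder half can be split off the same way, wiring a fresh 2d input into the trained decoder layer; a minimal sketch:
In [ ]:
# standalone decoder: a new latent input fed through the trained decoder layer
latent_input = Input(shape=(encoding_dim,))
decoder = Model(inputs=latent_input, outputs=autoencoder.get_layer('decoder')(latent_input))
# round trip: encoding then decoding should approximate the normalized input
decoder.predict(encoder.predict(X[:5]))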
In [25]:
latent_representation.shape
Out[25]:
In [26]:
latent_x = latent_representation[:, 0]
latent_y = latent_representation[:, 1]
plt.scatter(latent_x, latent_y, alpha=0.5)
Out[26]:
In [27]:
from matplotlib.colors import ListedColormap
# * 0 - red: many accidents
# * 1 - green: few or no accidents
# * 2 - yellow: in the middle
colors = X['group']
color_map = ListedColormap(['#AA4444', '#006000', '#EEEE44'])
plt.scatter(latent_x, latent_y, alpha=0.5, s=100, marker='o', edgecolors='w', cmap=color_map, c=colors)
Out[27]:
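The scatter plot has no legend; one can be attached with proxy handles. A minimal matplotlib sketch:
In [ ]:
# proxy handles so the three risk groups show up in a legend
from matplotlib.lines import Line2D
labels = ['0 - many accidents', '1 - few or no accidents', '2 - in the middle']
handles = [Line2D([], [], marker='o', linestyle='None', color=c, label=l)
           for c, l in zip(['#AA4444', '#006000', '#EEEE44'], labels)]
plt.scatter(latent_x, latent_y, alpha=0.5, s=100, marker='o', edgecolors='w', cmap=color_map, c=colors)
plt.legend(handles=handles)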