In [ ]:
#@title Agreement

# Copyright (c) 2021 Kevin P. Murphy (murphyk@gmail.com) and Mahmoud Soliman (mjs@aucegypt.edu)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

In [ ]:
#@title Attribution
# This notebook is based on the following:
# https://www.tensorflow.org/tutorials/keras/overfit_and_underfit
# https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

In [ ]:
#@title Imports
# Note: the %tensorflow_version magic must run before TensorFlow is imported,
# so it comes first (device_lib also pulls in TensorFlow).
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow.keras import layers
from tensorflow.keras import regularizers

from psutil import virtual_memory
import cv2
from google.colab.patches import cv2_imshow

from IPython import display
from matplotlib import pyplot as plt

import numpy as np

import os
import pathlib
import shutil
import tempfile

from tqdm import tqdm

In [ ]:
#@title Hardware check

def find_accelerator():
  mem = virtual_memory()
  devices = device_lib.list_local_devices()
  ram = "Physical RAM: {:.2f} GB".format(mem.total / (1024 ** 3))
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    device = ["TPU at " + str(tpu.cluster_spec().as_dict()['worker'])]
  except ValueError:
    device = [d.physical_device_desc for d in devices if d.device_type == "GPU"]
  if not device:
    return None, ram
  return device, ram

accelerator, ram = find_accelerator()
print("Please make sure that the statement below reports an accelerator")
print("Accelerator found:", accelerator, ram)

In [ ]:
#@title Install the extra required packages, if any
!pip install git+https://github.com/tensorflow/docs -q
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [ ]:
#@title Clone PyProbML repo and set environment variables
!git clone https://github.com/probml/pyprobml/ -q
os.environ["PYPROBML"]='/content/pyprobml/'

In [ ]:
#@title Setup tensorboard
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)

In [ ]:
#@title Download the HIGGS dataset
gz = tf.keras.utils.get_file('HIGGS.csv.gz', 'http://mlphysics.ics.uci.edu/data/higgs/HIGGS.csv.gz')

In [ ]:
# Set the number of features and the sample size we will draw from the dataset.
# We use only 10K samples to keep the runtime short;
# the full dataset has 11 million samples.
num_features = 28
sample_size = 10000

In [ ]:
# Use the tf.data CsvDataset API to read samples on the fly, with no separate decompression step
ds = tf.data.experimental.CsvDataset(gz, [float(), ] * (num_features + 1), compression_type="GZIP")
# Convert a sample from a list of scalars to a (features, label) pair
def pack_row(*row):
  label = row[0]
  features = tf.stack(row[1:],1)
  return features, label
# Apply pack_row to batches of samples, then unbatch back to individual examples
packed_ds = ds.batch(sample_size).map(pack_row).unbatch()
# Plot a histogram of the first 1K samples' features. Note that they are roughly
# normalized to the range [-2, 2], though not perfectly; that is fine for our purposes.
for features,label in packed_ds.batch(1000).take(1):
  print(features[0])
  plt.hist(features.numpy().flatten(), bins = 101)
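
As a quick check (a minimal sketch), we can inspect the shapes of one small packed batch to confirm that pack_row produced (features, label) pairs:

In [ ]:
# Sanity check: one batch of the packed dataset should yield
# features of shape (batch, num_features) and labels of shape (batch,)
for features, label in packed_ds.batch(5).take(1):
  print('features:', features.shape, 'labels:', label.shape)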

In [ ]:
# We use 10K samples, split into 8K for training and 2K for validation
def ds_split_size(size, split=(0.2, 0.8)):
  return int(size * split[0]), int(size * split[1])

N_VALIDATION, N_TRAIN = ds_split_size(sample_size)
BUFFER_SIZE = sample_size
BATCH_SIZE = 500
STEPS_PER_EPOCH = N_TRAIN // BATCH_SIZE

In [ ]:
validate_ds = packed_ds.take(N_VALIDATION).cache()
train_ds = packed_ds.skip(N_VALIDATION).take(N_TRAIN).cache()

In [ ]:
validate_ds = validate_ds.batch(BATCH_SIZE)
train_ds = train_ds.shuffle(BUFFER_SIZE).repeat().batch(BATCH_SIZE)

In [ ]:
# A schedules.InverseTimeDecay instance is used to hyperbolically decrease the learning rate 
# to 1/2 of the base rate at 1000 epochs, 1/3 at 2000 epochs and so on.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
  0.001,
  decay_steps=STEPS_PER_EPOCH*1000,
  decay_rate=1,
  staircase=False)
step = np.linspace(0,100000)
lr = lr_schedule(step)
plt.figure(figsize = (8,6))
plt.plot(step/STEPS_PER_EPOCH, lr)
plt.ylim([0,max(plt.ylim())])
plt.xlabel('Epoch')
_ = plt.ylabel('Learning Rate')
# We use the Adam optimizer; it converges quickly, which helps us demonstrate overfitting
def get_optimizer():
  return tf.keras.optimizers.Adam(lr_schedule)
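
As a sanity check (a small sketch using the schedule defined above), we can evaluate lr_schedule at a few step counts and confirm the hyperbolic decay described in the comment:

In [ ]:
# With decay_rate=1 and decay_steps=STEPS_PER_EPOCH*1000, the learning rate
# should be 1/2 of the base rate after 1000 epochs and 1/3 after 2000 epochs.
for epochs in [0, 1000, 2000]:
  print("epoch {:5d}: lr = {:.6f}".format(
      epochs, float(lr_schedule(epochs * STEPS_PER_EPOCH))))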

In [ ]:
# We use callbacks here to hook in an early-stopping condition (more on that later)
# and to log the metrics to TensorBoard to help with the visualizations
def get_callbacks(name):
  return [
    tfdocs.modeling.EpochDots(),
    tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=200),
    tf.keras.callbacks.TensorBoard(logdir/name),
  ]
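
Note that EarlyStopping, as configured, stops training after 200 epochs without improvement in val_binary_crossentropy but keeps the final weights rather than the best ones. A minimal variant (restore_best_weights is a standard Keras flag, not used in the original):

In [ ]:
# Variant sketch: also roll the model back to the best weights seen during training.
tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy',
                                 patience=200,
                                 restore_best_weights=True)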

In [ ]:
# Set up training for at most 10K epochs, with validation BCE as the monitored metric
def compile_and_fit(model, name, data=None, optimizer=None, max_epochs=10000):
  if optimizer is None:
    optimizer = get_optimizer()
  model.compile(optimizer=optimizer,
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=[
                  tf.keras.losses.BinaryCrossentropy(
                      from_logits=True, name='binary_crossentropy'),
                  'accuracy'])
  print("Starting training model size {}".format(name))
  if data:
    history=model.fit(x=data[0],y=data[1],
                      validation_data=(data[2],data[3]),
                      steps_per_epoch = STEPS_PER_EPOCH,
                      epochs=max_epochs,
                      callbacks=get_callbacks(name),
                      verbose=0)
  else:
    history = model.fit(
      train_ds,
      steps_per_epoch = STEPS_PER_EPOCH,
      epochs=max_epochs,
      validation_data=validate_ds,
      callbacks=get_callbacks(name),
      verbose=0)
  return history

In [ ]:
size_histories = {}
# Four model sizes to try, in the format (number_of_layers, layer_width)
model_size = {"tiny": (1, 16), "small": (2, 16), "medium": (4, 64), "large": (4, 512)}

# Create a fully connected (dense) model for each size
def get_models():
  models = {}
  for name, (depth, width) in model_size.items():
    models[name] = tf.keras.Sequential(name=name)
    models[name].add(layers.Dense(width, activation='elu', input_shape=(num_features,)))
    for _ in range(1, depth):
      models[name].add(layers.Dense(width, activation='elu'))
    models[name].add(layers.Dense(1))
  return models

# Now we have 4 models in a dictionary
models = get_models()

In [ ]:
# We can see here the number of parameters per model.
# Note that we have 8000 x 28 feature values for training (224K).
# Intuitively, the large model, with roughly 800K parameters, is overkill.
for name, model in models.items():
  model.summary()
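
To make the comparison concrete (a small sketch; the "overkill" judgment is intuition, not a formal rule), we can print each model's parameter count next to the number of training feature values:

In [ ]:
# Compare parameters to the 8000 x 28 = 224K training feature values.
n_values = N_TRAIN * num_features
for name, model in models.items():
  print("{:>6s}: {:>9,d} params vs {:,d} training values".format(
      name, model.count_params(), n_values))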

In [ ]:
# Training time!
for name,model in models.items():
  size_histories[name] = compile_and_fit(model, 'sizes/'+name)

In [ ]:
#Plot the BCE
plotter = tfdocs.plots.HistoryPlotter(metric = 'binary_crossentropy', smoothing_std=10)
plotter.plot(size_histories)
plt.ylim([0.5, 0.7])

In [ ]:
#Load and run tensorboard
%load_ext tensorboard
%tensorboard --logdir {logdir}/sizes

As you can see above, both the medium and large models overfit.

Their validation accuracy and BCE are significantly worse than their training scores: the nearly perfect training scores mean the models "memorized" the training data and failed to generalize to the validation data.

We will experiment with regularization techniques to try to prevent overfitting. Another interesting change is to use SGD as the optimizer.

Note that SGD is much slower than Adam in terms of runtime (but, as the graphs show, more stable). The screenshots below come from the same code with only the optimizer changed from Adam to SGD; a minimal sketch of that swap is shown next.
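
The sketch below reuses lr_schedule; the momentum value is an illustrative choice, not taken from the original run.

In [ ]:
# Illustrative sketch: rerun a model with SGD instead of Adam, reusing lr_schedule.
def get_sgd_optimizer():
  return tf.keras.optimizers.SGD(lr_schedule, momentum=0.9)

# For example:
# compile_and_fit(get_models()['medium'], 'sizes/medium_sgd',
#                 optimizer=get_sgd_optimizer())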

Now that we have looked at optimizers and overfitting, let's look at a possible remedy, especially for small datasets (although ours is only artificially small).


In [ ]:
# The TensorFlow Dataset API has some support for cross-validation, but it is
# less versatile than scikit-learn's, so we opt for sklearn to showcase the
# different types of cross-validation.
# We load our 10K samples into NumPy arrays to work with sklearn.
X_train_numpy = np.empty(shape=[N_TRAIN, num_features])
y_train_numpy = np.empty(shape=[N_TRAIN])
X_test_numpy = np.empty(shape=[N_VALIDATION, num_features])
y_test_numpy = np.empty(shape=[N_VALIDATION])
ci = 0
for feature, label in packed_ds.take(N_TRAIN):
  X_train_numpy[ci] = feature.numpy()
  y_train_numpy[ci] = label.numpy()
  ci += 1
ci = 0
for feature, label in packed_ds.skip(N_TRAIN).take(N_VALIDATION):
  X_test_numpy[ci] = feature.numpy()
  y_test_numpy[ci] = label.numpy()
  ci += 1
print(len(X_train_numpy))
print(len(y_train_numpy))
print(len(X_test_numpy))
print(len(y_test_numpy))
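
As an aside, a more concise way to materialize the same splits as NumPy arrays (a sketch, equivalent to the loops above given the same N_TRAIN/N_VALIDATION split):

In [ ]:
# Alternative: pull each split out as a single batch instead of looping.
for f, l in packed_ds.take(N_TRAIN).batch(N_TRAIN):
  X_train_alt, y_train_alt = f.numpy(), l.numpy()
print(X_train_alt.shape, y_train_alt.shape)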

In [ ]:
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
cv_histories = {}
splits = 4
cvr = 0
kfold = StratifiedKFold(splits)
models_cv = get_models()
for train_index, val_index in kfold.split(X_train_numpy, y_train_numpy):
  x_train, x_val = X_train_numpy[train_index], X_train_numpy[val_index]
  y_train, y_val = y_train_numpy[train_index], y_train_numpy[val_index]
  cv_histories['medium_cv_' + str(cvr)] = compile_and_fit(
      models_cv['medium'],
      name='sizes/medium_cv_' + str(cvr),
      data=[x_train, y_train, x_val, y_val],
      # Our medium model trained for 256 epochs, so for an apples-to-apples
      # comparison we use 256 // splits = 64 epochs per fold.
      # You can change this depending on your experiments.
      max_epochs=64)
  cvr += 1
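
The LeaveOneOut import above is another cross-validation variant scikit-learn offers; with 8K training samples it would require 8K separate fits, which is impractical here, so we only sketch the split API on a tiny slice:

In [ ]:
# Sketch only: leave-one-out CV on 5 samples, just to show the API.
loo = LeaveOneOut()
print('number of splits:', loo.get_n_splits(X_train_numpy[:5]))
for train_index, val_index in loo.split(X_train_numpy[:5]):
  print('train:', train_index, 'val:', val_index)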

In [ ]:
print('Test score:',models_cv['medium'].evaluate(X_test_numpy,y_test_numpy))
# Note that the medium model's BCE on the test set is larger without cross-validation:
# in our run it was 1.206 without CV and 1.152 with CV, after the same number of epochs.