In [ ]:
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
For best performance in Colab, once the notebook is launched, select Runtime -> Change Runtime Type from the dropdown menu, and choose GPU as the Hardware Accelerator.
The objective is to replace a traditional "inter-model" ensemble of high-complexity models with an "intra-model" ensemble of lower complexity, while retaining the performance benefits.
Question: Can one achieve the same performance with intra-model bagging vs. traditional inter-model ensemble?
Question: Can one achieve the same performance with intra-model stacking vs. traditional inter-model ensemble?
We will use the composable design pattern, and prebuilt units from the Google Cloud AI Developer Relations repo: Model Zoo
If you are not familiar with the Composable design pattern, we recommend you review the ResNet model in our zoo.
We recommend a constant set of hyperparameters, with a batch_size of 32 and an initial learning rate of 0.001 -- but you may use any hyperparameter values you prefer.
You can share your findings with us via the Twitter account @andrewferlitsch.
In this notebook, we use the CIFAR-10 dataset, which consists of 32x32x3 images across 10 classes -- but you may use any dataset you prefer.
Build and train a baseline (single instance) model for CIFAR-10.
Build and train two more baseline model instances (three in total), each with a different draw for weight initialization.
Construct an inter-model ensemble from the trained model instances and evaluate it.
Observe the weight variances between the trained model instances.
Evaluate an interchange of the top layer weights between the trained model instances and observe the performance.
Build and train an intra-model bagging model ensemble.
Build a wrapper model to weight parameterize the intra-model bagging ensemble (majority voting).
Evaluate the intra-model bagging wrapper model.
Build and train an intra-model stacking model ensemble.
Evaluate the intra-model stacking model ensemble.
In [ ]:
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, Flatten, Conv2DTranspose, ReLU, Add, Dense, Dropout, GaussianNoise
from tensorflow.keras.layers import BatchNormalization, GlobalAveragePooling2D, Activation, Concatenate
from tensorflow.keras.layers import ZeroPadding2D, MaxPooling2D  # needed by ResNetV2.stem()
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.datasets import cifar10
import numpy as np
In [ ]:
from tensorflow.keras.datasets import cifar10
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = (x_train / 255.0).astype(np.float32)
x_test = (x_test / 255.0).astype(np.float32)
print(x_train.shape)
In [ ]:
# from resnet/resnet_v2_c.py
class ResNetV2(object):
""" Construct a Residual Convolution Network Network V2 """
# Meta-parameter: list of groups: number of filters and number of blocks
groups = { 50 : [ { 'n_filters' : 64, 'n_blocks': 3 },
{ 'n_filters': 128, 'n_blocks': 4 },
{ 'n_filters': 256, 'n_blocks': 6 },
{ 'n_filters': 512, 'n_blocks': 3 } ], # ResNet50
101: [ { 'n_filters' : 64, 'n_blocks': 3 },
{ 'n_filters': 128, 'n_blocks': 4 },
{ 'n_filters': 256, 'n_blocks': 23 },
{ 'n_filters': 512, 'n_blocks': 3 } ], # ResNet101
152: [ { 'n_filters' : 64, 'n_blocks': 3 },
{ 'n_filters': 128, 'n_blocks': 8 },
{ 'n_filters': 256, 'n_blocks': 36 },
{ 'n_filters': 512, 'n_blocks': 3 } ] # ResNet152
}
init_weights = 'he_normal'
reg=l2(0.001)
_model = None
def __init__(self, n_layers, input_shape=(224, 224, 3), n_classes=1000):
""" Construct a Residual Convolutional Neural Network V2
n_layers : number of layers
input_shape: input shape
n_classes : number of output classes
"""
# predefined
if isinstance(n_layers, int):
if n_layers not in [50, 101, 152]:
raise Exception("ResNet: Invalid value for n_layers")
groups = self.groups[n_layers]
# user defined
else:
groups = n_layers
# The input tensor
inputs = Input(input_shape)
# The stem convolutional group
x = self.stem(inputs)
# The learner
x = self.learner(x, groups=groups)
# The classifier
outputs = self.classifier(x, n_classes)
# Instantiate the Model
self._model = Model(inputs, outputs)
@property
def model(self):
return self._model
@model.setter
def model(self, _model):
self._model = _model
def stem(self, inputs):
""" Construct the Stem Convolutional Group
inputs : the input vector
"""
# The 224x224 images are zero padded (black - no signal) to be 230x230 images prior to the first convolution
x = ZeroPadding2D(padding=(3, 3))(inputs)
# First Convolutional layer uses large (coarse) filter
x = Conv2D(64, (7, 7), strides=(2, 2), padding='valid', use_bias=False,
kernel_initializer=self.init_weights, kernel_regularizer=self.reg)(x)
x = BatchNormalization()(x)
x = ReLU()(x)
# Pooled feature maps will be reduced by 75%
x = ZeroPadding2D(padding=(1, 1))(x)
x = MaxPooling2D((3, 3), strides=(2, 2))(x)
return x
def learner(self, x, **metaparameters):
""" Construct the Learner
x : input to the learner
groups: list of groups: number of filters and blocks
"""
groups = metaparameters['groups']
# First Residual Block Group (not strided)
x = ResNetV2.group(x, strides=(1, 1), **groups.pop(0))
# Remaining Residual Block Groups (strided)
for group in groups:
x = ResNetV2.group(x, **group)
return x
@staticmethod
def group(x, strides=(2, 2), init_weights=None, **metaparameters):
""" Construct a Residual Group
x : input into the group
strides : whether the projection block is a strided convolution
n_filters : number of filters for the group
n_blocks : number of residual blocks with identity link
"""
n_blocks = metaparameters['n_blocks']
# Double the size of filters to fit the first Residual Group
x = ResNetV2.projection_block(x, strides=strides, init_weights=init_weights, **metaparameters)
# Identity residual blocks
for _ in range(n_blocks):
x = ResNetV2.identity_block(x, init_weights=init_weights, **metaparameters)
return x
@staticmethod
def identity_block(x, init_weights=None, **metaparameters):
""" Construct a Bottleneck Residual Block with Identity Link
x : input into the block
n_filters: number of filters
reg : kernel regularizer
"""
n_filters = metaparameters['n_filters']
if 'reg' in metaparameters:
reg = metaparameters['reg']
else:
reg = ResNetV2.reg
if init_weights is None:
init_weights = ResNetV2.init_weights
# Save input vector (feature maps) for the identity link
shortcut = x
## Construct the 1x1, 3x3, 1x1 convolution block
# Dimensionality reduction
x = BatchNormalization()(x)
x = ReLU()(x)
x = Conv2D(n_filters, (1, 1), strides=(1, 1), use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(x)
# Bottleneck layer
x = BatchNormalization()(x)
x = ReLU()(x)
x = Conv2D(n_filters, (3, 3), strides=(1, 1), padding="same", use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(x)
# Dimensionality restoration - increase the number of output filters by 4X
x = BatchNormalization()(x)
x = ReLU()(x)
x = Conv2D(n_filters * 4, (1, 1), strides=(1, 1), use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(x)
# Add the identity link (input) to the output of the residual block
x = Add()([shortcut, x])
return x
@staticmethod
def projection_block(x, strides=(2,2), init_weights=None, **metaparameters):
""" Construct a Bottleneck Residual Block of Convolutions with Projection Shortcut
Increase the number of filters by 4X
x : input into the block
strides : whether the first convolution is strided
n_filters: number of filters
reg : kernel regularizer
"""
n_filters = metaparameters['n_filters']
if 'reg' in metaparameters:
reg = metaparameters['reg']
else:
reg = ResNetV2.reg
if init_weights is None:
init_weights = ResNetV2.init_weights
# Construct the projection shortcut
# Increase filters by 4X to match shape when added to output of block
shortcut = BatchNormalization()(x)
shortcut = Conv2D(4 * n_filters, (1, 1), strides=strides, use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(shortcut)
## Construct the 1x1, 3x3, 1x1 convolution block
# Dimensionality reduction
x = BatchNormalization()(x)
x = ReLU()(x)
x = Conv2D(n_filters, (1, 1), strides=(1,1), use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(x)
# Bottleneck layer
# Feature pooling when strides=(2, 2)
x = BatchNormalization()(x)
x = ReLU()(x)
x = Conv2D(n_filters, (3, 3), strides=strides, padding='same', use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(x)
# Dimensionality restoration - increase the number of filters by 4X
x = BatchNormalization()(x)
x = ReLU()(x)
x = Conv2D(4 * n_filters, (1, 1), strides=(1, 1), use_bias=False,
kernel_initializer=init_weights, kernel_regularizer=reg)(x)
# Add the projection shortcut to the output of the residual block
x = Add()([x, shortcut])
return x
def classifier(self, x, n_classes):
""" Construct the Classifier Group
x : input to the classifier
n_classes : number of output classes
"""
# Pool at the end of all the convolutional residual blocks
x = GlobalAveragePooling2D()(x)
# Final Dense Outputting Layer for the outputs
outputs = Dense(n_classes, activation='softmax',
kernel_initializer=self.init_weights, kernel_regularizer=self.reg)(x)
return outputs
In [ ]:
def makeBaseModel(reg=None, n_blocks=4, lr=0.001, noise=None):
ResNetV2.reg = reg
# Stem
inputs = Input((32, 32, 3))
x = Conv2D(32, (3, 3), strides=(1, 1), padding='same',
kernel_initializer='he_normal', kernel_regularizer=reg)(inputs)
x = BatchNormalization()(x)
x = ReLU()(x)
# Learner
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=16)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=64)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=128)
# Classifier
x = GlobalAveragePooling2D()(x)
if noise:
x = GaussianNoise(noise)(x)
x = ReLU()(x)
outputs = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
resnet = Model(inputs, outputs)
resnet.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=lr), metrics=['acc'])
return resnet
Let's first train a single (non-ensemble) model as our baseline reference. We will use a learning rate scheduler to train the model at a learning rate of 0.001 for 20 epochs, and then drop the learning rate by an order of magnitude to 0.0001 for the remaining 10 epochs. After 30 epochs, the validation/test accuracy should be about 84%.
Note that the size of the model is just over 1.8 million parameters.
In [ ]:
resnet1 = makeBaseModel(reg=l2(0.001), noise=0.1)
resnet1.summary()
def lr_schedule(epoch, lr):
if epoch < 20:
return 0.001
else:
return 0.0001
resnet1.fit(x_train, y_train, epochs=30, batch_size=32, validation_split=0.1, verbose=1,
callbacks=[LearningRateScheduler(lr_schedule)])
resnet1.evaluate(x_test, y_test)
Next, we will train two additional instances of the same model (three altogether), where each model has a different draw from the random distribution for weight initialization.
When we look at the evaluation results for all three models, they will most often be very close to each other. Across runs, the spread may be as small as 0.5% or as large as 2%. For example, you might see something like [84%, 82.5%, 83%].
In [ ]:
resnet2 = makeBaseModel()
resnet2.fit(x_train, y_train, epochs=30, batch_size=32, validation_split=0.1, verbose=1,
callbacks=[LearningRateScheduler(lr_schedule)])
resnet2.evaluate(x_test, y_test)
resnet3 = makeBaseModel()
resnet3.fit(x_train, y_train, epochs=30, batch_size=32, validation_split=0.1, verbose=1,
callbacks=[LearningRateScheduler(lr_schedule)])
resnet3.evaluate(x_test, y_test)
Let's make a traditional inter-model ensemble. In this case, we will create a new wrapper model ('ensemble'), and include each of the model instances as a branch from the input. Finally, we add the outputs from each model together and do a softmax (effectively an argmax) for our majority vote of the models.
Let's compare the results of the inter-model ensemble to the individual model results. One should see a modest boost of ~2% above the best performance of the individual models. For example, if the best performing individual model was 83%, then the inter-model ensemble would be ~85%.
Note that the size of this inter-model ensemble is just over 5.6 million parameters (3X the size of a single model instance).
In [ ]:
# Input to the Ensemble
inputs = Input((32, 32, 3))
# Each model will be a branch in the ensemble
o1 = resnet1(inputs)
o2 = resnet2(inputs)
o3 = resnet3(inputs)
# Implement majority voting by adding their softmax predictions
outputs = Add()([o1, o2, o3])
outputs = Activation('softmax')(outputs)
ensemble = Model(inputs, outputs)
ensemble.summary()
ensemble.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['acc'])
ensemble.evaluate(x_test, y_test)
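As a convenience, we can print a quick side-by-side comparison of the three individual models and the ensemble in a single cell. This is just a sketch; it assumes resnet1, resnet2, resnet3, and ensemble from the cells above are still in memory.
In [ ]:
# Side-by-side test accuracy of each individual model and the ensemble
for name, m in [('resnet1', resnet1), ('resnet2', resnet2),
                ('resnet3', resnet3), ('ensemble', ensemble)]:
    loss, acc = m.evaluate(x_test, y_test, verbose=0)
    print(f"{name:10s} test accuracy: {acc:.4f}")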
Let's now look at the variance of weights at the same layer across the model instances (each trained from a different weight initialization draw). One will observe that at the top layer, just before the softmax activation (referred to as the 'feature vector' or 'embedding'), there is very little variance -- so little that, as the next section shows, these weights can be interchanged between the models with little to no effect on performance.
On the other hand, any layer past the bottleneck layer (not demonstrated here) will show a rapid degradation in performance when interchanged between the models.
In [ ]:
weights1 = resnet1.get_weights()
weights2 = resnet2.get_weights()
weights3 = resnet3.get_weights()
print("Number of Weight Matrices", len(weights1))
Let's now interchange just the top layer between the three trained model instances. Notice that there is essentially no change in performance! While the feature vectors (embeddings) are not identical, they are interchangeable.
In [ ]:
print("models 1, 2")
resnetx12 = makeBaseModel()
resnetx12.set_weights( weights1[0:247] + weights2[247:])
resnetx12.evaluate(x_test, y_test)
resnetx21 = makeBaseModel()
resnetx21.set_weights( weights2[0:247] + weights1[247:])
resnetx21.evaluate(x_test, y_test)
print("models 1, 3")
resnetx13 = makeBaseModel()
resnetx13.set_weights( weights1[0:247] + weights3[247:])
resnetx13.evaluate(x_test, y_test)
resnetx31 = makeBaseModel()
resnetx31.set_weights( weights3[0:247] + weights1[247:])
resnetx31.evaluate(x_test, y_test)
print("models 2, 3")
resnetx23 = makeBaseModel()
resnetx23.set_weights( weights2[0:247] + weights3[247:])
resnetx23.evaluate(x_test, y_test)
resnetx32 = makeBaseModel()
resnetx32.set_weights( weights3[0:247] + weights2[247:])
resnetx32.evaluate(x_test, y_test)
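The six swaps above repeat the same pattern. If you want to experiment with other split points, the cell can be condensed into a small helper; this is a sketch (swap_and_eval is a hypothetical name, and the split index 247 is taken from the cells above).
In [ ]:
def swap_and_eval(lower_weights, upper_weights, split=247):
    # Build a fresh model whose lower layers come from one trained instance
    # and whose top layers come from another, then evaluate it.
    model = makeBaseModel()
    model.set_weights(lower_weights[:split] + upper_weights[split:])
    return model.evaluate(x_test, y_test, verbose=0)

print("1 lower / 2 upper:", swap_and_eval(weights1, weights2))
print("2 lower / 1 upper:", swap_and_eval(weights2, weights1))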
What's the difference between inter-model and intra-model ensemble techniques? Inter-model means that each model instance is an independent model, with no shared layers (weights) and no shared training. This method is of higher computational complexity and meets the traditional definition of an ensemble (majority vote from a collection of independently trained weak learners).
Intra-model ensemble methods depart from the traditional approach and rely on the concept of the lottery ticket hypothesis: the model instances share layers and are trained together, each with its own classifier. The assumption is that because each classifier gets an independent draw from the random distribution for weight initialization, the result will be equivalent to a traditional ensemble, but with substantially less complexity.
We should observe that the three classifiers are very close to each other in performance, typically within 0.25% (1/4 of one percent).
In [ ]:
def makeBagging(reg=None, n_blocks=4, lr=0.001, noise=None):
ResNetV2.reg = reg
# Stem
inputs = Input((32, 32, 3))
x = Conv2D(32, (3, 3), strides=(1, 1), padding='same',
kernel_initializer='he_normal', kernel_regularizer=reg)(inputs)
x = BatchNormalization()(x)
x = ReLU()(x)
# Learner
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=16)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=64)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=128)
# Classifier
x = GlobalAveragePooling2D()(x)
if noise:
x = GaussianNoise(noise)(x)
x = ReLU()(x)
# Multiple Instances of Classifier (Bagging)
outputs1 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
outputs2 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
outputs3 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
resnet = Model(inputs, [outputs1, outputs2, outputs3])
resnet.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
return resnet
In [ ]:
resnet_b = makeBagging(reg=l2(0.001), noise=0.1)
resnet_b.summary()
resnet_b.fit(x_train, [y_train, y_train, y_train], epochs=30, batch_size=32, verbose=1, validation_split=0.1,
callbacks=[LearningRateScheduler(lr_schedule)])
resnet_b.evaluate(x_test, [y_test, y_test, y_test])
Let's now save the weights from the trained intra-model bagging model.
In [ ]:
weights_b = resnet_b.get_weights()
In the model above, the three votes are reported separately. We now construct the same model with the majority-voting step added. To do this, we add two new layers to the top of the model. The first sums the outputs of the three classifier layers into a single vector; that is, the three predictions are added together per class -- a form of weight parameterization. We then pass that vector through a softmax activation (essentially an argmax in this case) for the final prediction. This adds no new parameters and simply implements majority voting.
Let's compare the results of the individual classifiers within the model to the bagged classifier. There will likely be very little difference, generally an increase in the range of 1/10 to 1/4 of one percent.
Note that the size of this intra-model bagging ensemble, including the majority-voting wrapper, is essentially the same as a single model instance (about 1.8 million parameters), since only the small final Dense classifiers are replicated -- versus 5.6 million parameters for the inter-model ensemble.
In [ ]:
def makeBaggingEx(reg=None, n_blocks=4, lr=0.001, noise=None):
ResNetV2.reg = reg
# Stem
inputs = Input((32, 32, 3))
x = Conv2D(32, (3, 3), strides=(1, 1), padding='same',
kernel_initializer='he_normal', kernel_regularizer=reg)(inputs)
x = BatchNormalization()(x)
x = ReLU()(x)
# Learner
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=16)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=64)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=128)
# Classifier
x = GlobalAveragePooling2D()(x)
if noise:
x = GaussianNoise(noise)(x)
x = ReLU()(x)
outputs1 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
outputs2 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
outputs3 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
# Parameterize the weights from all three classifiers back into one classifier
outputs = Add()([outputs1, outputs2, outputs3])
outputs = Activation('softmax')(outputs)
resnet = Model(inputs, outputs)
resnet.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
return resnet
In [ ]:
resnet_bx = makeBaggingEx(reg=l2(0.001), noise=0.1)
resnet_bx.summary()
resnet_bx.set_weights(weights_b)
resnet_bx.evaluate(x_test, y_test)
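To make the comparison explicit, the per-head accuracies and the majority-vote accuracy can be printed together. This is a sketch; it assumes resnet_b and resnet_bx from the cells above are still in memory.
In [ ]:
# Per-head accuracies of the three-output bagging model, alongside the
# accuracy of the majority-voting wrapper.
results = resnet_b.evaluate(x_test, [y_test, y_test, y_test], verbose=0)
for name, value in zip(resnet_b.metrics_names, results):
    if 'acc' in name:
        print(f"{name:20s} {value:.4f}")
_, wrapper_acc = resnet_bx.evaluate(x_test, y_test, verbose=0)
print(f"{'majority vote':20s} {wrapper_acc:.4f}")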
Another intra-model ensemble method is stacking. Stacking is similar to bagging, except that instead of bagging the results (adding the individual classifier outputs together and doing majority voting), we pass the outputs of the pretrained classifiers to a second-level classifier ("the stack"), which learns to correct the misclassifications of the preceding classifiers.
We build the model by starting from the prior intra-model bagging model and replacing the majority-voting step with a new classifier. To do this, we concatenate (rather than add) the output vectors of the three classifiers and pass the concatenated vector to a new classifier layer. We also apply Gaussian noise (regularization) to the shared embedding to help address overfitting in the second-level classifier.
Note that the total number of parameters has gone up only slightly from our intra-model bagging model: 1.9 million (vs. 1.8 million).
In [ ]:
def makeStacking(reg=None, n_blocks=4, lr=0.001, noise=None):
ResNetV2.reg = reg
# Stem
inputs = Input((32, 32, 3))
x = Conv2D(32, (3, 3), strides=(1, 1), padding='same',
kernel_initializer='he_normal', kernel_regularizer=reg)(inputs)
x = BatchNormalization()(x)
x = ReLU()(x)
# Learner
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=16)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=64)
x = ResNetV2.group(x, n_blocks=n_blocks, n_filters=128)
# Classifier
x = GlobalAveragePooling2D()(x)
if noise:
x = GaussianNoise(noise)(x)
x = ReLU()(x)
outputs1 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
outputs2 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
outputs3 = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(x)
# Stacking
outputs = Concatenate()([outputs1, outputs2, outputs3])
outputs = Dense(10, activation='softmax',
kernel_initializer='he_normal', kernel_regularizer=reg)(outputs)
resnet = Model(inputs, outputs)
resnet.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
return resnet
In [ ]:
resnet_s = makeStacking(reg=l2(0.001), noise=0.1)
resnet_s.summary()
Next, we copy over the pretrained first-level weights (weights_b[:247]). We then set all of the first-level layers to non-trainable; i.e., we will train only the second-level classifier.
In [ ]:
weights_s = resnet_s.get_weights()
resnet_s.set_weights(weights_b[:247] + weights_s[247:])
for _ in range(len(resnet_s.layers)-1):
resnet_s.layers[_].trainable = False
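As a quick sanity check (a sketch), we can confirm that only the second-level classifier's parameters remain trainable after the loop above; alternatively, resnet_s.summary() reports the trainable and non-trainable counts directly.
In [ ]:
# Count trainable vs. non-trainable parameters after freezing; only the
# stacking (second-level) classifier should remain trainable.
trainable = sum(int(tf.size(w)) for w in resnet_s.trainable_weights)
frozen = sum(int(tf.size(w)) for w in resnet_s.non_trainable_weights)
print("Trainable parameters:    ", trainable)
print("Non-trainable parameters:", frozen)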
We will train the second-level classifier with a low learning rate (0.0001). Observe that after a few epochs, the validation accuracy plateaus at about the same level as the intra-model bagging version -- i.e., in this scenario it does not appear that we are learning to correct the mistakes of the first-level classifiers; we are still overfitting.
In [ ]:
resnet_s.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.0001), metrics=['acc'])
resnet_s.fit(x_train, y_train, epochs=10, batch_size=32, verbose=1, validation_split=0.1)
resnet_s.evaluate(x_test, y_test)