by Alejandro Correa Bahnsen and Jesus Solano
version 1.4, May 2019
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks go to Valerio Maggio, Fondazione Bruno Kessler.
Keras is a minimalist, highly modular neural networks library, written in Python and capable of running on top of either TensorFlow or Theano.
It was developed with a focus on enabling fast experimentation. Being able to go from idea to result with the least possible delay is key to doing good research. ref: https://keras.io/
In [1]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()
print(boston_dataset.DESCR)
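Besides the description, the Bunch object returned by load_boston exposes the raw arrays directly; a quick look (an illustrative snippet, not one of the original cells):
print(boston_dataset.data.shape)      # (506, 13): 506 houses, 13 features
print(boston_dataset.target.shape)    # (506,): median house value for each house
print(boston_dataset.feature_names)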
In [2]:
import pandas as pd
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
In [3]:
boston_dataset = load_boston()
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
X = boston.copy()  # use all 13 feature columns as predictors; the target is kept separately below
Y = pd.DataFrame(np.array(boston_dataset.target), columns=['labels'])
boston.head()
Out[3]:
In [4]:
# Split datasets.
X_train, X_test , Y_train, Y_test = train_test_split(X,Y, test_size=0.3 ,random_state=22)
# Normalize Data
from sklearn.preprocessing import StandardScaler
# Define the preprocessing method and fit it on the training data only,
# so that no information from the test set leaks into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
# Replace X_train and X_test with their scaled versions
# This rescales every feature column to zero mean and unit variance
X_train = pd.DataFrame(data=scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)
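As a quick sanity check (an added sketch, not one of the original cells), the standardized training features should now have roughly zero mean and unit variance column-wise:
# Each standardized training feature should have mean ~0 and std ~1.
print(np.round(X_train.mean(axis=0), 2))
print(np.round(X_train.std(axis=0), 2))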
In [5]:
# As this is a regression problem, the output is a single neuron.
output_var = Y_train.shape[1]
print(output_var, ' output variables')
dims = X_train.shape[1]
print(dims, 'input variables')
In [6]:
Y_train.shape
Out[6]:
In [7]:
import tensorflow as tf
In [8]:
# Parameters
learning_rate = 0.01
training_epochs = 150
display_step = 1
In [9]:
# tf Graph Input
x = tf.placeholder(tf.float32, [None, dims])
y = tf.placeholder(tf.float32, [None,1])
In [10]:
# Try to print a placeholder.
x
Out[10]:
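Printing a placeholder only shows a symbolic tensor; it takes concrete values when it is fed through feed_dict inside a session. A tiny illustrative sketch (assuming TensorFlow 1.x, as used in this notebook):
with tf.Session() as s:
    # The placeholder receives actual values only at run time, via feed_dict.
    print(s.run(x, feed_dict={x: np.zeros((2, dims))}))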
In [11]:
# Construct (linear) model
with tf.name_scope("model") as scope:
# Set model weights
W = tf.Variable(tf.zeros([dims, output_var]))
b = tf.Variable(tf.zeros([output_var]))
activation = tf.add(tf.matmul(x, W), b) # Softmax
# Add summary ops to collect data
w_h = tf.summary.histogram("weights_histogram", W)
b_h = tf.summary.histogram("biases_histograms", b)
tf.summary.scalar('mean_weights', tf.reduce_mean(W))
tf.summary.scalar('mean_bias', tf.reduce_mean(b))
# Minimize error using cross entropy
# Note: More name scopes will clean up graph representation
with tf.name_scope("cost_function") as scope:
cost = tf.reduce_mean(tf.square(activation-y))
# Create a summary to monitor the cost function
tf.summary.scalar("cost_function", cost)
tf.summary.histogram("cost_histogram", cost)
with tf.name_scope("train") as scope:
# Set the Optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
In [12]:
# Launch the graph
with tf.Session() as session:
    # Initializing the variables
    session.run(tf.global_variables_initializer())

    cost_epochs = []
    # Training cycle
    for epoch in range(training_epochs):
        _, c = session.run(fetches=[optimizer, cost], feed_dict={x: X_train, y: Y_train})
        cost_epochs.append(c)
        #writer.add_summary(summary=summary, global_step=epoch)
        #print("accuracy epoch {}:{}".format(epoch, accuracy.eval({x: X_train, y: Y_train})))
        # Print the loss/error every 10 epochs
        if epoch % 10 == 0:
            print('Epoch: {0}, Error: {1}'.format(epoch, c))

    print("Training phase finished")

    # Plotting
    plt.figure(figsize=(12, 8))
    plt.plot(range(len(cost_epochs)), cost_epochs, 'o', label='Linear regression training phase')
    plt.ylabel('cost')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()

    #prediction = tf.argmax(activation, 1)
    #print(prediction.eval({x: X_test}))
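The commented-out writer.add_summary line hints at TensorBoard logging. One way to wire it up (a sketch assuming TensorFlow 1.x; the log directory name is an arbitrary choice) would be:
merged_summaries = tf.summary.merge_all()
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    # Write the graph and the summaries defined above to an (arbitrary) log directory.
    writer = tf.summary.FileWriter('./logs/boston_linear', session.graph)
    for epoch in range(training_epochs):
        summary, _, c = session.run([merged_summaries, optimizer, cost],
                                    feed_dict={x: X_train, y: Y_train})
        writer.add_summary(summary, global_step=epoch)
    writer.close()
# Then run `tensorboard --logdir ./logs` to inspect the scalars and histograms.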
In [13]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from livelossplot import PlotLossesKeras
from keras import backend as K
In [14]:
K.clear_session()
print("Building model...")
print('Model variables: ', dims)
model = Sequential()
model.add(Dense(output_var, input_shape=(dims,)))
print(model.summary())
model.compile(optimizer='sgd', loss='mean_squared_error')
model.fit(X_train, Y_train, verbose=2,epochs=15)
Out[14]:
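Once the model is fit, it can be scored on the held-out test set (a small sketch using the variables defined in the cells above):
# Mean squared error on the test set (lower is better).
test_mse = model.evaluate(X_test, Y_test, verbose=0)
print('Test MSE:', test_mse)
# Predicted prices for the first five test houses.
print(model.predict(X_test[:5]))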
In [15]:
import keras.optimizers as opts
K.clear_session()
print("Building model...")
print('Model variables: ', dims)
model = Sequential()
model.add(Dense(output_var, input_shape=(dims,)))
op = opts.SGD(lr=learning_rate)
model.compile(loss='mean_squared_error',
              optimizer=op)
model.fit(X_train, Y_train,
          verbose=1,
          epochs=150,
          validation_data=(X_test, Y_test),
          callbacks=[PlotLossesKeras()])
Out[15]:
Simplicity is pretty impressive, right? :)
Now let's understand what we just did:
The core data structure of Keras is a model, a way to organize layers. The main type of model is the Sequential model, a linear stack of layers.
What we did here was stack a fully connected (Dense) layer of trainable weights from the input to the output, optionally with an Activation layer on top of the weights. The Dense signature and its main arguments are listed below; a small usage sketch follows the parameter list.
from keras.layers.core import Dense
Dense(units, activation=None, use_bias=True,
kernel_initializer='glorot_uniform', bias_initializer='zeros',
kernel_regularizer=None, bias_regularizer=None,
activity_regularizer=None, kernel_constraint=None, bias_constraint=None)
units
: int > 0. Dimensionality of the output space.
kernel_initializer
: name of the initialization function for the layer's weights (see initializations), or alternatively, a function to use for weight initialization. Only relevant if you don't pass a weights argument.
bias_initializer
: initializer for the bias vector.
activation
: name of the activation function to use (see activations), or alternatively, an element-wise function. If you don't specify anything, no activation is applied (i.e. "linear" activation: a(x) = x).
weights
: list of NumPy arrays to set as initial weights. The list should have 2 elements, of shape (input_dim, output_dim) and (output_dim,) for weights and biases respectively.
kernel_regularizer
: instance of a regularizer (e.g. L1 or L2 regularization), applied to the main weights matrix.
bias_regularizer
: instance of a regularizer, applied to the bias.
activity_regularizer
: instance of a regularizer, applied to the layer output.
kernel_constraint
: instance of the constraints module (e.g. max_norm, non_neg), applied to the main weights matrix.
bias_constraint
: instance of the constraints module, applied to the bias.
use_bias
: whether to include a bias (i.e. make the layer affine rather than linear).
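For instance, a Dense layer with its main arguments spelled out explicitly might look like this (an illustrative sketch, not code from this notebook):
from keras.layers import Dense
from keras import regularizers

# 64 units, ReLU activation, Glorot-uniform kernel init and an L2 weight penalty.
layer = Dense(64,
              activation='relu',
              kernel_initializer='glorot_uniform',
              bias_initializer='zeros',
              kernel_regularizer=regularizers.l2(0.01),
              use_bias=True)
Keras also provides several other core layers: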
keras.layers.core.Flatten()
keras.layers.core.Reshape(target_shape)
keras.layers.core.Permute(dims)
model = Sequential()
model.add(Permute((2, 1), input_shape=(10, 64)))
# now: model.output_shape == (None, 64, 10)
# note: `None` is the batch dimension
keras.layers.core.Lambda(function, output_shape=None, arguments=None)
keras.layers.core.ActivityRegularization(l1=0.0, l2=0.0)
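Lambda wraps an arbitrary expression as a layer. A minimal sketch (illustrative; the squaring function is just an example):
from keras.models import Sequential
from keras.layers import Lambda
from keras import backend as K

model = Sequential()
# Element-wise square of the input; the output shape equals the input shape.
model.add(Lambda(lambda v: K.square(v), input_shape=(10,)))
# now: model.output_shape == (None, 10)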
Credits: Yam Peleg (@Yampeleg)
from keras.layers.core import Activation
Activation(activation)
Supported Activations : [https://keras.io/activations/]
Advanced Activations: [https://keras.io/layers/advanced-activations/]
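As a small illustration (not one of the original cells), an activation can be attached either by name, as a separate Activation layer, or as an advanced-activation layer such as LeakyReLU:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.advanced_activations import LeakyReLU

model = Sequential()
model.add(Dense(64, input_shape=(dims,)))
model.add(Activation('tanh'))      # built-in activation referenced by name
model.add(Dense(64))
model.add(LeakyReLU(alpha=0.1))    # advanced activation used as its own layer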
If you need to, you can further configure your optimizer. A core principle of Keras is to make things reasonably simple, while allowing the user to be fully in control when they need to (the ultimate control being the easy extensibility of the source code). Here we used SGD (stochastic gradient descent) as an optimization algorithm for our trainable weights.
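For example, SGD can be configured with momentum, learning-rate decay and Nesterov updates before being passed to compile (a sketch; the hyperparameter values are arbitrary):
from keras.optimizers import SGD

# Momentum and decay values here are illustrative, not tuned for this problem.
sgd = SGD(lr=0.01, momentum=0.9, decay=1e-6, nesterov=True)
model.compile(optimizer=sgd, loss='mean_squared_error')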
What we did here is nice; however, in the real world it can easily overfit. Let's try to address that with a held-out validation set and early stopping.
In overfitting, a statistical model describes random error or noise instead of the underlying relationship. Overfitting occurs when a model is excessively complex, such as having too many parameters relative to the number of observations.
A model that has been overfit has poor predictive performance, as it overreacts to minor fluctuations in the training data.
To avoid overfitting, we will first split our training data into a training set and a validation set and monitor the model on the validation set. Next, we will use two of Keras's callbacks: EarlyStopping and ModelCheckpoint.
Let's first look at the model we implemented.
In [16]:
model.summary()
In [17]:
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
In [18]:
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.15, random_state=42)
fBestModel = 'best_model.h5'
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
best_model = ModelCheckpoint(fBestModel, verbose=0, save_best_only=True)
model.fit(X_train, Y_train, validation_data = (X_val, Y_val), epochs=50,
batch_size=128, verbose=True, callbacks=[best_model, early_stop])
Out[18]:
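Since ModelCheckpoint saved the best weights to best_model.h5, the best-scoring model can be reloaded and evaluated later (a sketch using the file name defined above):
from keras.models import load_model

best = load_model('best_model.h5')
print('Validation-best model, test MSE:', best.evaluate(X_test, Y_test, verbose=0))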
Q: How hard can it be to build a multi-layer fully connected network with Keras?
A: It is basically the same, just add more layers!
In [25]:
K.clear_session()
print("Building model...")
model = Sequential()
model.add(Dense(256, input_shape=(dims,),activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(output_var))
model.add(Activation('relu'))
model.compile(optimizer='sgd', loss='mean_squared_error')
model.summary()
In [26]:
model.fit(X_train, Y_train,
validation_data = (X_val, Y_val),
epochs=50,
callbacks=[PlotLossesKeras()])
Out[26]:
What does the behavior of the cost function over training in the plot above tell us?
Take a couple of minutes and try playing with the number of layers and the number of units per layer to get better results.
In [ ]:
K.clear_session()
print("Building model...")
model = Sequential()
model.add(Dense(256, input_shape=(dims,),activation='relu'))
# ...
# ...
# Play with it! Add as many layers as you want and try to get better results.
model.add(Dense(output_var))
model.add(Activation('relu'))
model.compile(optimizer='sgd', loss='mean_squared_error')
model.summary()
In [ ]:
model.fit(X_train, Y_train,
validation_data = (X_val, Y_val),
epochs=50,
callbacks=[PlotLossesKeras()])
Building a question answering system, an image classification model, a Neural Turing Machine, a word2vec embedder or any other model is just as fast. The ideas behind deep learning are simple, so why should their implementation be painful?