Setup


In [1]:
from __future__ import print_function, unicode_literals, absolute_import, division
from six.moves import range, zip, map, reduce, filter

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
import seaborn as sns
sns.set_style('whitegrid')
plt.rc('figure', figsize=(7.0, 5.0))

In [29]:
import keras
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback
from keras.utils import np_utils

In [5]:
def plot_callback(func, p=20):
    # Keras callback that redraws the figure produced by `func()` every `p` epochs,
    # giving a live view of the model while it trains.
    def plot_epoch_end(epoch, logs):
        if epoch == 0 or (epoch+1) % p == 0:
            plt.clf(); func(); plt.title('epoch %d' % (epoch+1))
            display.clear_output(wait=True); display.display(plt.gcf())
    def clear(*args):
        plt.clf()  # leave an empty figure once training is finished
    return LambdaCallback(on_epoch_end=plot_epoch_end, on_train_end=clear)

In [6]:
def plot_loss_acc(hist):
    # Plot the loss (log scale) and accuracy curves stored in a Keras History object;
    # validation curves are added when the model was fit with validation data.
    plt.figure(figsize=(15,4))
    if 'val_loss' not in hist.history:  # trained without a validation set
        plt.subplot(121); plt.semilogy(hist.epoch, hist.history['loss'])
        plt.xlabel('epoch'); plt.ylabel('loss'); plt.legend(['train'], loc='upper right')
        plt.subplot(122); plt.plot(hist.epoch, hist.history['acc'])
        plt.xlabel('epoch'); plt.ylabel('accuracy'); plt.legend(['train'], loc='lower right')
    else:
        plt.subplot(121); plt.semilogy(hist.epoch, hist.history['loss'], hist.epoch, hist.history['val_loss'])
        plt.xlabel('epoch'); plt.ylabel('loss'); plt.legend(['train','test'], loc='upper right')
        plt.subplot(122); plt.plot(hist.epoch, hist.history['acc'], hist.epoch, hist.history['val_acc'])
        plt.xlabel('epoch'); plt.ylabel('accuracy'); plt.legend(['train','test'], loc='lower right')

In [7]:
iris = sns.load_dataset("iris")
iris.sample(10)


Out[7]:
     sepal_length  sepal_width  petal_length  petal_width     species
65            6.7          3.1           4.4          1.4  versicolor
108           6.7          2.5           5.8          1.8   virginica
56            6.3          3.3           4.7          1.6  versicolor
67            5.8          2.7           4.1          1.0  versicolor
50            7.0          3.2           4.7          1.4  versicolor
31            5.4          3.4           1.5          0.4      setosa
133           6.3          2.8           5.1          1.5   virginica
102           7.1          3.0           5.9          2.1   virginica
87            6.3          2.3           4.4          1.3  versicolor
30            4.8          3.1           1.6          0.2      setosa

In [8]:
sns.pairplot(iris, hue='species');


Label Encoding


In [9]:
def label_encode(arr):
    uniques, ids = np.unique(arr, return_inverse=True)
    return ids

In [10]:
classes = ('setosa', 'versicolor', 'virginica')
labels  = label_encode(classes)
for i,c in enumerate(classes):
    print('%10s → %d' % (c, labels[i]))


    setosa → 0
versicolor → 1
 virginica → 2

In [11]:
def onehot_encode(arr):
    uniques, ids = np.unique(arr, return_inverse=True)
    return np_utils.to_categorical(ids, len(uniques))

In [12]:
classes = ('setosa', 'versicolor', 'virginica')
onehot = onehot_encode(classes)
for i,c in enumerate(classes):
    print('%10s → [%d,%d,%d]' % (c, onehot[i,0], onehot[i,1], onehot[i,2]))


    setosa → [1,0,0]
versicolor → [0,1,0]
 virginica → [0,0,1]

Multiclass Classification

Data subset (1 feature only)


In [13]:
data = iris
feature_name = 'petal_length'
data = data[[feature_name,'species']]

In [14]:
X    = data.values[:,0]
y    = label_encode(data.values[:,1])
y_oh = onehot_encode(data.values[:,1])
N    = len(y)

Plotting


In [15]:
R  = np.linspace(X.min()-1, X.max()+1, 100)  # feature grid for plotting predictions
Xp = np.zeros(X.shape[0]) - .1               # vertical offset for the data points
Rp = np.zeros(R.shape[0]) - .2               # vertical offset for the predicted classes
def plot_all(model=None):
    plt.scatter(X, Xp, c=y, cmap='jet')
    plt.xlabel(feature_name)
    if model is not None:
        prob = model.predict(R)           # class probabilities along the feature grid
        yhat = np.argmax(prob, axis=1)    # predicted class = most probable one
        plt.scatter(R, Rp, c=yhat)
        plt.plot(R, prob)
        leg = plt.legend(['p("%s")' % s for s in classes], loc='upper center', frameon=False, ncol=3)
    plt.xlim(X.min()-1.5, X.max()+1.5)
    plt.ylim(-.4, 1.2)
plot_all()



In [16]:
model = Sequential()
model.add(Dense(16, input_shape=(1,)))
model.add(Activation('tanh'))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 16)                32        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 51        
_________________________________________________________________
activation_2 (Activation)    (None, 3)                 0         
=================================================================
Total params: 83
Trainable params: 83
Non-trainable params: 0
_________________________________________________________________
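
The parameter counts follow directly from the layer sizes: a Dense layer with $n$ inputs and $m$ units has $n \cdot m$ weights plus $m$ biases. Here that gives $1 \cdot 16 + 16 = 32$ parameters for the hidden layer and $16 \cdot 3 + 3 = 51$ for the output layer, i.e. $83$ in total.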

Softmax

Converts arbitrary "scores" to normalized probabilities: $ \large \sigma(\mathbf{z})_i = \frac{\exp(z_i)}{\sum_j \exp(z_j)} $

Example: for $\mathbf{z} = [0.451, -0.599, 0.006]$, we get $\sigma(\mathbf{z}) = [ 0.50232021, 0.1757808 , 0.32189899]$.
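
A minimal NumPy sketch (not part of the original notebook, reusing the np import from the setup above) that reproduces this example:

def softmax(z):
    e = np.exp(z - np.max(z))  # subtract the max for numerical stability; the result is unchanged
    return e / e.sum()

softmax(np.array([0.451, -0.599, 0.006]))
# → array([0.50232021, 0.1757808 , 0.32189899])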

Cross entropy

$H(p, q) = \mathrm{E}_p[-\log q] = H(p) + D_{\mathrm{KL}}(p \| q)$ defines the cross entropy for distributions $p$ and $q$, where

  • $H(p)$ is the entropy of $p$, and
  • $D_{\mathrm{KL}}(p \| q)$ is the Kullback–Leibler divergence of $q$ from $p$.
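
For a one-hot target $p$ (as produced by onehot_encode above), the categorical cross-entropy used as the loss below reduces to the negative log-probability the model assigns to the true class. A hedged NumPy sketch, not part of the original notebook:

def categorical_crossentropy(p, q, eps=1e-12):
    # p: one-hot target distribution, q: predicted probabilities (e.g. a softmax output)
    return -np.sum(p * np.log(q + eps))

p = np.array([1., 0., 0.])             # true class: 'setosa'
q = np.array([0.502, 0.176, 0.322])    # softmax output from the example above
categorical_crossentropy(p, q)         # = -log(0.502) ≈ 0.689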

In [17]:
hist = model.fit(X,y_oh,batch_size=5,epochs=300,verbose=0,
                 callbacks=[plot_callback(lambda:plot_all(model))]);



In [18]:
plot_loss_acc(hist)


Full dataset (all 4 features)

Use all four features and split the dataset into train and test subsets:


In [19]:
N = iris.shape[0] # number of data points / table rows
data = iris.sample(N,replace=False) # shuffle data
X    = data.values[:,0:4]
y_oh = onehot_encode(data.values[:,4])

N_train = N//2 # random 50/50 train/test split
X_train, y_train = X[:N_train], y_oh[:N_train]
X_test,  y_test  = X[N_train:], y_oh[N_train:]
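
Since the split is purely random (not stratified), it can be worth checking that both halves contain all three classes. A quick check, assuming the variables defined above:

print('train:', np.bincount(np.argmax(y_train, axis=1), minlength=3))
print('test: ', np.bincount(np.argmax(y_test,  axis=1), minlength=3))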

In [20]:
model = Sequential()
model.add(Dense(16, input_shape=(4,)))
model.add(Activation('tanh'))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 51        
_________________________________________________________________
activation_4 (Activation)    (None, 3)                 0         
=================================================================
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________

In [21]:
hist = model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=200, verbose=0, batch_size=5)

In [22]:
plot_loss_acc(hist)



In [23]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print('train set: loss = %.5f, accuracy = %.5f' % (loss,accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('test  set: loss = %.5f, accuracy = %.5f' % (loss,accuracy))


train set: loss = 0.08024, accuracy = 0.98667
test  set: loss = 0.08079, accuracy = 0.98667

Overfitting


In [24]:
N_train = 20 # only 20 of 150 samples for training, rest for testing
X_train, y_train = X[:N_train], y_oh[:N_train]
X_test,  y_test  = X[N_train:], y_oh[N_train:]

In [25]:
model = Sequential()
model.add(Dense(16, input_shape=(4,)))
model.add(Activation('tanh'))
model.add(Dense(16))
model.add(Activation('tanh'))
model.add(Dense(16))
model.add(Activation('tanh'))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_5 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_6 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_7 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 51        
_________________________________________________________________
activation_8 (Activation)    (None, 3)                 0         
=================================================================
Total params: 675
Trainable params: 675
Non-trainable params: 0
_________________________________________________________________

In [26]:
hist = model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=1000, verbose=0, batch_size=5)

In [27]:
plot_loss_acc(hist)



In [28]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print('train set: loss = %.5f, accuracy = %.5f' % (loss,accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('test  set: loss = %.5f, accuracy = %.5f' % (loss,accuracy))


train set: loss = 0.00036, accuracy = 1.00000
test  set: loss = 0.36123, accuracy = 0.93846
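
The model now fits the 20 training samples perfectly while the test loss is about three orders of magnitude higher, the classic signature of overfitting. One common remedy (not used in this notebook) is to stop training once the monitored loss stops improving, e.g. with Keras' EarlyStopping callback; a hedged sketch:

from keras.callbacks import EarlyStopping

# note: ideally the monitored set would be a separate validation set, not the test set
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
hist = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                 epochs=1000, verbose=0, batch_size=5, callbacks=[early_stop])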