In [1]:
from __future__ import division
from __future__ import print_function

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
from IPython import display
from sklearn import datasets, preprocessing
import tensorflow as tf

In [2]:
(X, y) = datasets.make_circles(n_samples=1024, shuffle=True, noise=0.2, factor=0.4)
ind = np.logical_or(y==1, X[:,1] > X[:,0] - 0.5)
X = X[ind,:]
X = preprocessing.scale(X)
y = y[ind]
y = 2*y - 1
plt.scatter(X[:, 0], X[:, 1], cmap=plt.cm.Paired, c=y, edgecolors='black')

plt.show()



In [3]:
h = 0.01
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
def visualize(X, y, w, loss, n_iter):
    plt.clf()
    plt.figure(figsize=(20, 8))
    Z = classify(np.c_[xx.ravel(), yy.ravel()], w)
    Z = Z.reshape(xx.shape)
    plt.subplot(1,2,1)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='black');
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.subplot(1,2,2)
    plt.plot(loss)
    plt.grid()
    ymin, ymax = plt.ylim()
    plt.ylim(0, ymax)
    display.clear_output(wait=True)
    display.display(plt.gcf())

Your task starts here

First, let's write a function that predicts class for given X.

Since the problem above isn't linearly separable, we add quadratic features to the classifier. This transformation is implemented in the expand function.

Don't forget to expand X inside classify and the other functions.

Classifying a sample should not be much harder than computing the sign of a dot product.
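
For reference, the expand function below maps each 2-D sample to six features, in this column order:

$$ (x_1, x_2) \mapsto \left(x_1,\; x_2,\; x_1^2,\; x_2^2,\; x_1 x_2,\; 1\right) $$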


In [4]:
def expand(X):
    """Append quadratic features: [x1, x2] -> [x1, x2, x1^2, x2^2, x1*x2, 1]."""
    X0 = tf.transpose(tf.gather(tf.transpose(X), [0]))  # first column as an [n, 1] tensor
    X1 = tf.transpose(tf.gather(tf.transpose(X), [1]))  # second column as an [n, 1] tensor
    X_ = tf.concat([X, X ** 2, X0 * X1, tf.ones_like(X0)], axis=1)
    return X_

def classify(X, w):
    """
    Given feature matrix X [n_samples,2] and weight vector w [6],
    return an array of +1 or -1 predictions
    """
    X_ = expand(X)
    X_ = tf.cast(X_, w.dtype)
    dot_product = tf.matmul(X_, tf.reshape(w, shape=[6, 1]))   
    ones = tf.ones_like(dot_product)
    ans = tf.where(tf.greater(dot_product, 0.0), ones, -ones)
    
    with tf.Session() as sess:
        return sess.run(ans)
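
A quick sanity check of the two helpers (a minimal sketch for TF 1.x graph mode; the expected values follow from the feature order above):

with tf.Session() as sess:
    print(sess.run(expand(tf.constant([[1.0, 2.0]]))))                     # expect [[1. 2. 1. 4. 2. 1.]]
print(classify(np.array([[1.0, 2.0]]), np.ones(6, dtype=np.float32)))      # expect [[1.]]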

The loss you should try to minimize is the Hinge Loss:

$$ L = {1 \over N} \sum_{i=1}^N \max\left(0,\, 1 - y_i \cdot w^T x_i\right) $$
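
For compute_grad it helps to write out the (sub)gradient explicitly: a sample contributes nothing when its margin is at least 1, and $-y_i x_i$ otherwise (here $x_i$ is the expanded feature vector):

$$ \nabla_w L = -{1 \over N} \sum_{i=1}^N y_i x_i \cdot \mathbb{1}\left[y_i \cdot w^T x_i < 1\right] $$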

In [5]:
def compute_loss(X, y, w):
    """
    Given feature matrix X [n_samples,2], target vector [n_samples] of +1/-1,
    and weight vector w [6], compute scalar loss function using formula above.
    """
    X_ = expand(X)
    right_val = tf.ones_like(y) - tf.multiply(y, tf.matmul(X_, tf.reshape(w, [6,1])))
    return tf.reduce_mean(tf.maximum(tf.zeros_like(y), right_val))

    
def compute_grad(X, y, w):
    """
    Given feature matrix X [n_samples,2], target vector [n_samples] of +1/-1,
    and weight vector w [6], compute vector [6] of derivatives of L over each weight.
    """
    X_ = expand(X)
    margin = tf.multiply(y, tf.matmul(X_, tf.reshape(w, [6, 1])))  # y_i * w^T x_i, shape [n, 1]
    mask = tf.cast(tf.less(margin, 1.0), X_.dtype)                 # 1 where the hinge is active, else 0
    yx = tf.multiply(y, X_)                                        # y_i * x_i, shape [n, 6]
    return tf.reduce_mean(-mask * yx, axis=0)

Training

Find an optimal learning rate for gradient descent with the given batch size.

You can see an example of correct output below this cell before you run it.

Don't change the batch size!
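
For reference, each iteration below samples a mini-batch of batch_size points and takes a plain stochastic gradient step:

$$ w_{t+1} = w_t - \alpha \, \nabla_w L_{\text{batch}}(w_t) $$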


In [6]:
X_input = tf.placeholder(tf.float32, shape=(None, 2), name='X')
y_input = tf.placeholder(tf.float32, shape=(None, 1), name='y')  # fed as a column of +1/-1 labels

n_iter = 100
batch_size = 4

In [7]:
w = tf.Variable([1,0,0,0,0,0], dtype=tf.float32, name='w')

alpha = 0.1 # learning rate

loss_func = compute_loss(X_input, y_input, w)
grad_func = compute_grad(X_input, y_input, w)
update_w_func = tf.assign(w, w - alpha * grad_func)

loss = np.zeros(n_iter)
plt.figure(figsize=(12,5))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(n_iter):
        ind = random.sample(range(X.shape[0]), batch_size)
        
        loss[i] = sess.run(loss_func, feed_dict={X_input:X, y_input:y.reshape(-1, 1)})
        sess.run(update_w_func, feed_dict={X_input:X[ind], y_input:y[ind].reshape([batch_size, 1])})

    w_ans = w.eval()
        
visualize(X, y, w_ans, loss, n_iter)
plt.clf()



Implement gradient descent with momentum and test its performance for different learning rate and momentum values.
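
One common form of the momentum update, which the cell below follows (v is the velocity, mu the momentum coefficient):

$$ v_{t+1} = \mu \, v_t - \alpha \, \nabla_w L(w_t), \qquad w_{t+1} = w_t + v_{t+1} $$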


In [16]:
w = tf.Variable([1,0,0,0,0,0], dtype=tf.float32, name='w')
v = tf.Variable(tf.zeros_like(w))

alpha = 0.05 # learning rate
mu    = 0.5 # momentum

loss_func = compute_loss(X_input, y_input, w)
grad_func = compute_grad(X_input, y_input, w)
update_v_func = tf.assign(v, v * mu - alpha * grad_func)  # velocity update
update_w_func = tf.assign(w, w + update_v_func)           # running this op also runs the velocity update

loss = np.zeros(n_iter)
plt.figure(figsize=(12,5))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(n_iter):
        ind = random.sample(range(X.shape[0]), batch_size)
        
        loss[i] = sess.run(loss_func, feed_dict={X_input:X, y_input:y.reshape(-1, 1)})
        sess.run(update_w_func, feed_dict={X_input:X[ind], y_input:y[ind].reshape([batch_size, 1])})

    w_ans = w.eval()
        
visualize(X, y, w_ans, loss, n_iter)
plt.clf()



Same task but for Nesterov's accelerated gradient:
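
Nesterov's variant evaluates the gradient at the look-ahead point $w_t + \mu v_t$ instead of at $w_t$:

$$ v_{t+1} = \mu \, v_t - \alpha \, \nabla_w L(w_t + \mu v_t), \qquad w_{t+1} = w_t + v_{t+1} $$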


In [17]:
w = tf.Variable([1,0,0,0,0,0], dtype=tf.float32, name='w')
v = tf.Variable(tf.zeros_like(w))

alpha = 0.05 # learning rate
mu    = 0.5 # momentum

loss_func = compute_loss(X_input, y_input, w)
grad_func = compute_grad(X_input, y_input, w + mu * v)  # gradient at the look-ahead point w + mu*v
update_v_func = tf.assign(v, v * mu - alpha * grad_func)
update_w_func = tf.assign(w, w + update_v_func)

loss = np.zeros(n_iter)
plt.figure(figsize=(12,5))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(n_iter):
        ind = random.sample(range(X.shape[0]), batch_size)
        
        loss[i] = sess.run(loss_func, feed_dict={X_input:X, y_input:y.reshape(-1, 1)})
        sess.run(update_w_func, feed_dict={X_input:X[ind], y_input:y[ind].reshape([batch_size, 1])})

    w_ans = w.eval()
        
visualize(X, y, w_ans, loss, n_iter)
plt.clf()



Same task but for AdaGrad:
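
AdaGrad accumulates a per-weight sum of squared gradients and scales the step by its inverse square root (all operations element-wise):

$$ G_{t+1} = G_t + \left(\nabla_w L\right)^2, \qquad w_{t+1} = w_t - {\alpha \over \sqrt{G_{t+1}} + \epsilon} \, \nabla_w L $$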


In [10]:
w = tf.Variable([1,0,0,0,0,0], dtype=tf.float32, name='w')
cw = tf.Variable(tf.zeros_like(w))

alpha = 0.1 # learning rate
eps   = 1e-8  # A small constant for numerical stability

loss_func = compute_loss(X_input, y_input, w)
grad_func = compute_grad(X_input, y_input, w)
update_cw_func = tf.assign(cw, cw + tf.square(grad_func))
update_w_func = tf.assign(w, w - alpha / (tf.sqrt(update_cw_func) + eps) * grad_func)

loss = np.zeros(n_iter)
plt.figure(figsize=(12,5))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(n_iter):
        ind = random.sample(range(X.shape[0]), batch_size)
        
        loss[i] = sess.run(loss_func, feed_dict={X_input:X, y_input:y.reshape(-1, 1)})
        sess.run(update_w_func, feed_dict={X_input:X[ind], y_input:y[ind].reshape([batch_size, 1])})

    w_ans = w.eval()
        
visualize(X, y, w_ans, loss, n_iter)
plt.clf()



Same task but for AdaDelta:
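
The cell below keeps a decaying average of squared gradients while the learning rate stays fixed,

$$ E[g^2]_{t+1} = \beta \, E[g^2]_t + (1 - \beta) \left(\nabla_w L\right)^2, \qquad w_{t+1} = w_t - {\alpha \over \sqrt{E[g^2]_{t+1}} + \epsilon} \, \nabla_w L $$

which is the RMSProp-style simplification; full AdaDelta would additionally replace $\alpha$ with a decaying average of squared parameter updates.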


In [18]:
w = tf.Variable([1,0,0,0,0,0], dtype=tf.float32, name='w')
cw = tf.Variable(tf.ones_like(w))

alpha = 0.05 # learning rate
beta  = 0.9
eps   = 1e-8 # A small constant for numerical stability

loss_func = compute_loss(X_input, y_input, w)
grad_func = compute_grad(X_input, y_input, w)
update_cw_func = tf.assign(cw, cw * beta + (1 - beta) * tf.square(grad_func))
update_w_func = tf.assign(w, w - alpha / (tf.sqrt(update_cw_func) + eps) * grad_func)

loss = np.zeros(n_iter)
plt.figure(figsize=(12,5))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(n_iter):
        ind = random.sample(range(X.shape[0]), batch_size)
        
        loss[i] = sess.run(loss_func, feed_dict={X_input:X, y_input:y.reshape(-1, 1)})
        sess.run(update_w_func, feed_dict={X_input:X[ind], y_input:y[ind].reshape([batch_size, 1])})

    w_ans = w.eval()
        
visualize(X, y, w_ans, loss, n_iter)
plt.clf()



Same task, but for the Adam algorithm. You can start with beta = 0.9 and mu = 0.999.
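
The update used below, with $m$ stored in vw and $v$ in cw (beta plays the role of $\beta_1$, mu of $\beta_2$; the cell omits the bias-correction terms of the original paper):

$$ m_{t+1} = \beta_1 m_t + (1 - \beta_1) \nabla_w L, \quad v_{t+1} = \beta_2 v_t + (1 - \beta_2) \left(\nabla_w L\right)^2, \quad w_{t+1} = w_t - {\alpha \, m_{t+1} \over \sqrt{v_{t+1}} + \epsilon} $$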


In [12]:
w = tf.Variable([1,0,0,0,0,0], dtype=tf.float32, name='w')
vw = tf.Variable(tf.zeros_like(w))
cw = tf.Variable(tf.ones_like(w))

alpha = 0.1 # learning rate
beta  = 0.9  # (beta1 coefficient in original paper) exponential decay rate for the 1st moment estimates
mu    = 0.99  # (beta2 coefficient in original paper) exponential decay rate for the 2nd moment estimates
eps   = 1e-8  # A small constant for numerical stability

loss_func = compute_loss(X_input, y_input, w)
grad_func = compute_grad(X_input, y_input, w)
update_vw_func = tf.assign(vw, beta * vw + (1 - beta) * grad_func)
update_cw_func = tf.assign(cw, mu * cw + (1 - mu) * tf.square(grad_func))
update_w_func = tf.assign(w, w - alpha / (tf.sqrt(update_cw_func) + eps) * update_vw_func)

loss = np.zeros(n_iter)
plt.figure(figsize=(12,5))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(n_iter):
        ind = random.sample(range(X.shape[0]), batch_size)
        
        loss[i] = sess.run(loss_func, feed_dict={X_input:X, y_input:y.reshape(-1, 1)})
        sess.run(update_w_func, feed_dict={X_input:X[ind], y_input:y[ind].reshape([batch_size, 1])})

    w_ans = w.eval()
        
visualize(X, y, w_ans, loss, n_iter)
plt.clf()

