In [1]:
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np


Using gpu device 0: GRID K520

In [2]:
# Plotting setup: render matplotlib figures inline and pick sensible
# defaults for viewing image data.
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # no smoothing when zooming into images
plt.rcParams['image.cmap'] = 'gray'  # default colormap for single-channel images

In [3]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
  """
  Load CIFAR-10 from disk and preprocess it for a linear classifier.

  Steps: split into train/val/test, flatten images to rows, subtract the
  training-set mean image, append a bias feature, and transpose so each
  column is one example.

  Returns:
    X_train (3073, num_training), y_train (num_training,),
    X_val   (3073, num_validation), y_val, X_test (3073, num_test), y_test
  """
  # Load the raw CIFAR-10 data from disk.
  cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
  X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

  # Carve the validation split off the end of the training data,
  # then truncate the train and test sets to the requested sizes.
  val_slice = slice(num_training, num_training + num_validation)
  X_val, y_val = X_train[val_slice], y_train[val_slice]
  X_train, y_train = X_train[:num_training], y_train[:num_training]
  X_test, y_test = X_test[:num_test], y_test[:num_test]

  # Flatten each 32x32x3 image into a single row of 3072 values.
  X_train = X_train.reshape(X_train.shape[0], -1)
  X_val = X_val.reshape(X_val.shape[0], -1)
  X_test = X_test.reshape(X_test.shape[0], -1)

  # Zero-center every split using the training-set mean image.
  mean_image = np.mean(X_train, axis=0)
  X_train -= mean_image
  X_val -= mean_image
  X_test -= mean_image

  # Append a constant-1 bias feature, then transpose to column-per-example.
  X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
  X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
  X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T

  return X_train, y_train, X_val, y_val, X_test, y_test


# Invoke the above function to get our data.
# Data matrices are (3073, N): one column per example, 3072 pixels + bias.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape


Train data shape:  (3073, 49000)
Train labels shape:  (49000,)
Validation data shape:  (3073, 1000)
Validation labels shape:  (1000,)
Test data shape:  (3073, 1000)
Test labels shape:  (1000,)

In [4]:
def one_hot(x, n):
    """Convert integer class labels to a one-hot encoded float matrix.

    Args:
      x: array-like of class labels (list, tuple, or ndarray of any shape);
         values must lie in [0, n).
      n: number of classes (columns of the output).

    Returns:
      (len(x), n) ndarray with a single 1.0 per row at column x[i].
    """
    # np.asarray generalizes the original list-only special case to any
    # array-like; astype(int) guarantees valid fancy-index values even if
    # the labels arrive as a float array.
    x = np.asarray(x).flatten().astype(int)
    o_h = np.zeros((len(x), n))
    o_h[np.arange(len(x)), x] = 1
    return o_h

In [5]:
y_train_o_h = one_hot(y_train,10)
y_val_o_h = one_hot(y_val,10)
y_test_o_h = one_hot(y_test,10)
print 'Train labels shape: ', y_train_o_h.shape
print 'Train labels shape: ', y_val_o_h.shape
print 'Train labels shape: ', y_test_o_h.shape

X_train_t = np.transpose(X_train)
X_val_t = np.transpose(X_val)
X_test_t = np.transpose(X_test)
print 'Train data shape: ', X_train_t.shape
print 'Validation data shape: ', X_val_t.shape
print 'Test data shape: ', X_test_t.shape


Train labels shape:  (49000, 10)
Train labels shape:  (1000, 10)
Train labels shape:  (1000, 10)
Train data shape:  (49000, 3073)
Validation data shape:  (1000, 3073)
Test data shape:  (1000, 3073)

In [6]:
# Global random-number stream used by the dropout masks below.
srng = RandomStreams()

In [85]:
def floatX(X):
    """Cast an array-like to Theano's configured float dtype (avoids a copy when possible)."""
    return np.array(X, dtype=theano.config.floatX, copy=False)

def init_weights(shape, factor=0.00005):
    """Create a Theano shared weight matrix of small Gaussian noise scaled by `factor`."""
    noise = np.random.randn(*shape) * factor
    return theano.shared(floatX(noise))

def rectify(X):
    # ReLU nonlinearity: elementwise max(x, 0) on the symbolic tensor.
    return T.maximum(X, 0.0)

def softmax(X):
    # Numerically stable row-wise softmax: subtracting each row's max before
    # exponentiating prevents overflow, and leaves the result unchanged
    # because softmax is invariant to a constant shift per row.
    # dimshuffle(0, 'x') appends a broadcastable axis so the per-row (n,)
    # statistics broadcast against the (n, m) matrix.
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    """Build RMSprop update pairs for `params` minimizing `cost`.

    Each parameter keeps an exponential moving average of its squared
    gradient (decay `rho`); the gradient is divided by the root of that
    average (plus `epsilon` for numerical safety) before the step of size
    `lr` is taken.

    Returns a list of (shared_variable, new_value) update tuples for
    theano.function.
    """
    gradients = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, gradients):
        # Accumulator starts at zero with the same shape/dtype as the param.
        cache = theano.shared(param.get_value() * 0.0)
        cache_next = rho * cache + (1 - rho) * grad ** 2
        scaled_grad = grad / T.sqrt(cache_next + epsilon)
        updates.append((cache, cache_next))
        updates.append((param, param - lr * scaled_grad))
    return updates

def dropout(X, p=0.0):
    """Apply inverted dropout to symbolic tensor X with drop probability p.

    Units are kept with probability 1-p and the survivors are scaled up by
    1/(1-p), so no rescaling is needed at test time. With p <= 0 the input
    is returned untouched (used for the noise-free prediction graph).
    """
    if p <= 0:
        return X
    keep_prob = 1 - p
    mask = srng.binomial(X.shape, p=keep_prob, dtype=theano.config.floatX)
    return X * mask / keep_prob

def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    """Two-hidden-layer ReLU network with dropout.

    Dropout is applied to the input (p_drop_input) and after each hidden
    ReLU layer (p_drop_hidden); the output layer is a softmax.

    Returns (h, h2, py_x): the two hidden activations (post-dropout, as in
    the original graph) and the class-probability matrix.
    """
    X = dropout(X, p_drop_input)
    h = dropout(rectify(T.dot(X, w_h)), p_drop_hidden)
    h2 = dropout(rectify(T.dot(h, w_h2)), p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x

In [86]:
# Symbolic inputs: X is (batch, 3073) data, Y is (batch, 10) one-hot labels.
X = T.fmatrix()
Y = T.fmatrix()

# Two hidden layers of 500 units; 3073 = 32*32*3 pixels + bias feature.
w_h = init_weights((3073, 500))
w_h2 = init_weights((500, 500))
w_o = init_weights((500, 10))

# The noisy graph (input dropout 0.2, hidden dropout 0.5) is used for
# training; the clean graph (both 0.0) shares the weights and is used
# for prediction.
noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0.0, 0.0)
y_pred = T.argmax(py_x, axis=1)

In [87]:
# Mean cross-entropy over the batch, computed on the dropout (training) graph.
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.00001)

In [88]:
# Compile: `train` does one RMSprop step and returns the batch cost;
# `predict` returns argmax class labels from the noise-free graph.
# allow_input_downcast lets float64 numpy data be cast to float32 inputs.
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)
# This may take a minute or two.
# This may take a minute or two.

In [95]:
print "Initial Error:", np.mean(np.argmax(y_val_o_h, axis=1) == predict(X_val_t))
for i in range(100):
    for start, end in zip(range(0, len(X_train_t), 128), range(128, len(X_train_t), 128)):
        cost = train(X_train_t[start:end], y_train_o_h[start:end])
    print i, np.mean(np.argmax(y_val_o_h, axis=1) == predict(X_val_t))


Initial Error: 0.574
0 0.576
1 0.575
2 0.588
3 0.587
4 0.578
5 0.585
6 0.579
7 0.58
8 0.57
9 0.582
10 0.589
11 0.583
12 0.584
13 0.576
14 0.583
15 0.574
16 0.583
17 0.584
18 0.58
19 0.578
20 0.58
21 0.584
22 0.578
23 0.579
24 0.586
25 0.576
26 0.576
27 0.574
28 0.574
29 0.584
30 0.583
31 0.586
32 0.579
33 0.577
34 0.575
35 0.581
36 0.579
37 0.577
38 0.577
39 0.579
40 0.588
41 0.574
42 0.577
43 0.585
44 0.584
45 0.587
46 0.592
47 0.575
48 0.572
49 0.583
50 0.568
51 0.583
52 0.587
53 0.578
54 0.574
55 0.581
56 0.589
57 0.57
58 0.583
59 0.581
60 0.581
61 0.589
62 0.585
63 0.578
64 0.586
65 0.585
66 0.586
67 0.575
68 0.591
69 0.585
70 0.588
71 0.592
72 0.589
73 0.589
74 0.579
75 0.578
76 0.593
77 0.588
78 0.581
79 0.589
80 0.576
81 0.583
82 0.585
83 0.586
84 0.584
85 0.57
86 0.578
87 0.584
88 0.579
89 0.589
90 0.582
91 0.587
92 0.578
93 0.592
94 0.583
95 0.583
96 0.585
97 0.581
98 0.584
99 0.57

In [96]:
# Final test-set accuracy of the trained network (bare expression so the
# notebook displays it as the cell output).
np.mean(np.argmax(y_test_o_h, axis=1) == predict(X_test_t))


Out[96]:
0.58099999999999996

In [ ]: