In [1]:
import mxnet as mx
import numpy as np
import logging
import pprint
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
We will use a very simple fully connected MNIST network with one hidden layer and BatchNorm to demonstrate how to use the low-level API. The network looks like:
In [2]:
# we can use mx.sym as shorthand for mx.symbol
data = mx.sym.Variable("data")
fc1 = mx.sym.FullyConnected(data=data, num_hidden=128, name="fc1")
bn1 = mx.sym.BatchNorm(data=fc1, name="bn1")
act1 = mx.sym.Activation(data=bn1, name="act1", act_type="tanh")
fc2 = mx.sym.FullyConnected(data=act1, name="fc2", num_hidden=10)
softmax = mx.sym.Softmax(data=fc2, name="softmax")
# visualize the network
batch_size = 100
data_shape = (batch_size, 784)
mx.viz.plot_network(softmax, shape={"data":data_shape}, node_attrs={"shape":'oval',"fixedsize":'false'})
Out[2]: (rendered plot of the network graph)
We can use the simple_bind API to generate an executor from the symbol.
In [3]:
# the context here differs from ```mx.model```:
# mx.model wraps a parameter server and can span devices, but a single executor is bound to exactly ONE device
# run on cpu
ctx = mx.cpu()
# run on gpu
# ctx = mx.gpu()
# run on third gpu
# ctx = mx.gpu(2)
executor = softmax.simple_bind(ctx=ctx, data=data_shape, grad_req='write')
# The default ctx is CPU. The shape of data is required, and ```simple_bind``` will try to infer all other required shapes from it
# For an MLP, ```grad_req``` is 'write'; for an RNN it is different (gradients from unrolled steps may need to be accumulated with 'add')
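As a side note, we can also ask the symbol for the shapes that simple_bind inferred. The small sketch below uses the symbol's infer_shape method and only prints the inferred shapes.
In [ ]:
# optional sketch: print the shapes inferred from the data shape
arg_shapes, out_shapes, aux_shapes = softmax.infer_shape(data=data_shape)
print(dict(zip(softmax.list_arguments(), arg_shapes)))
print(dict(zip(softmax.list_outputs(), out_shapes)))
print(dict(zip(softmax.list_auxiliary_states(), aux_shapes)))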
After generating the executor, we can get its lists of inputs (args), outputs, etc. The order of these arrays follows the same sequence as the symbol's arguments.
In [4]:
args = executor.arg_dict
# Equivalently you could do this:
#args = dict(zip(softmax.list_arguments(), executor.arg_arrays))
grads = executor.grad_dict
aux_states = executor.aux_dict
# For outputs we need to assemble the dict by hand:
outputs = dict(zip(softmax.list_outputs(), executor.outputs))
# we can print the args we have
print("args: %s" % pprint.pformat(args))
print("-" * 20)
print("grads: %s" % pprint.pformat(grads))
print("-" * 20)
print("aux_states: %s" % pprint.pformat(aux_states))
print("-" * 20)
print("outputs: %s" % pprint.pformat(outputs))
The next step is to initialize the weights. We can set a weight directly by using mx.random or a numpy ndarray.
In [5]:
args['fc1_weight'][:] = mx.random.uniform(-0.07, 0.07, args['fc1_weight'].shape)
args['fc2_weight'][:] = np.random.uniform(-0.07, 0.07, args['fc2_weight'].shape) # equivalent
args['bn1_beta'][:] = 1.0
args['bn1_gamma'][:] = 1.0
args['fc1_bias'][:] = 0
args['fc2_bias'][:] = 0
# Don't initialize data or softmax_label
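If you prefer not to list every parameter by hand, the following sketch initializes everything in arg_dict by matching parameter names (a naming convention based on how the layers above were defined); it uses the same constants as the cell above.
In [ ]:
# sketch of generic initialization over arg_dict, matching the constants above
for name, arr in args.items():
    if name in ("data", "softmax_label"):
        continue  # inputs and labels are filled in during training
    if "weight" in name:
        arr[:] = mx.random.uniform(-0.07, 0.07, arr.shape)
    elif "gamma" in name or "beta" in name:
        arr[:] = 1.0
    else:
        arr[:] = 0.0  # biases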
Then we can customize our update rule. For demo purposes, we write a simple SGD update rule to show this MXNet feature.
In [6]:
def SGD(key, weight, grad, lr=0.1, grad_norm=batch_size):
    # key is the name of the weight, so we can customize the update rule per parameter
    # weight is the weight array
    # grad is the gradient array
    # lr is the learning rate
    # grad_norm is a scalar used to normalize the gradient, usually the batch_size
    norm = 1.0 / grad_norm
    # here we make the learning rate of biases 2 times larger than that of weights
    if "weight" in key or "gamma" in key:
        weight[:] -= lr * (grad * norm)
    elif "bias" in key or "beta" in key:
        weight[:] -= 2.0 * lr * (grad * norm)
    else:
        pass
Then we will make a data iterator. We can either use a built-in iterator to load from a binary file or build an iterator around numpy arrays.
For special cases, you are free to write your own iterator in Python; a minimal sketch is shown after the cell below.
In [7]:
# We use utility functions from sklearn to fetch the MNIST dataset
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle
mnist = fetch_mldata('MNIST original', data_home="./data")
# shuffle data
X, y = shuffle(mnist.data, mnist.target)
# split dataset
train_data = X[:50000, :].astype('float32')
train_label = y[:50000]
val_data = X[50000: 60000, :].astype('float32')
val_label = y[50000:60000]
# Normalize data
train_data[:] /= 256.0
val_data[:] /= 256.0
# Build iterator
train_iter = mx.io.NDArrayIter(data=train_data, label=train_label, batch_size=batch_size, shuffle=True)
val_iter = mx.io.NDArrayIter(data=val_data, label=val_label, batch_size=batch_size)
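For illustration, here is a minimal sketch of a hand-rolled Python iterator over numpy arrays. It only provides the .data and .label lists that the training loop below reads, so it is not a full mx.io.DataIter; the names SimpleBatch and numpy_iter are made up for this example.
In [ ]:
# a minimal sketch of a custom Python iterator over numpy arrays
class SimpleBatch(object):
    def __init__(self, data, label):
        self.data = [data]    # mimic the list layout of NDArrayIter batches
        self.label = [label]

def numpy_iter(data, label, batch_size):
    # drop the last incomplete batch for simplicity
    for start in range(0, data.shape[0] - batch_size + 1, batch_size):
        yield SimpleBatch(mx.nd.array(data[start:start + batch_size]),
                          mx.nd.array(label[start:start + batch_size]))

# usage: for dbatch in numpy_iter(train_data, train_label, batch_size): ...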
Also, we need to define a helper function to calculate the accuracy of the current training.
In [8]:
def Accuracy(label, pred_prob):
    pred = np.argmax(pred_prob, axis=1)
    return np.sum(label == pred) * 1.0 / label.shape[0]
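As a quick sanity check of the helper: each row of an identity matrix puts all probability mass on the matching class, so the accuracy should be 1.0.
In [ ]:
print(Accuracy(np.array([0, 1, 2]), np.eye(3)))  # expect 1.0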
Then we can train the network by using the executor.
In [9]:
num_round = 3
keys = softmax.list_arguments()
# we use an extra ndarray to save the output of the net
pred_prob = mx.nd.zeros(executor.outputs[0].shape)
for i in range(num_round):
    train_iter.reset()
    val_iter.reset()
    train_acc = 0.
    val_acc = 0.
    nbatch = 0.
    # train
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # copy data into args
        args["data"][:] = data # or we can ```data.copyto(args["data"])```
        args["softmax_label"][:] = label
        executor.forward(is_train=True)
        pred_prob[:] = executor.outputs[0]
        executor.backward()
        for key in keys:
            SGD(key, args[key], grads[key])
        train_acc += Accuracy(label.asnumpy(), pred_prob.asnumpy())
        nbatch += 1.
    logging.info("Finish training iteration %d" % i)
    train_acc /= nbatch
    nbatch = 0.
    # eval
    for dbatch in val_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        args["data"][:] = data
        executor.forward(is_train=False)
        pred_prob[:] = executor.outputs[0]
        val_acc += Accuracy(label.asnumpy(), pred_prob.asnumpy())
        nbatch += 1.
    val_acc /= nbatch
    logging.info("Train Acc: %.4f" % train_acc)
    logging.info("Val Acc: %.4f" % val_acc)
This simple example demonstrates how to directly use the symbolic API to build and train a neural net from scratch.