In [3]:
import numpy as np
In [4]:
class Node(object):
    def __init__(self, inbound_nodes=None):
        # Avoid a mutable default argument.
        self.inbound_nodes = inbound_nodes if inbound_nodes is not None else []
        self.value = None
        self.outbound_nodes = []
        self.gradients = {}
        # Register this node as an outbound node of each of its inputs.
        for node in self.inbound_nodes:
            node.outbound_nodes.append(self)

    def forward(self):
        # Each concrete subclass defines its own forward computation.
        raise NotImplementedError

    def backward(self):
        # Each concrete subclass defines its own backward computation.
        raise NotImplementedError
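A quick wiring check (a minimal sketch, not part of the original notebook): constructing a node with inbound nodes should link the graph in both directions.

In [ ]:
# Constructing `child` with `parent` as an inbound node should register
# `child` in `parent.outbound_nodes`.
parent = Node()
child = Node([parent])
assert child.inbound_nodes == [parent]
assert parent.outbound_nodes == [child]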
In [5]:
class Input(Node):
    def __init__(self):
        # An Input node has no inbound nodes.
        Node.__init__(self)

    def forward(self):
        pass  # Do nothing: an Input node computes nothing itself.

    def backward(self):
        # An Input node has no inputs, so its only gradient is the one
        # accumulated from its consumers with respect to itself.
        self.gradients = {self: 0}
        for n in self.outbound_nodes:
            grad_cost = n.gradients[self]
            self.gradients[self] += grad_cost * 1
In [6]:
class Linear(Node):
    def __init__(self, X, W, b):
        Node.__init__(self, [X, W, b])

    def forward(self):
        X = self.inbound_nodes[0].value
        W = self.inbound_nodes[1].value
        b = self.inbound_nodes[2].value
        self.value = np.dot(X, W) + b

    def backward(self):
        self.gradients = {n: np.zeros_like(n.value) for n in self.inbound_nodes}
        for n in self.outbound_nodes:
            grad_cost = n.gradients[self]
            # Gradient with respect to X: grad_cost . W^T
            self.gradients[self.inbound_nodes[0]] += np.dot(grad_cost, self.inbound_nodes[1].value.T)
            # Gradient with respect to W: X^T . grad_cost
            self.gradients[self.inbound_nodes[1]] += np.dot(self.inbound_nodes[0].value.T, grad_cost)
            # Gradient with respect to b: sum grad_cost over the batch axis.
            self.gradients[self.inbound_nodes[2]] += np.sum(grad_cost, axis=0, keepdims=False)
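The backward pass relies on the usual shape bookkeeping for a dense layer: with `X` of shape (m, n), `W` of shape (n, k), and an upstream gradient of shape (m, k), the gradient with respect to `X` is `grad . W^T`, with respect to `W` is `X^T . grad`, and with respect to `b` it sums over the batch axis. A small sketch with made-up arrays to confirm the shapes:

In [ ]:
X_demo = np.array([[1., 2.], [3., 4.]])   # shape (2, 2)
W_demo = np.array([[0.5], [-0.5]])        # shape (2, 1)
grad_demo = np.ones((2, 1))               # stand-in upstream gradient
print(np.dot(grad_demo, W_demo.T).shape)  # (2, 2), matches X
print(np.dot(X_demo.T, grad_demo).shape)  # (2, 1), matches W
print(np.sum(grad_demo, axis=0).shape)    # (1,), matches b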
In [7]:
class Sigmoid(Node):
    def __init__(self, node):
        Node.__init__(self, [node])

    def _sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    def forward(self):
        input_value = self.inbound_nodes[0].value
        self.value = self._sigmoid(input_value)

    def backward(self):
        self.gradients = {n: np.zeros_like(n.value) for n in self.inbound_nodes}
        for n in self.outbound_nodes:
            grad_cost = n.gradients[self]
            sigmoid = self.value
            self.gradients[self.inbound_nodes[0]] += grad_cost * sigmoid * (1. - sigmoid)
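`Sigmoid.backward` uses the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), which is why the saved forward value is enough to compute the gradient. A quick central-difference check (an ad hoc sketch, not from the original notebook):

In [ ]:
def _sig(x):
    return 1. / (1. + np.exp(-x))

x0, eps = 0.7, 1e-6
numeric = (_sig(x0 + eps) - _sig(x0 - eps)) / (2 * eps)
analytic = _sig(x0) * (1 - _sig(x0))
assert np.isclose(numeric, analytic)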
In [8]:
class MSE(Node):
    def __init__(self, y, a):
        """
        The mean squared error cost function.
        Should be used as the last node for a network.
        """
        # Call the base class' constructor.
        Node.__init__(self, [y, a])

    def forward(self):
        """
        Calculates the mean squared error.
        """
        # NOTE: We reshape these to avoid possible matrix/vector broadcast
        # errors.
        #
        # For example, if we subtract an array of shape (3,) from an array
        # of shape (3, 1) we get an array of shape (3, 3) as the result,
        # when we want an array of shape (3, 1) instead.
        #
        # Making both arrays (3, 1) ensures the result is (3, 1) and does
        # an elementwise subtraction as expected.
        y = self.inbound_nodes[0].value.reshape(-1, 1)
        a = self.inbound_nodes[1].value.reshape(-1, 1)
        self.m = self.inbound_nodes[0].value.shape[0]
        # Save the computed difference for the backward pass.
        self.diff = y - a
        self.value = np.mean(self.diff**2)

    def backward(self):
        """
        Calculates the gradient of the cost.
        This is the final node of the network, so outbound nodes
        are not a concern.
        """
        self.gradients[self.inbound_nodes[0]] = (2 / self.m) * self.diff
        self.gradients[self.inbound_nodes[1]] = (-2 / self.m) * self.diff
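Since the cost is C = (1/m) * sum((y - a)^2), the gradients are dC/dy = (2/m)(y - a) and dC/da = -(2/m)(y - a), matching the two lines above. A numeric spot check with made-up values (not part of the original notebook):

In [ ]:
y0 = np.array([[1.], [2.]])
a0 = np.array([[0.5], [1.5]])
m = y0.shape[0]
eps = 1e-6
# Perturb one element of a and compare against the analytic gradient.
a_plus = a0.copy(); a_plus[0, 0] += eps
a_minus = a0.copy(); a_minus[0, 0] -= eps
numeric = (np.mean((y0 - a_plus)**2) - np.mean((y0 - a_minus)**2)) / (2 * eps)
assert np.isclose(numeric, (-2 / m) * (y0 - a0)[0, 0])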
In [9]:
def topological_sort(feed_dict):
    """
    Sort the nodes in topological order using Kahn's Algorithm.
    `feed_dict`: A dictionary where the key is an `Input` node and the
    value is the respective value to feed to that node.
    Returns a list of sorted nodes.
    """
    input_nodes = [n for n in feed_dict.keys()]
    G = {}
    nodes = [n for n in input_nodes]
    while len(nodes) > 0:
        n = nodes.pop(0)
        if n not in G:
            G[n] = {'in': set(), 'out': set()}
        for m in n.outbound_nodes:
            if m not in G:
                G[m] = {'in': set(), 'out': set()}
            G[n]['out'].add(m)
            G[m]['in'].add(n)
            nodes.append(m)

    L = []
    S = set(input_nodes)
    while len(S) > 0:
        n = S.pop()
        if isinstance(n, Input):
            n.value = feed_dict[n]
        L.append(n)
        for m in n.outbound_nodes:
            G[n]['out'].remove(m)
            G[m]['in'].remove(n)
            # If m has no other incoming edges, add it to S.
            if len(G[m]['in']) == 0:
                S.add(m)
    return L


def forward_and_backward(graph):
    """
    Performs a forward pass and a backward pass through a list of sorted nodes.
    Arguments:
        `graph`: The result of calling `topological_sort`.
    """
    # Forward pass
    for n in graph:
        n.forward()

    # Backward pass
    # see: https://docs.python.org/2.3/whatsnew/section-slices.html
    for n in graph[::-1]:
        n.backward()
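The ordering guarantee is that every node appears after all of its inbound nodes, so a single left-to-right sweep computes valid forward values. A tiny demonstration (throwaway nodes, not part of the original notebook):

In [ ]:
x_in = Input()
s = Sigmoid(x_in)
order = topological_sort({x_in: np.array([0.])})
assert order.index(x_in) < order.index(s)  # input sorts before its consumer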
In [11]:
X, W, b = Input(), Input(), Input()
y = Input()
f = Linear(X, W, b)
a = Sigmoid(f)
cost = MSE(y, a)

# Set values for the Input nodes.
X_ = np.array([[-1., -2.], [-1., -2.]])
W_ = np.array([[2.], [3.]])
b_ = np.array([-3.])
y_ = np.array([1, 2])

feed_dict = {
    X: X_,
    y: y_,
    W: W_,
    b: b_,
}
In [12]:
graph = topological_sort(feed_dict)
forward_and_backward(graph)
# Return the gradients for each Input node.
gradients = [t.gradients[t] for t in [X, y, W, b]]

"""
Expected output

[array([[ -3.34017280e-05,  -5.01025919e-05],
        [ -6.68040138e-05,  -1.00206021e-04]]),
 array([[ 0.9999833],
        [ 1.9999833]]),
 array([[  5.01028709e-05],
        [  1.00205742e-04]]),
 array([ -5.01028709e-05])]
"""
print(gradients)
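As an extra verification (not in the original notebook), the analytic gradient for `W` can be compared against a central-difference estimate; `cost_at` below is an ad hoc helper for this sketch.

In [ ]:
def cost_at(W_val):
    # Re-run only the forward pass with a perturbed weight matrix.
    W.value = W_val
    for n in graph:
        n.forward()
    return cost.value

eps = 1e-6
W_plus = W_.copy(); W_plus[0, 0] += eps
W_minus = W_.copy(); W_minus[0, 0] -= eps
numeric = (cost_at(W_plus) - cost_at(W_minus)) / (2 * eps)
print(numeric, gradients[2][0, 0])  # these should match closely
cost_at(W_)  # restore the original weights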