In Part 1, we created the additional features (x^2, x^3) ourselves. Wouldn't it be nice if we could instead create activation functions that do just that, and let the neural network decide the connection weights during training?
In [200]:
import torch
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [216]:
# Setup the training and test tensors
# Let's generate 400 examples
N = 400
x = np.random.uniform(low=-75, high=100, size=N)
y = 2*x
X_tensor = Variable(torch.FloatTensor(x), requires_grad=False)
y_tensor = Variable(torch.FloatTensor(y), requires_grad=False)
# Test set initialization
x_test = np.array([-2.5, 0.0, 19])
X_test_tsr = Variable(torch.FloatTensor(x_test), requires_grad=False)
# Normalized features
X_min = torch.min(X_tensor)
X_max = torch.max(X_tensor)
X_mean = torch.mean(X_tensor)
X_sub_mean = X_tensor-X_mean.expand_as(X_tensor)
X_max_min = X_max-X_min + 1e-7
X_norm_tsr = X_sub_mean/X_max_min.expand_as(X_sub_mean)
X_test_sub_mean = X_test_tsr-X_mean.expand_as(X_test_tsr)
X_test_norm_tsr = X_test_sub_mean/X_max_min.expand_as(X_test_sub_mean)
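As a quick sanity check (not part of the original run), the normalized feature should be roughly zero-mean and confined to about [-0.5, 0.5], since we divide the mean-centered values by the full range:

# Hypothetical verification cell for the normalization above
print("mean: ", torch.mean(X_norm_tsr).data[0])
print("min:  ", torch.min(X_norm_tsr).data[0])
print("max:  ", torch.max(X_norm_tsr).data[0])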
In [202]:
# Implement version-2 neural network
import math
from time import time
from collections import OrderedDict
def RunV2NNTraining(X, y, model, learning_rate=1e-5, epochs=5000, batch_size=None, X_test=None,
                    use_optimizer=None, adam_betas=(0.9, 0.999)):
    # Neural Net
    X_size = X.size()
    N = X_size[0]
    loss_fn = torch.nn.MSELoss(size_average=True)
    # Choose Optimizer
    optimizer = None
    if use_optimizer:
        if use_optimizer == 'SGD':
            optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'Adam':
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=adam_betas)
        elif use_optimizer == 'Adadelta':
            optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'ASGD':
            optimizer = torch.optim.ASGD(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'RMSprop':
            optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'Adagrad':
            optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
        else:
            print("Invalid Optimizer")
            use_optimizer = None
    losses = []
    loss = None
    start_time = time()
    for t in range(epochs):
        num_batches = 1
        X_batch = None
        y_batch = None
        if batch_size:
            num_batches = math.ceil(N/batch_size)
        else:
            batch_size = N
        shuffle = torch.randperm(N)
        for b in range(num_batches):
            # Slice a mini-batch out of the shuffled indices
            lower_index = b*batch_size
            upper_index = min(lower_index+batch_size, N)
            indices = shuffle[lower_index:upper_index]
            X_batch = X[indices]
            y_batch = y[indices]
            # Forward pass and loss
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            if use_optimizer:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            else:
                # Zero the gradients before running the backward pass.
                model.zero_grad()
                loss.backward()
                # Update the weights using gradient descent. Each parameter is a Variable, so
                # we can access its data and gradients like we did before.
                for param in model.parameters():
                    param.data -= learning_rate * param.grad.data
        losses.append(loss.data[0])
    end_time = time()
    time_taken = end_time - start_time
    print("Time Taken = %.2f seconds " % time_taken)
    print("Final Loss: ", loss.data[0])
    print("Parameters [w_1, w_2, w_3, b]: ")
    for name, param in model.named_parameters():
        print(name)
        print(param.data)
    # plot Loss vs Iterations
    plt.plot(losses)
    plt.title('Loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.show()
    # Predictions on Test set
    if X_test is not None:
        print("Test:")
        print("X_test: ", X_test.data)
        print("y_pred: ", model(X_test))

def GetV2NNLoss(X, y, model):
    loss_fn = torch.nn.MSELoss(size_average=True)
    y_pred = model(X)
    loss = loss_fn(y_pred, y)
    return loss.data[0]
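Before wiring up the power activations, the helper can be smoke-tested on a plain linear model. This is just an illustrative sketch (the model, epoch count, and learning rate here are arbitrary choices, not from the runs below); leaving use_optimizer unset exercises the manual gradient-descent branch:

# Hypothetical smoke test of RunV2NNTraining on a plain Linear(1, 1) model
lin_model = torch.nn.Linear(1, 1)
RunV2NNTraining(X=X_norm_tsr.view(-1, 1), y=y_tensor.view(-1, 1), model=lin_model,
                epochs=200, learning_rate=1e-1)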
In [203]:
class PowerNet(torch.nn.Module):
    # Computes (w*x + b)^n with a learnable scalar weight and bias
    def __init__(self, n):
        super(PowerNet, self).__init__()
        self.n = n
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        return self.linear(x).pow(self.n)

# Mask used later to switch individual power activations on/off
Pow123Net_Mask = Variable(torch.FloatTensor([1.0, 1.0, 1.0]), requires_grad=False)

class Pow123Net(torch.nn.Module):
    def __init__(self):
        super(Pow123Net, self).__init__()
        self.p1 = PowerNet(1)
        self.p2 = PowerNet(2)
        self.p3 = PowerNet(3)

    def forward(self, x):
        x1 = self.p1.forward(x)
        x2 = self.p2.forward(x)
        x3 = self.p3.forward(x)
        xc = torch.cat((x1, x2, x3), 1)
        return xc*Pow123Net_Mask.expand_as(xc)
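To see what Pow123Net actually emits, a quick hypothetical check (the probe values below are arbitrary): for an input column of shape (N, 1) it should return an (N, 3) output whose columns are the three masked power activations.

# Hypothetical shape check for Pow123Net
probe = Variable(torch.FloatTensor([[1.0], [2.0], [3.0]]), requires_grad=False)
probe_net = Pow123Net()
print(probe_net(probe).size())   # expect torch.Size([3, 3]): one column per power activation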
In [206]:
# use_optimizer can be Adam, RMSprop, Adadelta, ASGD, SGD, Adagrad
model = torch.nn.Sequential(OrderedDict([
    ("Pow123Net", Pow123Net()),
    ("FC", torch.nn.Linear(3, 1)),
]))
RunV2NNTraining(X=X_tensor.view(-1,1), y=y_tensor, model=model, batch_size=None, epochs=25000, learning_rate=5e-3,
                X_test=X_test_tsr.view(-1,1), use_optimizer='Adam')
In [217]:
# Now, how do we find the equation the network has learned?
# One way is to mask each activation in turn and see its effect on the loss
print("Final Loss: ", GetV2NNLoss(X=X_tensor.view(-1,1), y=y_tensor, model=model))
# mask f(x) = x
Pow123Net_Mask[0] = 0.0
print("Loss with x masked: ", GetV2NNLoss(X=X_tensor.view(-1,1), y=y_tensor, model=model))
Pow123Net_Mask[0] = 1.0
# mask f(x) = x^2
Pow123Net_Mask[1] = 0.0
print("Loss with x^2 masked: ", GetV2NNLoss(X=X_tensor.view(-1,1), y=y_tensor, model=model))
Pow123Net_Mask[1] = 1.0
# mask f(x) = x^3
Pow123Net_Mask[2] = 0.0
print("Loss with x^3 masked: ", GetV2NNLoss(X=X_tensor.view(-1,1), y=y_tensor, model=model))
Pow123Net_Mask[2] = 1.0
# Masking x^2 and x^3 barely changes the loss, so those activations are clearly not important
# Now what is the final equation?
p1_w = None
p1_b = None
fc1_w = None
fc_b = None
for name, param in model.named_parameters():
    if name == 'Pow123Net.p1.linear.weight':
        p1_w = param.data[0]
    if name == 'Pow123Net.p1.linear.bias':
        p1_b = param.data[0]
    if name == 'FC.weight':
        fc1_w = param.data[0, 0]
    if name == 'FC.bias':
        fc_b = param.data[0]
coeff_x = p1_w*fc1_w
const = p1_b*fc1_w+fc_b
print("Finally the equation is y = ",coeff_x[0],"*x + ", const)
print("Pretty close to y = 2*x")
In [223]:
model = torch.nn.Sequential(OrderedDict([
    ("Pow123Net", Pow123Net()),
    ("FC", torch.nn.Linear(3, 1)),
]))
RunV2NNTraining(X=X_norm_tsr.view(-1,1), y=y_tensor, model=model, batch_size=None, epochs=25000, learning_rate=1e-1,
                X_test=X_test_norm_tsr.view(-1,1), use_optimizer='Adam')
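With normalized inputs the network learns an equation in x_norm = (x - mean)/(max - min), so to express it in terms of the original x the coefficient and constant have to be un-scaled: y ≈ a*x_norm + c becomes y ≈ (a/range)*x + (c - a*mean/range). A sketch of that conversion, reusing the extraction pattern from the previous cell and, as before, assuming the masking check again shows the x^2 and x^3 branches are negligible:

# Hypothetical un-normalization of the learned equation (linear branch only)
params = dict((name, param.data) for name, param in model.named_parameters())
a = params['Pow123Net.p1.linear.weight'][0, 0] * params['FC.weight'][0, 0]
c = params['Pow123Net.p1.linear.bias'][0] * params['FC.weight'][0, 0] + params['FC.bias'][0]
range_ = X_max_min.data[0]
mean_ = X_mean.data[0]
print("In terms of the original x: y = ", a/range_, "*x + ", c - a*mean_/range_)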