In [ ]:
# See: http://pytorch.org/tutorials/beginner/pytorch_with_examples.html#pytorch-custom-nn-modules
import numpy as np
import torch
from torch.autograd import Variable
In [ ]:
batch_size, d = 32, 5 # d is size of input
use_cuda = torch.cuda.is_available()
t_long = torch.LongTensor
t_float = torch.FloatTensor
if use_cuda:
    t_long = torch.cuda.LongTensor
    t_float = torch.cuda.FloatTensor
In [ ]:
x_np = np.random.randint( d, size=(batch_size,1) )
# Create Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable( torch.from_numpy(x_np).type( t_long ) )
y = Variable( torch.from_numpy(x_np).type( t_long ).squeeze(), requires_grad=False)
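In [ ]:
# Sanity check (not in the original notebook): x holds integer class indices of shape
# (batch_size, 1), and y is the same set of indices flattened to (batch_size,) -- so the
# model's task below is simply to reproduce its input index at the output.
print(x.size(), y.size())
print(x.data[:5].view(-1))
print(y.data[:5])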
In [ ]:
def sample_gumbel(input):
    noise = torch.rand(input.size())
    eps = 1e-20
    # Gumbel noise: -log(-log(U + eps) + eps), built up by two in-place add/log/negate passes
    noise.add_(eps).log_().neg_()
    noise.add_(eps).log_().neg_()
    return Variable(noise).type(t_float)
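In [ ]:
# A sketch that is not in the original notebook: the 'Gumbel-max trick'.  Taking the argmax of
# (logits + Gumbel noise) draws samples from the softmax distribution over the logits, which is
# why adding sample_gumbel() noise to the logits gives stochastic 'hard' selections.  The logits
# chosen here are arbitrary; the sampled frequencies should roughly match the softmax probabilities.
logits_demo = torch.FloatTensor([2.0, 1.0, 0.1, 0.1, 0.1]).type(t_float)
probs = torch.exp(logits_demo)
probs = probs / probs.sum()

n_samples = 10000
batch_logits = Variable( logits_demo.view(1, -1).expand(n_samples, d) )
samples = batch_logits + sample_gumbel(batch_logits)
idx = samples.data.max(1)[1].view(-1).cpu()

counts = torch.zeros(d)
for i in range(n_samples):
    counts[ idx[i] ] += 1
print(probs)
print(counts / n_samples)   # Should be close to the softmax probabilities above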
In [ ]:
class SelectorNet(torch.nn.Module):
    def __init__(self, D):
        """
        In the constructor we instantiate nn modules and assign them as member variables.
        """
        super(SelectorNet, self).__init__()
        self.input_space = torch.FloatTensor(batch_size, D).type(t_float)
        self.linear = torch.nn.Linear(D, D)  # The weights in here should become the identity

    def forward(self, i, fuzziness=1.0):
        """
        In the forward function we accept a Variable of input data and we must return
        a Variable of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Variables.
        """
        # Convert the input 'i' into a one-hot vector
        self.input_space.zero_()
        self.input_space.scatter_(1, i.data, 1.)
        x = Variable( self.input_space )

        logits = self.linear(x)

        # These two seem to have problems with vanishing gradients (==0 on LHS)
        #logits = logits.clamp(min=0)
        #logits = torch.nn.ReLU()(logits)

        # Works fine
        #logits = torch.nn.LeakyReLU()(logits)

        # Soft version
        #action = logits

        # Do nothing else to get pure flow-through behaviour (soft attention, though)
        action = logits.clone()

        # Hard op variants:
        if False:  # This gives appropriate action(s) but no learning (?)
            y_max, y_idx = torch.max(logits, 1, keepdim=True)
            #print(action.size(), y_idx.size())
            action[:,:] = 0.
            action.scatter_(1, y_idx, 5.)
            #print(y_idx, action)

        if False:  # This gives appropriate action(s) and learns something, sometimes
            y_max, y_idx = torch.max(logits, 1, keepdim=True)
            action[:,:] = 0.
            action.scatter_(1, y_idx, y_max)
            #print(action)

        if False:
            gumbel = sample_gumbel(logits)
            y_max, y_idx = torch.max(logits + gumbel*fuzziness, 1, keepdim=True)
            action[:,:] = 0.
            #action.scatter_(1, y_idx, y_max)
            action.scatter_(1, y_idx, y_max+5.0)
            #print(action)

        if True:
            gumbel = sample_gumbel(logits)
            action = action + gumbel*fuzziness

        return action  # This is a (batch_size, d) matrix
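In [ ]:
# Quick smoke test (not part of the original notebook): one forward pass with untrained weights.
# With fuzziness=0.0 the output is just the linear layer applied to the one-hot input row;
# with the default fuzziness=1.0, Gumbel noise is added, so repeated calls give different argmaxes.
# 'model_demo' is a throwaway instance, separate from the 'model' trained below.
model_demo = SelectorNet(d)
if use_cuda: model_demo = model_demo.cuda()
out = model_demo(x, fuzziness=0.0)
print(out.size())               # Should be (batch_size, d)
print(out.data.max(1)[1][:5])   # Predicted indices for the first five inputs (untrained, so arbitrary)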
In [ ]:
model = SelectorNet(d)
if use_cuda: model = model.cuda()

# Construct our loss function and an Optimizer.  The call to model.parameters()
# in the optimizer constructor gathers the learnable parameters of the nn modules
# which are members of the model.
# NB: NLLLoss applied to the raw 'action' values (rather than log-probabilities)
#     just computes minus the mean of the entry at the target index, so minimising it
#     pushes the target entry of each row upwards.
criterion = torch.nn.NLLLoss(size_average=True)
#optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-1)

for t in range(100):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred_train = model(x)
    y_pred_train_idx = y_pred_train.data.max(1)[1]
    correct_train = y_pred_train_idx.eq(y.data).cpu().sum()

    # Evaluate with the Gumbel noise switched off
    y_pred_test = model(x, fuzziness=0.0)
    y_pred_test_idx = y_pred_test.data.max(1)[1]
    correct_test = y_pred_test_idx.eq(y.data).cpu().sum()

    loss = criterion(y_pred_train, y)

    if (t+1) % 1 == 0:  # i.e. report every iteration
        print("%4d %+6.2f %6.2f%% %6.2f%%" %
              (t+1, loss.data[0], correct_train*100./batch_size, correct_test*100./batch_size,))

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(model.linear.weight.data,)  # model.linear.bias.data
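In [ ]:
# Follow-up check, not in the original notebook: per the comment on self.linear above, a fully
# trained selector should have a weight matrix whose largest entry in each row sits on the
# diagonal (i.e. roughly a scaled identity), so the argmax over each row should read 0..d-1.
w = model.linear.weight.data.cpu()
print(w.max(1)[1].view(-1))   # Ideally 0, 1, ..., d-1 in order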
In [ ]:
a = Variable(torch.rand(5, 3), requires_grad=True)
a = a.clone()  # Otherwise we would be modifying a leaf Variable in-place
print(a)
ind = Variable(torch.LongTensor([3]))
a.index_fill_(0, ind, 0)
print(a)
a[1, :] = 0
print(a)
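In [ ]:
# A minimal follow-up sketch (not in the original cell): keep a handle on the leaf Variable so we
# can confirm that gradients still flow back through the clone, with zeros at the positions that
# were overwritten in-place.
leaf = Variable(torch.rand(5, 3), requires_grad=True)
b = leaf.clone()
b.index_fill_(0, Variable(torch.LongTensor([3])), 0)
b[1, :] = 0
b.sum().backward()
print(leaf.grad)   # Ones everywhere except rows 1 and 3, which were overwritten and get zero gradient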