Linear regression from scratch

Generating random data


In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
from tqdm import tqdm
import mxnet as mx
from mxnet import gluon

In [2]:
data_ctx = mx.cpu()
model_ctx = mx.cpu()
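
Both contexts point at the CPU. If a GPU is available (and the installed MXNet build supports CUDA), a sketch like the following would place the model there instead; it assumes mx.context.num_gpus() exists in your MXNet version:

# Hypothetical alternative: prefer the first GPU when one is present
model_ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()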

In [3]:
num_inputs = 2
num_outputs = 1
num_examples = 10000

In [4]:
w1_true = 2
w2_true = -3.4
b_true = 4.2

In [5]:
# Defining the ground-truth function whose parameters we want to recover
def real_fn(X):
    # y = 2 * x1 - 3.4 * x2 + 4.2
    return w1_true * X[:, 0] + w2_true * X[:, 1] + b_true

In [6]:
# Generating random X
X = mx.nd.random_normal(shape=(num_examples, num_inputs),
                        ctx=data_ctx)

In [7]:
# Generating Gaussian noise with standard deviation 0.1
noise = 0.1 * mx.nd.random_normal(shape=(num_examples, ), ctx=data_ctx)
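
X and noise are drawn fresh on every run, so the exact numbers below will differ between executions. To make the draws reproducible, MXNet's global seed could be set before these two cells, e.g.:

# Optional: fix the global random seed before generating X and noise
mx.random.seed(42)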

In [8]:
noise.shape


Out[8]:
(10000,)

In [9]:
# Generating Y
y = real_fn(X) + noise

In [10]:
print(X[0])
print(y[0])


[2.2122064 0.7740038]
<NDArray 2 @cpu(0)>

[6.053678]
<NDArray 1 @cpu(0)>

In [11]:
# Recomputing y[0] without the noise term (w2_true is already negative,
# so it is added, not subtracted)
print(w1_true * X[0, 0] + w2_true * X[0, 1] + b_true)


[5.9928]
<NDArray 1 @cpu(0)>

Data iterator


In [12]:
# Defining batch_size
batch_size = 4

In [13]:
# Creating a data iterator
train_data = gluon.data.DataLoader(gluon.data.ArrayDataset(X, y),
                                   batch_size=batch_size, 
                                   shuffle=True)
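
ArrayDataset pairs each row of X with the corresponding entry of y; DataLoader then handles batching and shuffling. As a quick illustration (not part of the original run), indexing the dataset returns one (features, target) tuple:

# Indexing an ArrayDataset yields a single (x, y) pair
dataset = gluon.data.ArrayDataset(X, y)
print(dataset[0])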

In [14]:
# Getting a single batch
for i, (data, label) in enumerate(train_data):
    print(data, label)
    break


[[-0.03586762 -0.72321445]
 [ 0.39837354  1.3839029 ]
 [-0.05032287 -0.19343433]
 [-0.44942716  1.4952304 ]]
<NDArray 4x2 @cpu(0)> 
[ 6.5313764   0.44909072  4.9427285  -1.7898526 ]
<NDArray 4 @cpu(0)>

In [15]:
# With shuffle=True, each new iteration over the data yields different batches
for i, (data, label) in enumerate(train_data):
    print(data, label)
    break


[[-0.38888022 -0.5888279 ]
 [-0.04182246  0.17726438]
 [ 0.34541106 -0.33166552]
 [ 1.3457807   0.4406432 ]]
<NDArray 4x2 @cpu(0)> 
[5.398409  3.646068  6.02336   5.6694674]
<NDArray 4 @cpu(0)>

In [16]:
# 10000 samples batched 4 at a time yields 2500 batches
for i, (data, label) in enumerate(train_data):
    pass
print(i + 1)


2500
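
The counting loop is only a demonstration; Gluon's DataLoader also reports the number of batches directly:

print(len(train_data))  # 2500, without iterating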

Defining the model

Defining model parameters


In [17]:
w = mx.nd.random_normal(shape=(num_inputs, num_outputs), 
                        ctx=model_ctx)
b = mx.nd.random_normal(shape=num_outputs, 
                        ctx=model_ctx)
params = [w, b]

In [18]:
w.shape


Out[18]:
(2, 1)

In [19]:
b.shape


Out[19]:
(1,)

In [20]:
# Attaching gradients
for param in params:
    param.attach_grad()
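
attach_grad() allocates a gradient buffer of the same shape as each parameter (initialized to zeros), which autograd fills in during backward(). A quick check:

print(w.grad.shape)  # (2, 1), same shape as w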

In [21]:
# Defining network
def net(X):
    return mx.nd.dot(X, w) + b
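
A shape sanity check (not part of the original run): a batch of 4 examples maps to 4 predictions, one per row:

print(net(X[:4]).shape)  # (4, 1)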

In [22]:
# Defining the loss function
def square_loss(yhat, y):
    return mx.nd.mean((yhat - y) ** 2)
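
This is the mean squared error over the batch. For example, predictions [1, 2] against targets [0, 0] give ((1 - 0)**2 + (2 - 0)**2) / 2 = 2.5:

# Tiny worked example of the loss
print(square_loss(mx.nd.array([1, 2]), mx.nd.array([0, 0])))  # [2.5]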

In [23]:
# Defining Stochastic Gradient Descent
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad
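
The in-place assignment param[:] = ... overwrites the values without replacing the NDArray, so the gradient buffer attached earlier remains valid. As a sanity check on autograd, the gradient of the mean squared loss with respect to w has the closed form 2/N * X^T (Xw + b - y); a minimal sketch comparing it with w.grad, assuming the shapes defined above:

Xb = X[:4]
yb = y[:4].reshape((-1, 1))
with mx.autograd.record():
    loss = square_loss(net(Xb), yb)
loss.backward()
# Analytic gradient of the mean squared loss w.r.t. w
analytic = 2.0 / 4 * mx.nd.dot(Xb.T, mx.nd.dot(Xb, w) + b - yb)
print(w.grad - analytic)  # should be numerically zero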

Training


In [24]:
# Defining training parameters
epochs = 10
learning_rate = .0001

In [25]:
num_batches = num_examples / batch_size
num_batches


Out[25]:
2500.0

In [26]:
for e in range(epochs):
    cumulative_loss = 0
    # Batch training
    for i, (data, label) in tqdm(enumerate(train_data), ascii=True):
        data = data.as_in_context(model_ctx)
        # Reshape labels to (batch_size, 1) to match the network's output
        label = label.as_in_context(model_ctx).reshape((-1, 1))
        with mx.autograd.record():
            output = net(data)
            loss = square_loss(output, label)
        loss.backward()
        # Applying the parameter update
        SGD(params, learning_rate)
        cumulative_loss += loss.asscalar()
    print('Epoch: {}'.format(e))
    print(cumulative_loss / num_batches)


2500it [00:04, 575.26it/s]
Epoch: 0
17.787389125978947
2500it [00:04, 565.95it/s]
Epoch: 1
6.528129469433427
2500it [00:04, 564.98it/s]
Epoch: 2
2.399974269490689
2500it [00:04, 507.98it/s]
Epoch: 3
0.8864629945185035
2500it [00:04, 524.35it/s]
Epoch: 4
0.3314312654912472
2500it [00:04, 537.91it/s]
Epoch: 5
0.1278467336665839
2500it [00:04, 557.15it/s]
Epoch: 6
0.05314170874669216
2500it [00:04, 500.69it/s]
Epoch: 7
0.025746813264163212
2500it [00:04, 532.79it/s]
Epoch: 8
0.01570879513991531
2500it [00:04, 519.32it/s]
Epoch: 9
0.012016978109814226
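
The loss plateaus near 0.012 rather than zero. That is expected: the targets carry additive noise with standard deviation 0.1, so the best achievable mean squared loss is the noise variance, 0.1 ** 2 = 0.01, and the model is essentially at that floor.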


In [27]:
print('True values:')
print(w1_true)
print(w2_true)
print(b_true)


True values:
2
-3.4
4.2

In [28]:
w1_predicted = params[0][0]
w2_predicted = params[0][1]
b_predicted = params[1][0]

In [29]:
print('Predicted values:')
print(w1_predicted.asscalar())
print(w2_predicted.asscalar())
print(b_predicted.asscalar())


Predicted values:
1.9835123
-3.3783004
4.1766834
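
matplotlib was imported at the top but never used; a closing sketch (not part of the original run) visualizes how closely the predictions track the targets:

# Predicted vs. true targets on a 200-example slice
yhat = net(X[:200]).asnumpy().flatten()
plt.scatter(y[:200].asnumpy(), yhat, s=5)
plt.xlabel('true y')
plt.ylabel('predicted y')
plt.title('Learned fit vs. targets')
plt.show()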