In [4]:
# Data
import numpy as np

with open('data/text_data/japan.txt', 'r') as f:
# with open('data/text_data/anna.txt', 'r') as f:

    txt = f.read()

    X = []
    y = []

    char_to_idx = {char: i for i, char in enumerate(set(txt))}
    idx_to_char = {i: char for i, char in enumerate(set(txt))}

    X = np.array([char_to_idx[x] for x in txt])
    y = [char_to_idx[x] for x in txt[1:]]
    y.append(char_to_idx['.'])
    y = np.array(y)

# Looking at the X, y
X.shape, y.shape, X[:10], y[:10]


Out[4]:
((3629,),
 (3629,),
 array([15, 11, 25, 11,  5, 13, 55, 15, 11, 25]),
 array([11, 25, 11,  5, 13, 55, 15, 11, 25, 11]))

In [5]:
# Model or Network
import impl.layer as l

class GRU:
    def __init__(self, D, H, L, char2idx, idx2char):
        self.D = D
        self.H = H
        self.L = L
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        self.losses = {'train':[], 'smooth train':[]}
        
        # Model params
        Z = H + D
        m = dict(
            Wz=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wh=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wy=np.random.randn(H, D) / np.sqrt(H / 2.), 
            bz=np.zeros((1, H)),
            bh=np.zeros((1, H)),
            by=np.zeros((1, D))
        )

        self.model = []
        for _ in range(self.L):
            self.model.append(m)
            
        # Bi-directional
        self.model_l = []
        for _ in range(self.L):
            self.model_l.append(m)            

    def initial_state(self):
        return np.zeros((1, self.H))

    def forward(self, X, h, m):
        Wz, Wh, Wy = m['Wz'], m['Wh'], m['Wy']
        bz, bh, by = m['bz'], m['bh'], m['by']

        X_in = X.copy()
        h_in = h.copy()

        X = np.column_stack((h_in, X_in))

        hz, hz_cache = l.fc_forward(X, Wz, bz)
        hz, hz_sigm_cache = l.sigmoid_forward(hz)
        
        hh, hh_cache = l.fc_forward(X, Wh, bh)
        hh, hh_tanh_cache = l.tanh_forward(hh)

        # h = (1. - hz) * h_old + hz * hh
        # or
        # h = ((1. - hz) * h_in) + (hz * hh)
        # or
        h = h_in + (hz * (hh - h_in))

        y, y_cache = l.fc_forward(h, Wy, by)

        cache = (h_in, hz, hz_cache, hz_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache)

        return y, h, cache

    def backward(self, dy, dh, cache):
        h_in, hz, hz_cache, hz_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache = cache
        
        dh_out = dh.copy()

        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_out

        dh_in1 = (1. - hz) * dh
        
        dhh = hz * dh
        dhh = l.tanh_backward(dhh, hh_tanh_cache)
        dXh, dWh, dbh = l.fc_backward(dhh, hh_cache)

        # dhz = (hh * dh) - (h_in * dh)
        # or
        dhz = (hh - h_in) * dh
        dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
        dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

        dX = dXh + dXz
        dh_in2 = dX[:, :self.H]
        dX_in = dX[:, self.H:]

        dh = dh_in1 + dh_in2
        dX = dX_in

        grad = dict(Wz=dWz, Wh=dWh, Wy=dWy, bz=dbz, bh=dbh, by=dby)
        
        return dX, dh, grad

    def train_forward(self, X_train, h):
        ys, caches = [], []
        h_init = h.copy()
        h = []
        for _ in range(self.L):
            h.append(h_init.copy())
            caches.append([])

        # for X in X_train:
        for t in range(len(X_train)):
            X = X_train[t]
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            X = X_one_hot.reshape(1, -1)
            for layer in range(self.L):
                y, h[layer], cache = self.forward(X, h[layer], self.model[layer])
                caches[layer].append(cache)
                X = y.copy()
            ys.append(y)
            
        return ys, caches

    def train_forward_l(self, X_train, h):
        ys, caches = [], []
        h_init = h.copy()
        h = []
        for _ in range(self.L):
            h.append(h_init.copy())
            caches.append([])

        # for X in X_train:
        for t in reversed(range(len(X_train))):
            X = X_train[t]
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            X = X_one_hot.reshape(1, -1)
            for layer in range(self.L):
                y, h[layer], cache = self.forward(X, h[layer], self.model_l[layer])
                caches[layer].append(cache)
                X = y.copy()
            ys.append(y)
            
        return ys, caches

    def cross_entropy(self, y_pred, y_train):
        m = y_pred.shape[0]

        prob = l.softmax(y_pred)
        log_like = -np.log(prob[range(m), y_train])
        data_loss = np.sum(log_like) / m

        return data_loss

    def dcross_entropy(self, y_pred, y_train):
        m = y_pred.shape[0]

        grad_y = l.softmax(y_pred)
        grad_y[range(m), y_train] -= 1.0
        grad_y /= m

        return grad_y
    
    def loss_function(self, y_train, ys):
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            loss += self.cross_entropy(y_pred, y)
            dy = self.dcross_entropy(y_pred, y)
            dys.append(dy)
            
        return loss, dys
    
    def train_backward(self, dys, caches):
        dh, grad, grads = [], [], []
        for layer in range(self.L):
            dh.append(np.zeros((1, self.H)))
            grad.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
            grads.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
        
        dXs = []
        for t in reversed(range(len(dys))):
            dy = dys[t]
            for layer in reversed(range(self.L)):
                dX, dh[layer], grad[layer] = self.backward(dy, dh[layer], caches[layer][t])
                for k in grad[layer].keys():
                    grads[layer][k] += grad[layer][k]
                dy = dX.copy()
            dXs.append(dX)
                
        return dXs, grads
    
    def train_backward_l(self, dys, caches):
        dh, grad, grads = [], [], []
        for layer in range(self.L):
            dh.append(np.zeros((1, self.H)))
            grad.append({key: np.zeros_like(val) for key, val in self.model_l[layer].items()})
            grads.append({key: np.zeros_like(val) for key, val in self.model_l[layer].items()})
        
        dXs = []
        for t in range(len(dys)):
            dy = dys[t]
            for layer in reversed(range(self.L)):
                dX, dh[layer], grad[layer] = self.backward(dy, dh[layer], caches[layer][t])
                for k in grad[layer].keys():
                    grads[layer][k] += grad[layer][k]
                dy = dX.copy()
            dXs.append(dX)
                
        return dXs, grads

#     def test(self, X_seed, h, size):
#         chars = [self.idx2char[X_seed]]
#         idx_list = list(range(self.vocab_size))
#         X = X_seed
        
#         h_init = h.copy()
#         h = []
#         for _ in range(self.L):
#             h.append(h_init.copy())

#         for _ in range(size):
#             # Right-directional
#             X_one_hot = np.zeros(self.D)
#             X_one_hot[X] = 1.
#             X = X_one_hot.reshape(1, -1)
#             for layer in range(self.L):
#                 y, h[layer], _ = self.forward(X, h[layer], self.model[layer])
#                 X = y.copy()
#             # Left-directional
#             X_one_hot = np.zeros(self.D)
#             X_one_hot[X] = 1.
#             X = X_one_hot.reshape(1, -1)
#             for layer in range(self.L):
#                 y, h[layer], _ = self.forward(X, h[layer], self.model[layer])
#                 X = y.copy()

#             prob = l.softmax(y)
#             idx = np.random.choice(idx_list, p=prob.ravel())
#             chars.append(self.idx2char[idx])
#             X = idx

#         return ''.join(chars)

In [6]:
def get_minibatch(X, y, minibatch_size, shuffle):
    minibatches = []

    #for i in range(0, X.shape[0], minibatch_size):
    for i in range(0, X.shape[0] - minibatch_size + 1, 1):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]
        minibatches.append((X_mini, y_mini))

    return minibatches

def adam_rnn(nn, X_train, y_train, alpha, mb_size, n_iter, print_after):

    M, R = [], []
    for layer in range(nn.L):
        M.append({k: np.zeros_like(v) for k, v in nn.model[layer].items()})
        R.append({k: np.zeros_like(v) for k, v in nn.model[layer].items()})

    M_l, R_l = [], []
    for layer in range(nn.L):
        M_l.append({k: np.zeros_like(v) for k, v in nn.model_l[layer].items()})
        R_l.append({k: np.zeros_like(v) for k, v in nn.model_l[layer].items()})
        
    beta1 = .99
    beta2 = .999
    state = nn.initial_state()
    smooth_loss = 1.
    eps = 1e-8
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)
    
    for iter in range(1, n_iter + 1):
        for idx in range(len(minibatches)):
            X_mini, y_mini = minibatches[idx]
            ys, caches = nn.train_forward(X_mini, state)
            ys_l, caches_l = nn.train_forward_l(X_mini, state)
            ys += ys_l
            loss, dys = nn.loss_function(y_mini, ys)
            _, grads = nn.train_backward(dys, caches)
            _, grads_l = nn.train_backward_l(dys, caches_l)
            nn.losses['train'].append(loss)
            smooth_loss = (0.999 * smooth_loss) + (0.001 * loss)
            nn.losses['smooth train'].append(smooth_loss)

            for layer in range(nn.L):
                for k in grads[layer].keys(): #key, value: items
                    M[layer][k] = l.exp_running_avg(M[layer][k], grads[layer][k], beta1)
                    R[layer][k] = l.exp_running_avg(R[layer][k], grads[layer][k]**2, beta2)

                    m_k_hat = M[layer][k] / (1. - (beta1**(iter)))
                    r_k_hat = R[layer][k] / (1. - (beta2**(iter)))

                    nn.model[layer][k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + eps)
                    
                for k in grads_l[layer].keys(): #key, value: items
                    M_l[layer][k] = l.exp_running_avg(M_l[layer][k], grads_l[layer][k], beta1)
                    R_l[layer][k] = l.exp_running_avg(R_l[layer][k], grads_l[layer][k]**2, beta2)

                    m_k_hat = M_l[layer][k] / (1. - (beta1**(iter)))
                    r_k_hat = R_l[layer][k] / (1. - (beta2**(iter)))

                    nn.model_l[layer][k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + eps)

        # Print loss and test sample
        if iter % print_after == 0:
            print('Iter-{} loss: {:.4f}'.format(iter, loss))
#             sample = nn.test(X_mini[0], state, 100)
#             print(sample)

    return nn

In [7]:
# Hyper-parameters
time_step = 100 # width, minibatch size and test sample size as well
num_layers = 2 # depth
n_iter = 300 # epochs
alpha = 1e-4 # learning_rate
print_after = 1 # n_iter//10 # print training loss, valid, and test
num_hidden_units = 64 # num_hidden_units in hidden layer
num_input_units = len(char_to_idx) # vocab_size = len(char_to_idx)

# Build the network and learning it or optimizing it using SGD
net = GRU(D=num_input_units, H=num_hidden_units, L=num_layers, char2idx=char_to_idx, idx2char=idx_to_char)

# Start learning using BP-SGD-ADAM
adam_rnn(nn=net, X_train=X, y_train=y, alpha=alpha, mb_size=time_step, n_iter=n_iter, print_after=print_after)

# # Display the learning curve and losses for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
plt.plot(net.losses['smooth train'], label='Train smooth loss')
plt.legend()
plt.show()


Iter-1 loss: 353.3016
Iter-2 loss: 346.1233
Iter-3 loss: 343.6538
Iter-4 loss: 343.8252
Iter-5 loss: 337.7312
Iter-6 loss: 325.1908
Iter-7 loss: 294.6706
Iter-8 loss: 273.4859
Iter-9 loss: 258.0155
Iter-10 loss: 250.8854
Iter-11 loss: 240.0031
Iter-12 loss: 235.9300
Iter-13 loss: 235.3071
Iter-14 loss: 230.2372
Iter-15 loss: 229.2688
Iter-16 loss: 233.9227
Iter-17 loss: 229.5201
Iter-18 loss: 220.9576
Iter-19 loss: 221.1028
Iter-20 loss: 220.0469
Iter-21 loss: 226.1762
Iter-22 loss: 213.5906
Iter-23 loss: 216.6812
Iter-24 loss: 219.3163
Iter-25 loss: 215.1100
Iter-26 loss: 208.3085
Iter-27 loss: 212.3584
Iter-28 loss: 201.7661
Iter-29 loss: 194.1194
Iter-30 loss: 209.5096
Iter-31 loss: 199.0491
Iter-32 loss: 202.8826
Iter-33 loss: 199.8574
Iter-34 loss: 203.7821
Iter-35 loss: 201.6214
Iter-36 loss: 194.1163
Iter-37 loss: 196.2859
Iter-38 loss: 197.3501
Iter-39 loss: 191.2670
Iter-40 loss: 188.0067
Iter-41 loss: 181.8373
Iter-42 loss: 194.6279
Iter-43 loss: 192.0386
Iter-44 loss: 183.5653
Iter-45 loss: 181.2330
Iter-46 loss: 182.7158
Iter-47 loss: 180.6051
Iter-48 loss: 177.6872
Iter-49 loss: 182.4917
Iter-50 loss: 178.1083
Iter-51 loss: 190.0309
Iter-52 loss: 173.4698
Iter-53 loss: 171.1774
Iter-54 loss: 178.2823
Iter-55 loss: 175.9313
Iter-56 loss: 196.3182
Iter-57 loss: 180.3250
Iter-58 loss: 173.8164
Iter-59 loss: 172.3976
Iter-60 loss: 168.4015
Iter-61 loss: 180.5888
Iter-62 loss: 182.8592
Iter-63 loss: 172.5609
Iter-64 loss: 165.7028
Iter-65 loss: 158.0435
Iter-66 loss: 172.3257
Iter-67 loss: 165.5859
Iter-68 loss: 171.6120
Iter-69 loss: 164.4085
Iter-70 loss: 176.1861
Iter-71 loss: 178.4984
Iter-72 loss: 170.0545
Iter-73 loss: 163.3255
Iter-74 loss: 176.8796
Iter-75 loss: 173.5171
Iter-76 loss: 169.0784
Iter-77 loss: 173.2475
Iter-78 loss: 175.9579
Iter-79 loss: 168.8534
Iter-80 loss: 162.7758
Iter-81 loss: 182.9856
Iter-82 loss: 173.5999
Iter-83 loss: 179.3330
Iter-84 loss: 168.7299
Iter-85 loss: 163.7274
Iter-86 loss: 164.0523
Iter-87 loss: 185.1531
Iter-88 loss: 174.0803
Iter-89 loss: 157.0627
Iter-90 loss: 183.7141
Iter-91 loss: 162.2491
Iter-92 loss: 167.3686
Iter-93 loss: 161.8193
Iter-94 loss: 152.3722
Iter-95 loss: 166.9908
Iter-96 loss: 157.5951
Iter-97 loss: 160.3770
Iter-98 loss: 151.0114
Iter-99 loss: 152.3443
Iter-100 loss: 149.8383
Iter-101 loss: 156.0515
Iter-102 loss: 159.4643
Iter-103 loss: 147.7329
Iter-104 loss: 146.5643
Iter-105 loss: 148.2674
Iter-106 loss: 149.1379
Iter-107 loss: 148.4481
Iter-108 loss: 125.9320
Iter-109 loss: 129.8492
Iter-110 loss: 136.5517
Iter-111 loss: 134.3482
Iter-112 loss: 132.0252
Iter-113 loss: 132.5277
Iter-114 loss: 134.4361
Iter-115 loss: 136.3939
Iter-116 loss: 126.3932
Iter-117 loss: 132.0287
Iter-118 loss: 138.6128
Iter-119 loss: 126.0082
Iter-120 loss: 122.0597
Iter-121 loss: 122.7083
Iter-122 loss: 124.2829
Iter-123 loss: 114.0778
Iter-124 loss: 125.5891
Iter-125 loss: 122.3282
Iter-126 loss: 114.2301
Iter-127 loss: 121.7364
Iter-128 loss: 110.7098
Iter-129 loss: 111.8749
Iter-130 loss: 117.1829
Iter-131 loss: 113.2632
Iter-132 loss: 106.0025
Iter-133 loss: 106.6988
Iter-134 loss: 113.1010
Iter-135 loss: 113.6731
Iter-136 loss: 107.2420
Iter-137 loss: 110.2328
Iter-138 loss: 102.7878
Iter-139 loss: 107.2732
Iter-140 loss: 113.5786
Iter-141 loss: 105.8822
Iter-142 loss: 107.1561
Iter-143 loss: 108.3402
Iter-144 loss: 112.2813
Iter-145 loss: 103.0860
Iter-146 loss: 107.8940
Iter-147 loss: 102.1124
Iter-148 loss: 102.1365
Iter-149 loss: 97.0196
Iter-150 loss: 93.9733
Iter-151 loss: 96.4864
Iter-152 loss: 100.9532
Iter-153 loss: 107.0472
Iter-154 loss: 105.0973
Iter-155 loss: 94.3813
Iter-156 loss: 102.0551
Iter-157 loss: 96.4782
Iter-158 loss: 96.1996
Iter-159 loss: 95.6849
Iter-160 loss: 99.5164
Iter-161 loss: 97.2832
Iter-162 loss: 107.1022
Iter-163 loss: 94.1813
Iter-164 loss: 89.7298
Iter-165 loss: 84.7218
Iter-166 loss: 84.1860
Iter-167 loss: 88.2822
Iter-168 loss: 85.2007
Iter-169 loss: 82.2647
Iter-170 loss: 88.2274
Iter-171 loss: 87.1860
Iter-172 loss: 85.1327
Iter-173 loss: 77.2893
Iter-174 loss: 74.1302
Iter-175 loss: 76.7236
Iter-176 loss: 71.0902
Iter-177 loss: 84.5028
Iter-178 loss: 81.4938
Iter-179 loss: 71.4998
Iter-180 loss: 73.7193
Iter-181 loss: 77.0808
Iter-182 loss: 71.4167
Iter-183 loss: 66.9473
Iter-184 loss: 72.0970
Iter-185 loss: 78.6943
Iter-186 loss: 74.4889
Iter-187 loss: 74.9685
Iter-188 loss: 65.7573
Iter-189 loss: 70.1994
Iter-190 loss: 62.1004
Iter-191 loss: 75.2544
Iter-192 loss: 70.2767
Iter-193 loss: 64.9774
Iter-194 loss: 85.8548
Iter-195 loss: 83.1590
Iter-196 loss: 76.8509
Iter-197 loss: 74.1899
Iter-198 loss: 66.3714
Iter-199 loss: 65.4168
Iter-200 loss: 68.6006
Iter-201 loss: 66.1362
Iter-202 loss: 72.3569
Iter-203 loss: 60.2538
Iter-204 loss: 70.2628
Iter-205 loss: 61.3208
Iter-206 loss: 64.9461
Iter-207 loss: 61.2936
Iter-208 loss: 71.3327
Iter-209 loss: 75.9303
Iter-210 loss: 66.9265
Iter-211 loss: 64.4869
Iter-212 loss: 65.2586
Iter-213 loss: 68.1932
Iter-214 loss: 62.0212
Iter-215 loss: 69.4382
Iter-216 loss: 72.7645
Iter-217 loss: 65.7376
Iter-218 loss: 66.7134
Iter-219 loss: 62.3150
Iter-220 loss: 61.2449
Iter-221 loss: 71.2433
Iter-222 loss: 66.6124
Iter-223 loss: 66.3245
Iter-224 loss: 60.8770
Iter-225 loss: 59.7522
Iter-226 loss: 58.5245
Iter-227 loss: 73.9337
Iter-228 loss: 75.2988
Iter-229 loss: 73.2725
Iter-230 loss: 68.6287
Iter-231 loss: 73.7499
Iter-232 loss: 62.8285
Iter-233 loss: 64.1730
Iter-234 loss: 72.3561
Iter-235 loss: 76.0326
Iter-236 loss: 74.4939
Iter-237 loss: 66.6839
Iter-238 loss: 67.4206
Iter-239 loss: 70.7217
Iter-240 loss: 65.3442
Iter-241 loss: 65.5043
Iter-242 loss: 70.9098
Iter-243 loss: 73.5998
Iter-244 loss: 70.9071
Iter-245 loss: 78.5548
Iter-246 loss: 64.9387
Iter-247 loss: 68.9665
Iter-248 loss: 58.9071
Iter-249 loss: 66.9279
Iter-250 loss: 56.6924
Iter-251 loss: 61.0700
Iter-252 loss: 59.7568
Iter-253 loss: 57.6109
Iter-254 loss: 66.0884
Iter-255 loss: 54.7701
Iter-256 loss: 61.3432
Iter-257 loss: 60.1563
Iter-258 loss: 64.0788
Iter-259 loss: 64.2589
Iter-260 loss: 66.9235
Iter-261 loss: 64.3296
Iter-262 loss: 60.3022
Iter-263 loss: 62.6852
Iter-264 loss: 77.9549
Iter-265 loss: 65.7512
Iter-266 loss: 79.0837
Iter-267 loss: 64.9879
Iter-268 loss: 65.1686
Iter-269 loss: 71.7018
Iter-270 loss: 64.8308
Iter-271 loss: 66.7343
Iter-272 loss: 67.4261
Iter-273 loss: 74.1856
Iter-274 loss: 81.9775
Iter-275 loss: 66.7757
Iter-276 loss: 81.6647
Iter-277 loss: 71.6501
Iter-278 loss: 77.8088
Iter-279 loss: 73.1311
Iter-280 loss: 68.3262
Iter-281 loss: 62.4083
Iter-282 loss: 77.0509
Iter-283 loss: 81.8294
Iter-284 loss: 88.9150
Iter-285 loss: 73.1361
Iter-286 loss: 69.3219
Iter-287 loss: 76.8381
Iter-288 loss: 72.1262
Iter-289 loss: 85.4140
Iter-290 loss: 83.6831
Iter-291 loss: 93.9323
Iter-292 loss: 87.3975
Iter-293 loss: 81.1552
Iter-294 loss: 66.7310
Iter-295 loss: 79.4867
Iter-296 loss: 72.5072
Iter-297 loss: 76.4554
Iter-298 loss: 74.2506
Iter-299 loss: 73.2850
Iter-300 loss: 70.5409

In [5]:
# # Hyper-parameters
# time_step = 100 # width, minibatch size and test sample size as well
# num_layers = 1 # depth
# n_iter = 300 # epochs
# alpha = 1e-4 # learning_rate
# print_after = 1 # n_iter//10 # print training loss, valid, and test
# num_hidden_units = 64 # num_hidden_units in hidden layer
# num_input_units = len(char_to_idx) # vocab_size = len(char_to_idx)

# # Build the network and learning it or optimizing it using SGD
# net = GRU(D=num_input_units, H=num_hidden_units, L=num_layers, char2idx=char_to_idx, idx2char=idx_to_char)

# # Start learning using BP-SGD-ADAM
# adam_rnn(nn=net, X_train=X, y_train=y, alpha=alpha, mb_size=time_step, n_iter=n_iter, print_after=print_after)

# # # Display the learning curve and losses for training, validation, and testing
# # %matplotlib inline
# # %config InlineBackend.figure_format = 'retina'
# import matplotlib.pyplot as plt

# plt.plot(net.losses['train'], label='Train loss')
# plt.plot(net.losses['smooth train'], label='Train smooth loss')
# plt.legend()
# plt.show()


Iter-1 loss: 309.4318
Iter-2 loss: 287.5329
Iter-3 loss: 279.1090
Iter-4 loss: 273.4013
Iter-5 loss: 269.3631
Iter-6 loss: 266.3277
Iter-7 loss: 263.0172
Iter-8 loss: 256.7446
Iter-9 loss: 253.4718
Iter-10 loss: 248.6307
Iter-11 loss: 239.8086
Iter-12 loss: 233.8652
Iter-13 loss: 230.0072
Iter-14 loss: 225.8405
Iter-15 loss: 223.8493
Iter-16 loss: 220.4195
Iter-17 loss: 217.0637
Iter-18 loss: 217.4756
Iter-19 loss: 217.7577
Iter-20 loss: 215.0665
Iter-21 loss: 212.8956
Iter-22 loss: 211.5629
Iter-23 loss: 211.4263
Iter-24 loss: 211.0869
Iter-25 loss: 211.9673
Iter-26 loss: 211.0677
Iter-27 loss: 212.1399
Iter-28 loss: 211.6270
Iter-29 loss: 210.6031
Iter-30 loss: 208.4290
Iter-31 loss: 209.1585
Iter-32 loss: 207.7561
Iter-33 loss: 206.5024
Iter-34 loss: 205.8519
Iter-35 loss: 201.4642
Iter-36 loss: 200.1840
Iter-37 loss: 197.5507
Iter-38 loss: 197.1976
Iter-39 loss: 194.4914
Iter-40 loss: 189.8893
Iter-41 loss: 185.6482
Iter-42 loss: 181.7703
Iter-43 loss: 179.5737
Iter-44 loss: 180.2451
Iter-45 loss: 178.8224
Iter-46 loss: 175.2156
Iter-47 loss: 176.0079
Iter-48 loss: 174.1804
Iter-49 loss: 175.1429
Iter-50 loss: 175.0173
Iter-51 loss: 173.0090
Iter-52 loss: 174.8738
Iter-53 loss: 172.0465
Iter-54 loss: 173.8027
Iter-55 loss: 172.0614
Iter-56 loss: 171.3277
Iter-57 loss: 175.5713
Iter-58 loss: 169.5954
Iter-59 loss: 169.8430
Iter-60 loss: 169.4491
Iter-61 loss: 169.5754
Iter-62 loss: 168.1068
Iter-63 loss: 168.9147
Iter-64 loss: 167.8981
Iter-65 loss: 170.4365
Iter-66 loss: 171.4680
Iter-67 loss: 171.0544
Iter-68 loss: 168.6333
Iter-69 loss: 171.3883
Iter-70 loss: 170.3479
Iter-71 loss: 168.4348
Iter-72 loss: 166.0719
Iter-73 loss: 166.2977
Iter-74 loss: 165.4888
Iter-75 loss: 166.3180
Iter-76 loss: 167.1275
Iter-77 loss: 166.4962
Iter-78 loss: 166.9076
Iter-79 loss: 162.8606
Iter-80 loss: 166.0411
Iter-81 loss: 164.8712
Iter-82 loss: 169.4452
Iter-83 loss: 164.9773
Iter-84 loss: 164.9481
Iter-85 loss: 162.3430
Iter-86 loss: 162.9129
Iter-87 loss: 161.0441
Iter-88 loss: 165.0695
Iter-89 loss: 163.3116
Iter-90 loss: 163.3405
Iter-91 loss: 160.7902
Iter-92 loss: 159.2136
Iter-93 loss: 161.7065
Iter-94 loss: 160.3086
Iter-95 loss: 158.1988
Iter-96 loss: 159.6755
Iter-97 loss: 161.2396
Iter-98 loss: 157.0298
Iter-99 loss: 158.6220
Iter-100 loss: 159.6396
Iter-101 loss: 159.0923
Iter-102 loss: 168.3000
Iter-103 loss: 164.7538
Iter-104 loss: 161.1185
Iter-105 loss: 163.1657
Iter-106 loss: 176.8791
Iter-107 loss: 160.4735
Iter-108 loss: 166.2886
Iter-109 loss: 167.0328
Iter-110 loss: 163.2966
Iter-111 loss: 166.9952
Iter-112 loss: 169.0397
Iter-113 loss: 161.4005
Iter-114 loss: 165.1582
Iter-115 loss: 163.4210
Iter-116 loss: 172.0658
Iter-117 loss: 163.3116
Iter-118 loss: 167.4987
Iter-119 loss: 163.9126
Iter-120 loss: 166.1403
Iter-121 loss: 161.5559
Iter-122 loss: 161.2765
Iter-123 loss: 169.4914
Iter-124 loss: 174.2874
Iter-125 loss: 161.7037
Iter-126 loss: 161.7123
Iter-127 loss: 183.9995
Iter-128 loss: 161.3640
Iter-129 loss: 166.8375
Iter-130 loss: 171.8454
Iter-131 loss: 166.7102
Iter-132 loss: 167.4072
Iter-133 loss: 176.9184
Iter-134 loss: 163.1729
Iter-135 loss: 165.3651
Iter-136 loss: 160.7814
Iter-137 loss: 163.5833
Iter-138 loss: 169.2695
Iter-139 loss: 167.1671
Iter-140 loss: 164.0540
Iter-141 loss: 175.1982
Iter-142 loss: 171.4883
Iter-143 loss: 164.7096
Iter-144 loss: 163.7784
Iter-145 loss: 164.1340
Iter-146 loss: 163.4996
Iter-147 loss: 175.8648
Iter-148 loss: 170.6272
Iter-149 loss: 165.3677
Iter-150 loss: 162.7060
Iter-151 loss: 178.6383
Iter-152 loss: 162.8370
Iter-153 loss: 165.0989
Iter-154 loss: 169.3685
Iter-155 loss: 166.7805
Iter-156 loss: 162.8328
Iter-157 loss: 164.5761
Iter-158 loss: 165.9347
Iter-159 loss: 168.0318
Iter-160 loss: 160.6640
Iter-161 loss: 167.6147
Iter-162 loss: 157.0994
Iter-163 loss: 156.1464
Iter-164 loss: 154.1655
Iter-165 loss: 158.3550
Iter-166 loss: 157.4847
Iter-167 loss: 158.8240
Iter-168 loss: 155.3508
Iter-169 loss: 151.5602
Iter-170 loss: 153.4203
Iter-171 loss: 154.7465
Iter-172 loss: 152.6828
Iter-173 loss: 154.9842
Iter-174 loss: 152.1741
Iter-175 loss: 149.1926
Iter-176 loss: 149.1542
Iter-177 loss: 151.2448
Iter-178 loss: 150.0508
Iter-179 loss: 147.6121
Iter-180 loss: 146.1169
Iter-181 loss: 151.6106
Iter-182 loss: 153.9579
Iter-183 loss: 144.2435
Iter-184 loss: 146.0814
Iter-185 loss: 145.2081
Iter-186 loss: 144.7895
Iter-187 loss: 142.9367
Iter-188 loss: 143.5739
Iter-189 loss: 148.8463
Iter-190 loss: 141.3336
Iter-191 loss: 139.3144
Iter-192 loss: 142.1779
Iter-193 loss: 140.8469
Iter-194 loss: 142.3344
Iter-195 loss: 138.8848
Iter-196 loss: 141.2317
Iter-197 loss: 134.7703
Iter-198 loss: 145.9153
Iter-199 loss: 138.1652
Iter-200 loss: 134.7829
Iter-201 loss: 141.8643
Iter-202 loss: 137.0788
Iter-203 loss: 132.2199
Iter-204 loss: 135.6687
Iter-205 loss: 141.1962
Iter-206 loss: 131.2536
Iter-207 loss: 133.5220
Iter-208 loss: 129.5378
Iter-209 loss: 142.0656
Iter-210 loss: 132.0587
Iter-211 loss: 128.8721
Iter-212 loss: 141.8817
Iter-213 loss: 127.7438
Iter-214 loss: 125.9501
Iter-215 loss: 130.9924
Iter-216 loss: 127.1201
Iter-217 loss: 126.6635
Iter-218 loss: 124.4386
Iter-219 loss: 123.5778
Iter-220 loss: 124.9831
Iter-221 loss: 123.0294
Iter-222 loss: 128.1595
Iter-223 loss: 125.6606
Iter-224 loss: 125.9883
Iter-225 loss: 123.0254
Iter-226 loss: 129.6900
Iter-227 loss: 125.5446
Iter-228 loss: 129.2337
Iter-229 loss: 126.0948
Iter-230 loss: 126.7170
Iter-231 loss: 123.8016
Iter-232 loss: 122.5802
Iter-233 loss: 119.5869
Iter-234 loss: 124.4456
Iter-235 loss: 117.3123
Iter-236 loss: 118.9172
Iter-237 loss: 120.9998
Iter-238 loss: 118.8920
Iter-239 loss: 120.7458
Iter-240 loss: 118.2045
Iter-241 loss: 123.4865
Iter-242 loss: 121.8579
Iter-243 loss: 120.1510
Iter-244 loss: 117.2528
Iter-245 loss: 115.4058
Iter-246 loss: 114.7028
Iter-247 loss: 112.5517
Iter-248 loss: 112.7767
Iter-249 loss: 112.1153
Iter-250 loss: 113.7038
Iter-251 loss: 112.9074
Iter-252 loss: 112.0033
Iter-253 loss: 110.0819
Iter-254 loss: 106.8992
Iter-255 loss: 105.8397
Iter-256 loss: 104.8882
Iter-257 loss: 107.2133
Iter-258 loss: 104.0819
Iter-259 loss: 107.7277
Iter-260 loss: 105.0908
Iter-261 loss: 102.2640
Iter-262 loss: 105.9284
Iter-263 loss: 102.7082
Iter-264 loss: 100.3758
Iter-265 loss: 103.8230
Iter-266 loss: 102.2046
Iter-267 loss: 104.8501
Iter-268 loss: 103.8064
Iter-269 loss: 100.6263
Iter-270 loss: 103.8533
Iter-271 loss: 103.0767
Iter-272 loss: 99.4147
Iter-273 loss: 98.9099
Iter-274 loss: 100.5226
Iter-275 loss: 98.6086
Iter-276 loss: 99.6697
Iter-277 loss: 98.6587
Iter-278 loss: 98.4573
Iter-279 loss: 96.2129
Iter-280 loss: 97.7559
Iter-281 loss: 96.2067
Iter-282 loss: 95.3574
Iter-283 loss: 94.5708
Iter-284 loss: 95.3913
Iter-285 loss: 93.9204
Iter-286 loss: 95.0807
Iter-287 loss: 93.7770
Iter-288 loss: 92.2958
Iter-289 loss: 92.8657
Iter-290 loss: 93.4943
Iter-291 loss: 90.6617
Iter-292 loss: 101.2036
Iter-293 loss: 90.9330
Iter-294 loss: 90.9861
Iter-295 loss: 87.9956
Iter-296 loss: 91.0379
Iter-297 loss: 89.1425
Iter-298 loss: 87.9695
Iter-299 loss: 84.8175
Iter-300 loss: 87.4210

In [ ]: