Outline

  1. Generate data that is not linearly separable
  2. Train a sigmoid neuron (SN) and evaluate its performance
  3. Write our first feedforward (FF) network from scratch
  4. Train the FF network on the data and compare with the SN
  5. Write a generic class for a FF network
  6. Train the generic class on binary classification
  7. Generate data for multi-class classification
  8. Train a FF network on the data from step 7
  9. Use softmax as the output layer and the cross-entropy loss function
  10. Train the network with softmax and cross-entropy for multi-class classification
  11. Exercises on other datasets

Setup


In [0]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss
from tqdm import tqdm_notebook

from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_blobs

In [0]:
class SigmoidNeuron:
    
  def __init__(self):
    self.w = None
    self.b = None
    
  def perceptron(self, x):
    return np.dot(x, self.w.T) + self.b
  
  def sigmoid(self, x):
    return 1.0/(1.0 + np.exp(-x))
  
  def grad_w_mse(self, x, y):
    y_pred = self.sigmoid(self.perceptron(x))
    return (y_pred - y) * y_pred * (1 - y_pred) * x
  
  def grad_b_mse(self, x, y):
    y_pred = self.sigmoid(self.perceptron(x))
    return (y_pred - y) * y_pred * (1 - y_pred)
  
  def grad_w_ce(self, x, y):
    y_pred = self.sigmoid(self.perceptron(x))
    if y == 0:
      return y_pred * x
    elif y == 1:
      return -1 * (1 - y_pred) * x
    else:
      raise ValueError("y should be 0 or 1")
    
  def grad_b_ce(self, x, y):
    y_pred = self.sigmoid(self.perceptron(x))
    if y == 0:
      return y_pred 
    elif y == 1:
      return -1 * (1 - y_pred)
    else:
      raise ValueError("y should be 0 or 1")
  
  def fit(self, X, Y, epochs=1, learning_rate=1, initialise=True, loss_fn="mse", display_loss=False):
    
    # initialise w, b
    if initialise:
      self.w = np.random.randn(1, X.shape[1])
      self.b = 0
      
    if display_loss:
      loss = {}
    
    for i in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
      dw = 0
      db = 0
      for x, y in zip(X, Y):
        if loss_fn == "mse":
          dw += self.grad_w_mse(x, y)
          db += self.grad_b_mse(x, y) 
        elif loss_fn == "ce":
          dw += self.grad_w_ce(x, y)
          db += self.grad_b_ce(x, y)
          
      m = X.shape[1]  # nb: this is the feature count, not the sample count; a constant factor effectively folded into the learning rate
      self.w -= learning_rate * dw/m
      self.b -= learning_rate * db/m
      
      if display_loss:
        Y_pred = self.sigmoid(self.perceptron(X))
        if loss_fn == "mse":
          loss[i] = mean_squared_error(Y, Y_pred)
        elif loss_fn == "ce":
          loss[i] = log_loss(Y, Y_pred)
    
    if display_loss:
      plt.plot(loss.values())
      plt.xlabel('Epochs')
      if loss_fn == "mse":
        plt.ylabel('Mean Squared Error')
      elif loss_fn == "ce":
        plt.ylabel('Log Loss')
      plt.show()
      
  def predict(self, X):
    Y_pred = []
    for x in X:
      y_pred = self.sigmoid(self.perceptron(x))
      Y_pred.append(y_pred)
    return np.array(Y_pred)
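
A quick sanity check (a minimal sketch; everything below, including the `sn_check` and `x0` names, is our addition, not part of the class): compare `grad_w_mse` against a finite-difference estimate of the half-MSE loss on a single example.

In [0]:
sn_check = SigmoidNeuron()
sn_check.w = np.random.randn(1, 2)
sn_check.b = 0.0
x0, y0 = np.array([0.5, -1.2]), 1
eps = 1e-6

# finite-difference estimate of dL/dw, with L = (y_pred - y)^2 / 2,
# which is the loss whose gradient grad_w_mse implements
num_grad = np.zeros_like(sn_check.w)
for j in range(sn_check.w.shape[1]):
  w_orig = sn_check.w[0, j]
  sn_check.w[0, j] = w_orig + eps
  lp = (sn_check.sigmoid(sn_check.perceptron(x0)) - y0)**2 / 2
  sn_check.w[0, j] = w_orig - eps
  lm = (sn_check.sigmoid(sn_check.perceptron(x0)) - y0)**2 / 2
  sn_check.w[0, j] = w_orig
  num_grad[0, j] = (lp - lm).item() / (2*eps)

print(num_grad)                     # numerical gradient
print(sn_check.grad_w_mse(x0, y0))  # analytical gradient -- should match closely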

In [0]:
my_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["red","yellow","green"])

In [0]:
np.random.seed(0)

Generate data


In [0]:
data, labels = make_blobs(n_samples=1000, centers=4, n_features=2, random_state=0)
print(data.shape, labels.shape)

In [0]:
plt.scatter(data[:,0], data[:,1], c=labels, cmap=my_cmap)
plt.show()

In [0]:
labels_orig = labels
labels = np.mod(labels_orig, 2)  # collapse the 4 blobs into 2 classes; the result is not linearly separable

In [0]:
plt.scatter(data[:,0], data[:,1], c=labels, cmap=my_cmap)
plt.show()

In [0]:
X_train, X_val, Y_train, Y_val = train_test_split(data, labels, stratify=labels, random_state=0)
print(X_train.shape, X_val.shape)

Sigmoid Neuron (SN) classification


In [0]:
sn = SigmoidNeuron()
sn.fit(X_train, Y_train, epochs=1000, learning_rate=0.5, display_loss=True)

In [0]:
Y_pred_train = sn.predict(X_train)
Y_pred_binarised_train = (Y_pred_train >= 0.5).astype("int").ravel()
Y_pred_val = sn.predict(X_val)
Y_pred_binarised_val = (Y_pred_val >= 0.5).astype("int").ravel()
accuracy_train = accuracy_score(Y_pred_binarised_train, Y_train)
accuracy_val = accuracy_score(Y_pred_binarised_val, Y_val)

print("Training accuracy", round(accuracy_train, 2))
print("Validation accuracy", round(accuracy_val, 2))

In [0]:
plt.scatter(X_train[:,0], X_train[:,1], c=Y_pred_binarised_train, cmap=my_cmap, s=15*(np.abs(Y_pred_binarised_train-Y_train)+.2))
plt.show()
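
To see why a single sigmoid neuron struggles here, we can shade its decision regions over a mesh (a minimal sketch; the grid resolution and padding are arbitrary choices of ours). The boundary is necessarily a straight line, which cannot separate these interleaved blobs.

In [0]:
xx, yy = np.meshgrid(np.linspace(X_train[:,0].min()-1, X_train[:,0].max()+1, 200),
                     np.linspace(X_train[:,1].min()-1, X_train[:,1].max()+1, 200))
zz = sn.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, (zz >= 0.5).astype(int), alpha=0.2, cmap=my_cmap)
plt.scatter(X_train[:,0], X_train[:,1], c=Y_train, cmap=my_cmap, s=15)
plt.show()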


Our First FF Network


In [0]:
class FirstFFNetwork:

  def __init__(self):
    # 2 inputs -> 2 hidden sigmoid units -> 1 sigmoid output
    self.w1 = np.ones((2, 2))
    self.w2 = np.ones((2, 1))
    self.b1 = np.zeros((1, 2))
    self.b2 = 0

  def sigmoid(self, x):
    return 1.0/(1.0 + np.exp(-x))

  def grad_sigmoid(self, x):
    # derivative of the sigmoid, written in terms of its output
    return x*(1-x)

  def forward_pass(self, x):
    self.a1 = np.matmul(x, self.w1) + self.b1        # (n,2) x (2,2) -> (n,2)
    self.h1 = self.sigmoid(self.a1)
    self.a2 = np.matmul(self.h1, self.w2) + self.b2  # (n,2) x (2,1) -> (n,1)
    self.h2 = self.sigmoid(self.a2)
    return self.h2

  def grad(self, x, y):
    self.forward_pass(x)
    y = y.reshape(-1, 1)

    # output layer: per-example gradients
    self.dA2 = (self.h2 - y) * self.grad_sigmoid(self.h2)                   # (n,1)
    self.dW2 = self.dA2 * self.h1                                           # (n,2)
    self.dB2 = self.dA2                                                     # (n,1)

    # hidden layer: backpropagate through w2 and the hidden sigmoid
    self.dA1 = np.matmul(self.dA2, self.w2.T) * self.grad_sigmoid(self.h1)  # (n,2)
    self.dW1 = np.hstack((x[:, 0:1] * self.dA1, x[:, 1:2] * self.dA1))      # (n,4)
    self.dB1 = self.dA1                                                     # (n,2)

  def fit(self, X, Y, epochs=1, learning_rate=1, initialise=True, display_loss=False):

    # initialise w, b
    if initialise:
      self.w1 = np.ones((2, 2))
      self.w2 = np.ones((2, 1))
      self.b1 = np.zeros((1, 2))
      self.b2 = 0

    if display_loss:
      loss = {}

    for i in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
      self.grad(X, Y)

      # average the per-example gradients and reshape to the parameter shapes
      avg = np.mean(self.dW1, axis=0)                       # (4,)
      dw1 = np.array([[avg[0], avg[1]], [avg[2], avg[3]]])  # (2,2)
      dw2 = np.mean(self.dW2, axis=0).reshape(2, 1)
      db1 = np.mean(self.dB1, axis=0).reshape(1, 2)
      db2 = np.mean(self.dB2)

      self.w1 -= learning_rate * dw1
      self.w2 -= learning_rate * dw2
      self.b1 -= learning_rate * db1
      self.b2 -= learning_rate * db2

      if display_loss:
        Y_pred = self.predict(X)
        loss[i] = mean_squared_error(Y_pred, Y)

    if display_loss:
      plt.plot(loss.values())
      plt.xlabel('Epochs')
      plt.ylabel('Mean Squared Error')
      plt.show()

  def predict(self, X):
    return self.forward_pass(X)

FF Network Classification


In [0]:
ffn = FirstFFNetwork()
ffn.fit(X_train, Y_train, epochs=2000, learning_rate=.01, display_loss=True)

In [0]:
Y_pred_train = ffn.predict(X_train)
Y_pred_binarised_train = (Y_pred_train >= 0.5).astype("int").ravel()
Y_pred_val = ffn.predict(X_val)
Y_pred_binarised_val = (Y_pred_val >= 0.5).astype("int").ravel()
accuracy_train = accuracy_score(Y_pred_binarised_train, Y_train)
accuracy_val = accuracy_score(Y_pred_binarised_val, Y_val)

print("Training accuracy", round(accuracy_train, 2))
print("Validation accuracy", round(accuracy_val, 2))

In [0]:
plt.scatter(X_train[:,0], X_train[:,1], c=Y_pred_binarised_train, cmap=my_cmap, s=15*(np.abs(Y_pred_binarised_train-Y_train)+.2))
plt.show()

Feed Forward Network - Generic Class


In [0]:
class FFSNNetwork:
  
  def __init__(self, n_inputs, hidden_sizes=[2]):
    self.nx = n_inputs
    self.ny = 1
    self.nh = len(hidden_sizes)
    self.sizes = [self.nx] + hidden_sizes + [self.ny]
    
    self.W = {}
    self.B = {}
    for i in range(self.nh+1):
      self.W[i+1] = np.random.randn(self.sizes[i], self.sizes[i+1])
      self.B[i+1] = np.zeros((1, self.sizes[i+1]))
  
  def sigmoid(self, x):
    return 1.0/(1.0 + np.exp(-x))
  
  def forward_pass(self, x):
    self.A = {}
    self.H = {}
    self.H[0] = x.reshape(1, -1)
    for i in range(self.nh+1):
      self.A[i+1] = np.matmul(self.H[i], self.W[i+1]) + self.B[i+1]
      self.H[i+1] = self.sigmoid(self.A[i+1])
    return self.H[self.nh+1]
  
  def grad_sigmoid(self, x):
    return x*(1-x) 
    
  def grad(self, x, y):
    self.forward_pass(x)
    self.dW = {}
    self.dB = {}
    self.dH = {}
    self.dA = {}
    L = self.nh + 1
    self.dA[L] = (self.H[L] - y)
    for k in range(L, 0, -1):
      self.dW[k] = np.matmul(self.H[k-1].T, self.dA[k])
      self.dB[k] = self.dA[k]
      self.dH[k-1] = np.matmul(self.dA[k], self.W[k].T)
      self.dA[k-1] = np.multiply(self.dH[k-1], self.grad_sigmoid(self.H[k-1]))
    
  def fit(self, X, Y, epochs=1, learning_rate=1, initialise=True, display_loss=False):
    
    # initialise w, b
    if initialise:
      for i in range(self.nh+1):
        self.W[i+1] = np.random.randn(self.sizes[i], self.sizes[i+1])
        self.B[i+1] = np.zeros((1, self.sizes[i+1]))
      
    if display_loss:
      loss = {}
    
    for e in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
      dW = {}
      dB = {}
      for i in range(self.nh+1):
        dW[i+1] = np.zeros((self.sizes[i], self.sizes[i+1]))
        dB[i+1] = np.zeros((1, self.sizes[i+1]))
      for x, y in zip(X, Y):
        self.grad(x, y)
        for i in range(self.nh+1):
          dW[i+1] += self.dW[i+1]
          dB[i+1] += self.dB[i+1]
        
      m = X.shape[1]  # nb: feature count, not sample count; a constant factor effectively folded into the learning rate
      for i in range(self.nh+1):
        self.W[i+1] -= learning_rate * dW[i+1] / m
        self.B[i+1] -= learning_rate * dB[i+1] / m
      
      if display_loss:
        Y_pred = self.predict(X)
        loss[e] = mean_squared_error(Y_pred, Y)
    
    if display_loss:
      plt.plot(loss.values())
      plt.xlabel('Epochs')
      plt.ylabel('Mean Squared Error')
      plt.show()
      
  def predict(self, X):
    Y_pred = []
    for x in X:
      y_pred = self.forward_pass(x)
      Y_pred.append(y_pred)
    return np.array(Y_pred).squeeze()

In [0]:
ffsnn = FFSNNetwork(2, [2, 3])
ffsnn.fit(X_train, Y_train, epochs=1000, learning_rate=.001, display_loss=True)

In [0]:
Y_pred_train = ffsnn.predict(X_train)
Y_pred_binarised_train = (Y_pred_train >= 0.5).astype("int").ravel()
Y_pred_val = ffsnn.predict(X_val)
Y_pred_binarised_val = (Y_pred_val >= 0.5).astype("int").ravel()
accuracy_train = accuracy_score(Y_pred_binarised_train, Y_train)
accuracy_val = accuracy_score(Y_pred_binarised_val, Y_val)

print("Training accuracy", round(accuracy_train, 2))
print("Validation accuracy", round(accuracy_val, 2))

In [0]:
plt.scatter(X_train[:,0], X_train[:,1], c=Y_pred_binarised_train, cmap=my_cmap, s=15*(np.abs(Y_pred_binarised_train-Y_train)+.2))
plt.show()
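
The same mesh trick used for the sigmoid neuron generalises to any of our models. Here is a small reusable helper (our addition, including the `plot_decision_boundary` name; not part of the original classes) that shades predicted regions for either a binary model or, via argmax, a multi-class one:

In [0]:
def plot_decision_boundary(model, X, Y, binarise=True):
  # evaluate the model on a grid covering the data and shade the predicted class regions
  xx, yy = np.meshgrid(np.linspace(X[:,0].min()-1, X[:,0].max()+1, 200),
                       np.linspace(X[:,1].min()-1, X[:,1].max()+1, 200))
  zz = model.predict(np.c_[xx.ravel(), yy.ravel()])
  zz = (zz >= 0.5).astype(int) if binarise else np.argmax(zz, axis=1)
  plt.contourf(xx, yy, zz.reshape(xx.shape), alpha=0.2, cmap=my_cmap)
  plt.scatter(X[:,0], X[:,1], c=Y, cmap=my_cmap, s=15)
  plt.show()

plot_decision_boundary(ffsnn, X_train, Y_train)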

Multi-class classification


In [0]:
class FFSN_MultiClass:
  
  def __init__(self, n_inputs, n_outputs, hidden_sizes=[3]):
    self.nx = n_inputs
    self.ny = n_outputs
    self.nh = len(hidden_sizes)
    self.sizes = [self.nx] + hidden_sizes + [self.ny] 

    self.W = {}
    self.B = {}
    for i in range(self.nh+1):
      self.W[i+1] = np.random.randn(self.sizes[i], self.sizes[i+1])
      self.B[i+1] = np.zeros((1, self.sizes[i+1]))
      
  def sigmoid(self, x):
    return 1.0/(1.0 + np.exp(-x))
  
  def softmax(self, x):
    exps = np.exp(x - np.max(x))  # shift by the max for numerical stability
    return exps / np.sum(exps)

  def forward_pass(self, x):
    self.A = {}
    self.H = {}
    self.H[0] = x.reshape(1, -1)
    for i in range(self.nh):
      self.A[i+1] = np.matmul(self.H[i], self.W[i+1]) + self.B[i+1]
      self.H[i+1] = self.sigmoid(self.A[i+1])
    self.A[self.nh+1] = np.matmul(self.H[self.nh], self.W[self.nh+1]) + self.B[self.nh+1]
    self.H[self.nh+1] = self.softmax(self.A[self.nh+1])
    return self.H[self.nh+1]
  
  def predict(self, X):
    Y_pred = []
    for x in X:
      y_pred = self.forward_pass(x)
      Y_pred.append(y_pred)
    return np.array(Y_pred).squeeze()
 
  def grad_sigmoid(self, x):
    return x*(1-x) 
  
  def cross_entropy(self, label, pred):
    # assumes one-hot labels: keep only the predicted probability of each true class
    yl = np.multiply(pred, label)
    yl = yl[yl != 0]
    yl = -np.log(yl)
    return np.mean(yl)
 
  def grad(self, x, y):
    self.forward_pass(x)
    self.dW = {}
    self.dB = {}
    self.dH = {}
    self.dA = {}
    L = self.nh + 1
    self.dA[L] = (self.H[L] - y)
    for k in range(L, 0, -1):
      self.dW[k] = np.matmul(self.H[k-1].T, self.dA[k])
      self.dB[k] = self.dA[k]
      self.dH[k-1] = np.matmul(self.dA[k], self.W[k].T)
      self.dA[k-1] = np.multiply(self.dH[k-1], self.grad_sigmoid(self.H[k-1])) 
    
  def fit(self, X, Y, epochs=100, initialize=True, learning_rate=0.01, display_loss=False):
      
    if display_loss:
      loss = {}
      
    if initialize:
      for i in range(self.nh+1):
        self.W[i+1] = np.random.randn(self.sizes[i], self.sizes[i+1])
        self.B[i+1] = np.zeros((1, self.sizes[i+1]))
        
    for epoch in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
      dW = {}
      dB = {}
      for i in range(self.nh+1):
        dW[i+1] = np.zeros((self.sizes[i], self.sizes[i+1]))
        dB[i+1] = np.zeros((1, self.sizes[i+1]))
      for x, y in zip(X, Y):
        self.grad(x, y)
        for i in range(self.nh+1):
          dW[i+1] += self.dW[i+1]
          dB[i+1] += self.dB[i+1]
                  
      m = X.shape[1]  # nb: feature count, not sample count; a constant factor effectively folded into the learning rate
      for i in range(self.nh+1):
        self.W[i+1] -= learning_rate * (dW[i+1]/m)
        self.B[i+1] -= learning_rate * (dB[i+1]/m)
        
      if display_loss:
        Y_pred = self.predict(X) 
        loss[epoch] = self.cross_entropy(Y, Y_pred)
    
    if display_loss:
      plt.plot(loss.values())
      plt.xlabel('Epochs')
      plt.ylabel('CE')
      plt.show()
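
A note on `self.dA[L] = (self.H[L] - y)` in `grad`: with a softmax output and the cross-entropy loss, the derivative of the loss with respect to the output pre-activation collapses to

$$\frac{\partial \mathcal{L}}{\partial a_L} = \hat{y} - y,$$

so the backward pass never needs an explicit softmax Jacobian; the remaining layers backpropagate exactly as in the sigmoid network.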

In [0]:
X_train, X_val, Y_train, Y_val = train_test_split(data, labels_orig, stratify=labels_orig, random_state=0)
print(X_train.shape, X_val.shape, labels_orig.shape)

In [0]:
enc = OneHotEncoder()
# 0 -> (1, 0, 0, 0), 1 -> (0, 1, 0, 0), 2 -> (0, 0, 1, 0), 3 -> (0, 0, 0, 1)
y_OH_train = enc.fit_transform(np.expand_dims(Y_train,1)).toarray()
y_OH_val = enc.transform(np.expand_dims(Y_val,1)).toarray()
print(y_OH_train.shape, y_OH_val.shape)

In [0]:
ffsn_multi = FFSN_MultiClass(2,4,[2,3])
ffsn_multi.fit(X_train,y_OH_train,epochs=2000,learning_rate=.005,display_loss=True)

In [0]:
Y_pred_train = ffsn_multi.predict(X_train)
Y_pred_train = np.argmax(Y_pred_train,1)

Y_pred_val = ffsn_multi.predict(X_val)
Y_pred_val = np.argmax(Y_pred_val,1)

accuracy_train = accuracy_score(Y_pred_train, Y_train)
accuracy_val = accuracy_score(Y_pred_val, Y_val)

print("Training accuracy", round(accuracy_train, 2))
print("Validation accuracy", round(accuracy_val, 2))

In [0]:
plt.scatter(X_train[:,0], X_train[:,1], c=Y_pred_train, cmap=my_cmap, s=15*(np.abs(np.sign(Y_pred_train-Y_train))+.1))
plt.show()
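
With the `plot_decision_boundary` helper from the FF Network section, the learned multi-class regions can be shaded the same way, taking an argmax over the softmax outputs:

In [0]:
plot_decision_boundary(ffsn_multi, X_train, Y_train, binarise=False)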

Exercises


In [0]:
from sklearn.datasets import make_moons, make_circles

In [0]:
data, labels = make_moons(n_samples=1000, random_state=0, noise=0.15)
print(data.shape, labels.shape)

In [0]:
plt.scatter(data[:,0], data[:,1], c=labels, cmap=my_cmap)
plt.show()

In [0]:
data, labels = make_circles(n_samples=1000, random_state=0, noise=0.2, factor=0.3)
print(data.shape, labels.shape)

In [0]:
plt.scatter(data[:,0], data[:,1], c=labels, cmap=my_cmap)
plt.show()
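
As a starting point for the exercises (a minimal sketch; the hidden sizes and learning rate below are untuned guesses of ours), the generic class can be reused directly on either dataset:

In [0]:
X_train, X_val, Y_train, Y_val = train_test_split(data, labels, stratify=labels, random_state=0)
ffsnn_ex = FFSNNetwork(2, [4, 4])
ffsnn_ex.fit(X_train, Y_train, epochs=1000, learning_rate=0.01, display_loss=True)
Y_pred_val = (ffsnn_ex.predict(X_val) >= 0.5).astype(int)
print("Validation accuracy", round(accuracy_score(Y_pred_val, Y_val), 2))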
