In this lab, you will see how adding dropout to your model decreases overfitting.
Import all the libraries that you need for this lab:
In [ ]:
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from matplotlib.colors import ListedColormap
Use this function only for plotting:
In [ ]:
def plot_decision_regions_3class(data_set, model=None):
    """Plot decision regions against the true decision boundary.

    A grid covering the range of ``data_set.x`` (padded by 0.1 on each side)
    is evaluated with ``data_set.fun``; grid points where the function is
    positive form the true class-1 region, whose outline is drawn as a
    contour.

    Args:
        data_set: Object exposing ``x`` (a torch tensor with at least two
            columns) and ``fun`` (a callable taking an ``(n, 2)`` array).
        model: Optional classifier. When given it is switched to evaluation
            mode (and left in that mode) and its predicted class for every
            grid point is drawn as the shaded background. When omitted, the
            true regions are shaded instead.
    """
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#00AAFF'])
    points = data_set.x.numpy()
    step = .02
    x_min, x_max = points[:, 0].min() - 0.1, points[:, 0].max() + 0.1
    y_min, y_max = points[:, 1].min() - 0.1, points[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    grid = np.c_[xx.ravel(), yy.ravel()]

    # Ground truth: 1 where the generating function is positive, else 0.
    true_values = data_set.fun(grid).flatten()
    true_regions = np.zeros(true_values.shape)
    true_regions[true_values > 0] = 1
    true_regions = true_regions.reshape(xx.shape)

    if model is not None:
        # Evaluation mode disables dropout so the regions are deterministic.
        model.eval()
        # Inference only: skip building the autograd graph for the grid.
        with torch.no_grad():
            grid_tensor = torch.tensor(grid, dtype=torch.float32)
            _, yhat = torch.max(model(grid_tensor), 1)
        shaded = yhat.numpy().reshape(xx.shape)
    else:
        shaded = true_regions

    plt.pcolormesh(xx, yy, shaded, cmap=cmap_light)
    plt.contour(xx, yy, true_regions, cmap=plt.cm.Paired)
    plt.title("decision region vs True decision boundary")
    # No legend: nothing plotted here carries a label, so plt.legend() would
    # only emit a "no artists with labels" warning.
Use this function to calculate accuracy:
In [ ]:
def accuracy(model, data_set):
    """Return the fraction of samples in ``data_set`` that ``model`` classifies correctly.

    Args:
        model: Callable mapping ``data_set.x`` to a 2-D tensor of per-class
            scores (one row per sample).
        data_set: Object exposing ``x`` (inputs) and ``y`` (integer class
            labels) as torch tensors.

    Returns:
        The mean of the element-wise prediction/label matches, as a NumPy
        floating-point scalar.

    Note:
        The model's train/eval mode is not changed here; put a model that
        uses dropout into evaluation mode before calling.
    """
    # Inference only: no need to record operations for backpropagation.
    with torch.no_grad():
        yhat = model(data_set.x).argmax(dim=1)
    return (yhat == data_set.y).numpy().mean()
Create a nonlinearly separable dataset:
In [ ]:
from torch.utils.data import Dataset, DataLoader
class Data(Dataset):
    """Two-class, non-linearly separable 2-D dataset with noisy inputs.

    Sample positions are drawn uniformly from the unit square with
    ``np.random``. A sample is labelled 1 when the quadratic function
    ``fun`` is positive at its noise-free position and 0 otherwise;
    Gaussian noise is then added to the stored positions.

    Attributes set by ``__init__``:
        a: 7x1 ``np.matrix`` of polynomial coefficients.
        x: ``FloatTensor`` of shape ``(N_SAMPLES, 2)`` with noisy positions.
        y: ``LongTensor`` of shape ``(N_SAMPLES,)`` with class labels.
        f: Tensor with the noise-free function value of every sample.
        len: Number of samples.
    """

    def __init__(self, N_SAMPLES=1000, noise_std=0.1, train=True):
        """Generate the samples.

        Args:
            N_SAMPLES: Number of samples to draw.
            noise_std: Standard deviation of the Gaussian noise added to
                the sample positions.
            train: When true, a second noise term is added after seeding
                torch's global RNG with 1, and the RNG is then re-seeded
                with 0 (a global side effect).
        """
        # a[0]: bias, a[1:3]: linear terms, a[4]: cross term x0*x1,
        # a[5:7]: squared terms. a[3] is not used by the formula.
        self.a = np.matrix([-1, 1, 2, 1, 1, -3, 1]).T
        # Previously never assigned, which made len(dataset) raise.
        self.len = N_SAMPLES

        clean_x = np.random.rand(N_SAMPLES, 2)
        values = self.fun(clean_x).flatten()
        labels = np.zeros(N_SAMPLES)
        labels[values > 0] = 1

        self.y = torch.from_numpy(labels).type(torch.LongTensor)
        self.f = torch.from_numpy(values)
        self.x = torch.from_numpy(clean_x).type(torch.FloatTensor)
        self.x = self.x + noise_std * torch.randn(self.x.size())
        if train:
            # NOTE(review): training data receives noise twice (above and
            # here); kept as in the original -- confirm this is intended.
            torch.manual_seed(1)
            self.x = self.x + noise_std * torch.randn(self.x.size())
            torch.manual_seed(0)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def plot(self):
        """Plot this dataset's noisy samples and the true decision boundary."""
        # Use this instance rather than a module-level ``data_set`` global.
        points = self.x.numpy()
        step = .02
        x_min, x_max = points[:, 0].min(), points[:, 0].max()
        y_min, y_max = points[:, 1].min(), points[:, 1].max()
        xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                             np.arange(y_min, y_max, step))
        values = self.fun(np.c_[xx.ravel(), yy.ravel()]).flatten()
        regions = np.zeros(values.shape)
        regions[values > 0] = 1
        regions = regions.reshape(xx.shape)

        plt.title('True decision boundary and sample points with noise ')
        plt.plot(self.x[self.y == 0, 0].numpy(), self.x[self.y == 0, 1].numpy(), 'bo', label='y=0')
        plt.plot(self.x[self.y == 1, 0].numpy(), self.x[self.y == 1, 1].numpy(), 'ro', label='y=1')
        plt.contour(xx, yy, regions, cmap=plt.cm.Paired)
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.legend()

    def fun(self, x):
        """Evaluate the generating polynomial at each row of ``x``.

        Args:
            x: Array-like of shape ``(n, 2)``.

        Returns:
            ``np.ndarray`` of shape ``(n, 1)`` holding
            ``a[0] + x @ a[1:3] + x0 * x1 * a[4] + (x * x) @ a[5:7]``.
        """
        # np.matrix makes ``*`` a matrix product, which the formula relies on.
        x = np.matrix(x)
        out = (self.a[0]
               + x * self.a[1:3]
               + np.multiply(x[:, 0], x[:, 1]) * self.a[4]
               + np.multiply(x, x) * self.a[5:7])
        return np.array(out)
Create a dataset object:
In [ ]:
# Build the training set (train defaults to True) and show its noisy
# samples together with the true decision boundary.
data_set=Data(noise_std=0.1)
data_set.plot()
Get some validation data:
In [ ]:
# Seed torch's RNG so the noise added to the validation inputs is repeatable.
# NOTE: the sample positions themselves are drawn with np.random inside Data,
# which this seed does not control.
torch.manual_seed(0)
validation_set=Data(train=False)
Create a custom module with three linear layers. in_size
is the number of input features, n_hidden
is the number of neurons in each hidden layer, and out_size
is the number of output classes. p
is the dropout probability. The default is 0, that is, no dropout.
In [ ]:
class Net(nn.Module):
    """Fully connected classifier with two hidden layers and optional dropout.

    Args:
        in_size: Number of input features.
        n_hidden: Width of both hidden layers.
        out_size: Number of output scores per sample.
        p: Dropout probability applied after each hidden activation
            (0 disables dropout).
    """

    def __init__(self, in_size, n_hidden, out_size, p=0):
        super().__init__()
        # One dropout module is shared by both hidden layers.
        self.drop = nn.Dropout(p=p)
        self.linear1 = nn.Linear(in_size, n_hidden)
        self.linear2 = nn.Linear(n_hidden, n_hidden)
        self.linear3 = nn.Linear(n_hidden, out_size)

    def forward(self, x):
        """Return the raw (pre-softmax) class scores for ``x``."""
        hidden = self.drop(F.relu(self.linear1(x)))
        hidden = self.drop(F.relu(self.linear2(hidden)))
        return self.linear3(hidden)
Create two model objects: model has no dropout, and model_drop has a dropout probability of 0.5:
In [ ]:
# Two networks with the same layout (2 inputs, 300 hidden units, 2 outputs):
# `model` keeps the default p=0 (no dropout), `model_drop` drops with p=0.5.
model=Net(2,300,2)
model_drop=Net(2,300,2,p=0.5)
Set the model using dropout to training mode; this is the default mode, but it's a good practice:
In [ ]:
# Training mode keeps the dropout layers active during the forward pass.
model_drop.train()
Train the model by using the Adam optimizer. See the unit on other optimizers. Use the Cross Entropy Loss:
In [ ]:
# One Adam optimizer per network (same learning rate) and a shared
# cross-entropy criterion that takes raw class scores and integer labels.
optimizer_ofit = torch.optim.Adam(model.parameters(), lr=0.01)
optimizer_drop = torch.optim.Adam(model_drop.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
Initialize a dictionary that stores the training and validation loss for each model:
In [ ]:
# One loss history (a list that grows by one value per iteration) for each
# combination of model and data split.
LOSS = {
    curve: []
    for curve in (
        'training data no dropout',
        'validation data no dropout',
        'training data dropout',
        'validation data dropout',
    )
}
Run 500 iterations of batch gradient descent:
In [ ]:
epochs = 500
for epoch in range(epochs):
    # Make a prediction on the full training set with both models.
    yhat = model(data_set.x)
    yhat_drop = model_drop(data_set.x)
    # Calculate the loss for both models.
    loss = criterion(yhat, data_set.y)
    loss_drop = criterion(yhat_drop, data_set.y)
    # Store the training loss for both models.
    LOSS['training data no dropout'].append(loss.item())
    LOSS['training data dropout'].append(loss_drop.item())
    # Store the validation loss for both models. Dropout must be inactive
    # here, so switch to evaluation mode, and skip gradient tracking because
    # these forward passes are never backpropagated.
    model.eval()
    model_drop.eval()
    with torch.no_grad():
        LOSS['validation data no dropout'].append(
            criterion(model(validation_set.x), validation_set.y).item())
        LOSS['validation data dropout'].append(
            criterion(model_drop(validation_set.x), validation_set.y).item())
    model.train()
    model_drop.train()
    # Clear gradients accumulated by the previous iteration.
    optimizer_ofit.zero_grad()
    optimizer_drop.zero_grad()
    # Backward pass: compute the gradient of each loss with respect to that
    # model's learnable parameters.
    loss.backward()
    loss_drop.backward()
    # The step function on an optimizer updates its parameters.
    optimizer_ofit.step()
    optimizer_drop.step()
Set the model with dropout to evaluation mode:
In [ ]:
# Evaluation mode turns the dropout layers off for the predictions below.
model_drop.eval()
Test the model without dropout on the validation data:
In [ ]:
# Fraction of validation samples the network without dropout classifies correctly.
accuracy(model,validation_set)
Test the model with dropout on the validation data:
In [ ]:
# Fraction of validation samples the network with dropout classifies correctly.
accuracy(model_drop,validation_set)
You see that the model with dropout performs better on the validation data.
Plot the decision boundary and the prediction of the networks in different colors:
In [ ]:
# No model given: shade the true regions of the generating function.
plot_decision_regions_3class(data_set)
In [ ]:
# Regions predicted by the network without dropout, with the true boundary on top.
plot_decision_regions_3class(data_set,model)
In [ ]:
# Regions predicted by the network with dropout, with the true boundary on top.
plot_decision_regions_3class(data_set,model_drop)
You can see that the model using dropout does better at tracking the function that generated the data.
Plot out the loss for the training and validation data on both models:
In [ ]:
# Draw every recorded loss history on a log scale in a single figure.
plt.figure(figsize=(6.1, 10))
for curve_name, history in LOSS.items():
    log_history = np.log(np.array(history))
    plt.plot(log_history, label=curve_name)
plt.legend()
plt.xlabel("iterations")
plt.ylabel("Log of cost or total loss")
You see that the model without dropout performs better on the training data, but it performs worse on the validation data. This suggests overfitting. However, the model using dropout performed better on the validation data, but worse on the training data.
Joseph Santarcangelo has a PhD in Electrical Engineering. His research focused on using machine learning, signal processing, and computer vision to determine how videos impact human cognition.
Other contributors: Michelle Carey, Morvan Youtube channel, Mavis Zhou
Copyright © 2018 cognitiveclass.ai. This notebook and its source code are released under the terms of the MIT License.