In [ ]:
# Trying classifier on dessert images
Welcome to the first week of the second deep learning certificate! We're going to use convolutional neural networks (CNNs) to allow our computer to see - something that is only possible thanks to deep learning.
We're going to try to create a model to enter the Dogs vs Cats competition at Kaggle. There are 25,000 labelled dog and cat photos available for training, and 12,500 in the test set that we have to try to label for this competition. According to the Kaggle web-site, when this competition was launched (end of 2013): "State of the art: The current literature suggests machine classifiers can score above 80% accuracy on this task". So if we can beat 80%, then we will be at the cutting edge as of 2013!
In [ ]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline
Here we import the libraries we need. We'll learn about what each does during the course.
In [ ]:
# This file contains all the main external libs we'll use
from fastai.imports import *
In [ ]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from data_science.j_utils import copyanything
In [ ]:
PATH = "data/desserts/"
sz = 224
In [ ]:
import os
def walklevel(some_dir, level=1):
    """Walk ``some_dir`` like ``os.walk`` but stop descending ``level`` levels down.

    Yields the same ``(root, dirs, files)`` triples as ``os.walk``. ``level``
    defaults to 1. Trailing path separators on ``some_dir`` are ignored.
    """
    some_dir = some_dir.rstrip(os.path.sep)
    assert os.path.isdir(some_dir)
    base_depth = some_dir.count(os.path.sep)
    for root, dirs, files in os.walk(some_dir):
        yield root, dirs, files
        depth_below_base = root.count(os.path.sep) - base_depth
        if depth_below_base >= level:
            # Emptying the list in place stops os.walk from recursing further.
            dirs[:] = []
In [ ]:
if not os.path.isdir(PATH):
os.mkdir(PATH)
In [ ]:
inputpath = 'data/dogscats/'
outputpath = 'data/desserts'
# Recreate the first level of the dogscats folder layout under the desserts
# folder; only directories are created, no files are copied.
for dirpath, dirnames, filenames in walklevel(inputpath, level=1):
    structure = os.path.join(outputpath, dirpath[len(inputpath):])
    if os.path.isdir(structure):
        print("Folder already exists!")
        continue
    os.mkdir(structure)
In [ ]:
# used python ~/google_image_finder/image_download.py <class> 500 to download images
# move the dataset in homedir to data dir, and rename
making_dataset = False
if making_dataset:
!cp -R /home/justin/dataset/. data/desserts/train/
!rm -rf /home/justin/dataset
In [ ]:
import os
import imghdr
# print(imghdr.what(f'{PATH}train/cheesecake/cheesecake_0.jpg'))
def get_dirs_dataset(PATH):
    """Return the leaf directories (those with no sub-directories) under PATH.

    Used to find the folders whose images are checked and purged when
    matplotlib cannot load them.
    """
    return [root for root, dirs, _files in os.walk(PATH) if not dirs]
def remove_bad_imgs(need_checking_dirs):
    """Delete files that imghdr cannot identify and matplotlib cannot read.

    Every directory in ``need_checking_dirs`` is walked recursively. A file is
    removed only when ``imghdr.what`` returns a falsy value for it AND
    ``plt.imread`` raises ``OSError``.
    """
    for top in need_checking_dirs:
        for root, _, names in os.walk(top):
            for name in names:
                fpath = os.path.join(root, name)
                if imghdr.what(fpath):
                    continue  # recognised image type, leave it alone
                try:
                    plt.imread(fpath)
                except OSError:
                    os.remove(fpath)
def checked_imgs(need_checking_dirs):
    """Verify that no unreadable images remain in ``need_checking_dirs``.

    Applies the same test as ``remove_bad_imgs`` (``imghdr.what`` is falsy and
    ``plt.imread`` raises ``OSError``) but only reports instead of deleting.

    Returns:
        True when every file passes; None as soon as one bad file is found.
    """
    for dirs in need_checking_dirs:
        for root, _, files in os.walk(dirs):
            for file in files:
                fpath = os.path.join(root, file)
                if imghdr.what(fpath):
                    continue
                try:
                    plt.imread(fpath)
                except OSError:
                    print('There are still bad images somehow!')
                    return None
    print('All images seem good to go!')
    return True
In [ ]:
removed_bad_imgs = True
if not removed_bad_imgs:
remove_bad_imgs(get_dirs_dataset(PATH+'train'))
imgs_checked = checked_imgs(get_dirs_dataset(PATH+'train'))
In [ ]:
from sklearn.model_selection import train_test_split
def get_all_filepaths(PATH):
    """Return the path of every file found recursively under ``PATH + 'train'``.

    ``PATH`` is concatenated as a plain string, so it is expected to end with
    a path separator.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(PATH+'train')
        for name in names
    ]
In [ ]:
import os
import errno
def remove_empty_folders(path, removeRoot=True):
    """Recursively delete empty directories beneath ``path``.

    Sub-directories are always removed when empty. ``path`` itself is removed
    only when it ends up empty and ``removeRoot`` is true. Does nothing when
    ``path`` is not a directory.
    """
    if not os.path.isdir(path):
        return

    # Depth-first: clean the children before deciding about this folder.
    for entry in os.listdir(path):
        child = os.path.join(path, entry)
        if os.path.isdir(child):
            remove_empty_folders(child)

    # Re-list, because the recursion above may have emptied this folder.
    remaining = os.listdir(path)
    if not remaining and removeRoot:
        print("Removing empty folder:", path)
        os.rmdir(path)
def move_to_target(validset, target):
    """Move files out of the ``train`` folder into a sibling ``target`` folder.

    For each path in ``validset`` the first path component that is exactly
    ``'train'`` is swapped for ``target``; the rest of the path (class folder
    and file name) is kept. Only a whole component is replaced, so a file such
    as ``train/cake/train_01.jpg`` keeps its name instead of being renamed to
    ``valid_01.jpg``. Paths with no ``train`` component are left where they are.

    Args:
        validset: iterable of file paths located under a ``train`` directory.
        target: name of the directory that replaces ``train`` (e.g. 'valid').

    Returns:
        True once all files have been processed.
    """
    for file in validset:
        parts = file.split(os.sep)
        if 'train' in parts:
            parts[parts.index('train')] = target
        dst = os.sep.join(parts)
        dst_dir = os.path.dirname(dst)
        if dst_dir:  # os.makedirs('') raises, so skip it for bare file names
            os.makedirs(dst_dir, exist_ok=True)
        os.rename(file, dst)
    print('moved files from train to {0}.'.format(target))
    return True
def convert_to_test(PATH):
    """Flatten ``PATH`` into an unlabelled test folder of ``<k>.jpg`` files.

    Every file found under ``PATH`` is moved directly into ``PATH`` and given a
    numeric ``.jpg`` name (the extension is changed by renaming only, the file
    contents are not converted). Empty sub-folders are removed afterwards via
    ``remove_empty_folders``.

    A number already used by a different existing file is skipped, so running
    this on a folder that already holds ``0.jpg, 1.jpg, ...`` never overwrites
    one image with another.
    """
    # Snapshot the file list first so renames cannot affect what is visited.
    sources = [
        os.path.join(root, name)
        for root, _subdirs, files in os.walk(PATH)
        for name in files
    ]
    k = 0
    for src in sources:
        dst = os.path.join(PATH, '{0}.jpg'.format(k))
        while dst != src and os.path.exists(dst):
            k += 1
            dst = os.path.join(PATH, '{0}.jpg'.format(k))
        if dst != src:
            os.rename(src, dst)
        k += 1
    remove_empty_folders(PATH)
def convert_to_jpg(file):
    """Convert the image at ``file`` to a ``.jpg`` next to it, deleting bad files.

    Behaviour:
      * a file PIL cannot open is deleted;
      * a file whose extension is already ``.jpg`` is left untouched;
      * any other image is re-encoded as JPEG under the same base name, the
        original is deleted, and the new file is passed to ``plt.imshow`` as a
        load check;
      * if conversion or the load check fails, the original and any converted
        output are deleted (best-effort cleanup).

    Fixes over the previous version: returns after deleting an unopenable file
    (it used to fall through and try to delete it a second time); splits the
    extension with ``os.path.splitext`` so paths containing more than one dot
    are not treated as failures; saves with the Pillow format name ``'JPEG'``
    (``'JPG'`` is not a registered format, so every conversion failed and the
    source image was deleted); converts to RGB first because JPEG cannot store
    alpha or palette modes.
    """
    try:
        im = Image.open(file)
    except Exception:
        os.remove(file)
        return

    name, ext = os.path.splitext(file)
    if ext == '.jpg':
        return

    target = name + '.jpg'
    try:
        im.convert('RGB').save(target, 'JPEG')
        os.remove(file)
        im = Image.open(target)
        plt.imshow(im)
    except Exception:
        # Remove whatever is left behind; either file may already be gone.
        for leftover in (file, target):
            if os.path.exists(leftover):
                os.remove(leftover)
In [ ]:
already_moved_valid = True
if not already_moved_valid:
trainset, validset= train_test_split(get_all_filepaths(PATH), test_size=.2)
move_to_target(validset, 'valid')
In [ ]:
# make testset
already_moved_test = True
if not already_moved_test:
trainset, testset = train_test_split(get_all_filepaths(PATH), test_size=.2)
move_to_target(testset, 'test1')
convert_to_test(PATH+'test1')
In [ ]:
import warnings
warnings.filterwarnings("error")
In [ ]:
# Run convert_to_jpg over every file in the train, valid and test1 folders,
# printing each visited directory with its file count.
dirs_to_convert = ['train', 'valid', 'test1']
for dirs in dirs_to_convert:
    for root, subdirs, files in os.walk(PATH+dirs):
        print(root, len(files))
        for file in files:
            convert_to_jpg(os.path.join(root,file))
In [ ]:
# Check images again after conversion: delete any file that still cannot be
# opened by PIL or displayed by matplotlib.
dirs_to_check = ['train', 'valid', 'test1']
for dirs in dirs_to_check:
    for root, subdirs, files in os.walk(PATH+dirs):
        print(root, len(files))
        for file in files:
            fpath = os.path.join(root, file)
            try:
                # Opening is inside the try so an unreadable file is removed
                # instead of aborting the whole scan.
                im = Image.open(fpath)
                plt.imshow(im)
            except Exception:
                os.remove(fpath)
In [ ]:
# make sample
In [ ]:
# Build PATH/sample: mirror the folder layout of PATH and copy up to 50
# randomly chosen files from each folder into it.
inputpath = PATH
outputpath = os.path.join(PATH, 'sample')
for dirpath, dirnames, filenames in os.walk(inputpath):
    structure = os.path.join(outputpath, dirpath[len(inputpath):])
    if os.path.join(outputpath, 'sample') in structure:
        # This is the sample tree itself (it lives inside PATH); skip it.
        continue
    if os.path.isdir(structure):
        print("Folder {0} already exists! {1} files contained.".format(structure, len(os.listdir(structure))))
        continue
    os.mkdir(structure)
    if len(filenames) > 0:
        # Folders with fewer than 50 files are copied whole; sampling with
        # replacement would copy some files repeatedly and drop others.
        n_copy = min(50, len(filenames))
        files_to_copy = np.random.choice(filenames, n_copy, replace=False)
        for file in files_to_copy:
            copyanything(os.path.join(dirpath, file), os.path.join(structure,file))
Our library will assume that you have train and valid directories. It also assumes that each dir will have subdirs for each class you wish to recognize (in this case, 'cats' and 'dogs').
In [ ]:
!ls {PATH}
In [ ]:
!ls {PATH}valid
In [ ]:
files = !ls {PATH}valid/strawberry_cake | head
files
In [ ]:
img = plt.imread(f'{PATH}valid/strawberry_cake/{files[0]}')
plt.imshow(img);
Here is what the raw data looks like:
In [ ]:
img.shape
In [ ]:
img[:4,:4]
We're going to use a pre-trained model, that is, a model created by some one else to solve a different problem. Instead of building a model from scratch to solve a similar problem, we'll use a model trained on ImageNet (1.2 million images and 1000 classes) as a starting point. The model is a Convolutional Neural Network (CNN), a type of Neural Network that builds state-of-the-art models for computer vision. We'll be learning all about CNNs during this course.
We will be using the resnet34 model. resnet34 is a version of the model that won the 2015 ImageNet competition. Here is more info on resnet models. We'll be studying them in depth later, but for now we'll focus on using them effectively.
Here's how to train and evaluate a dogs vs cats model in 3 lines of code, and under 20 seconds:
In [ ]:
# Uncomment the below if you need to reset your precomputed activations
# !rm -rf {PATH}tmp
In [ ]:
%pdb
In [ ]:
%%writefile ../../fastai/dataset.py
# %load ../../fastai/dataset.py
from .imports import *
from .torch_imports import *
from .core import *
from .transforms import *
from .layer_optimizer import *
from .dataloader import DataLoader
def get_cv_idxs(n, cv_idx=0, val_pct=0.2, seed=42):
    """Return the indices of one validation fold out of ``n`` items.

    Seeds NumPy's global RNG with ``seed``, permutes ``range(n)`` and returns
    the ``cv_idx``-th slice of length ``int(val_pct*n)``.
    """
    np.random.seed(seed)
    fold_size = int(val_pct*n)
    first = cv_idx*fold_size
    shuffled = np.random.permutation(n)
    return shuffled[first:first+fold_size]
def resize_img(fname, targ, path, new_path):
    """Write a resized RGB copy of ``path/fname`` to ``path/new_path/<targ>/fname``.

    Does nothing when the destination already exists. Both output dimensions
    come from ``scale_to`` using the ratio ``targ / min(width, height)``.
    """
    dest = os.path.join(path,new_path,str(targ),fname)
    if not os.path.exists(dest):
        im = Image.open(os.path.join(path, fname)).convert('RGB')
        r,c = im.size
        ratio = targ/min(r,c)
        new_size = (scale_to(r, ratio, targ), scale_to(c, ratio, targ))
        dest_dir = os.path.split(dest)[0]
        os.makedirs(dest_dir, exist_ok=True)
        resized = im.resize(new_size, Image.LINEAR)
        resized.save(dest)
def resize_imgs(fnames, targ, path, new_path):
    """Resize every image in ``fnames`` into ``path/new_path/<targ>/``.

    The work is skipped when the resized copy of the first file already
    exists. Resizing runs on 8 threads with a tqdm progress bar.

    Previously the worker always wrote to the hard-coded ``'tmp'`` folder
    while the existence check and the returned path used ``new_path``, so any
    ``new_path`` other than 'tmp' returned a directory that was never filled.

    Returns:
        The directory that holds the resized images.
    """
    if not os.path.exists(os.path.join(path,new_path,str(targ),fnames[0])):
        with ThreadPoolExecutor(8) as e:
            ims = e.map(lambda x: resize_img(x, targ, path, new_path), fnames)
            # Draining the iterator waits for the workers and drives the bar.
            for x in tqdm(ims, total=len(fnames), leave=False): pass
    return os.path.join(path,new_path,str(targ))
def read_dir(path, folder):
    """List the files matching ``*.*`` in ``path/folder``, relative to ``path``.

    The glob result is materialised into a list before it is tested.
    Previously ``any()`` was called on the ``iglob`` iterator, which consumed
    its first item, so the first file was missing from the returned list.

    Raises:
        FileNotFoundError: when the folder does not exist or has no such files.
    """
    full_path = os.path.join(path, folder)
    fnames = list(iglob(f"{full_path}/*.*"))
    if not fnames:
        raise FileNotFoundError("{} folder doesn't exist or is empty".format(folder))
    return [os.path.relpath(f,path) for f in fnames]
def read_dirs(path, folder):
    """Collect files and labels from ``path/folder``, one sub-folder per label.

    Returns:
        (filenames, labels, all_labels): ``filenames`` are relative to ``path``
        (``folder/label/file``), ``labels`` is the label of each file in the
        same order, and ``all_labels`` is the sorted list of sub-folder names.
    """
    full_path = os.path.join(path, folder)
    all_labels = sorted(os.listdir(full_path))
    filenames, labels = [], []
    for label in all_labels:
        for fname in os.listdir(os.path.join(full_path, label)):
            filenames.append(os.path.join(folder, label, fname))
            labels.append(label)
    return filenames, labels, all_labels
def n_hot(ids, c):
    """Return a float32 vector of length ``c`` with ones at positions ``ids``."""
    encoded = np.zeros(c, dtype=np.float32)
    encoded[ids] = 1
    return encoded
def folder_source(path, folder):
    """Read a label-per-sub-folder dataset and encode its labels as integers.

    Returns:
        (fnames, label_arr, all_labels): file names as given by ``read_dirs``,
        an int array holding each file's index into ``all_labels``, and the
        list of labels itself.
    """
    fnames, lbls, all_labels = read_dirs(path, folder)
    label2idx = {v:k for k,v in enumerate(all_labels)}
    idxs = [label2idx[lbl] for lbl in lbls]
    label_arr = np.array(idxs, dtype=int)
    return fnames, label_arr, all_labels
def parse_csv_labels(fn, skip_header=True):
    """Parse a two-column ``name,labels`` CSV where labels are space-separated.

    The file is opened in a ``with`` block so the handle is closed (it used to
    be leaked), and blank lines are ignored instead of breaking the two-column
    unpacking.

    Args:
        fn: path of the CSV file.
        skip_header: drop the first line when true.

    Returns:
        (fnames, csv_labels, all_labels, label2idx): the sorted file names, a
        dict mapping file name to its list of labels, the sorted list of
        distinct labels, and a dict mapping each label to its index.
    """
    skip = 1 if skip_header else 0
    with open(fn) as f:
        csv_lines = [o.strip().split(',') for o in f][skip:]
    csv_lines = [row for row in csv_lines if row != ['']]
    fnames = [fname for fname, _ in csv_lines]
    csv_labels = {a:b.split(' ') for a,b in csv_lines}
    all_labels = sorted({p for o in csv_labels.values() for p in o})
    label2idx = {v:k for k,v in enumerate(all_labels)}
    return sorted(fnames), csv_labels, all_labels, label2idx
def nhot_labels(label2idx, csv_labels, fnames, c):
    """Stack the n-hot encoding of each file's labels, in ``fnames`` order.

    Every entry of ``csv_labels`` is encoded with ``n_hot`` over ``c`` classes;
    the result has one row per name in ``fnames``.
    """
    encoded = {}
    for name, tags in csv_labels.items():
        encoded[name] = n_hot([label2idx[tag] for tag in tags], c)
    rows = [encoded[name] for name in fnames]
    return np.stack(rows)
def csv_source(folder, csv_file, skip_header=True, suffix='', continuous=False):
    """Load file names and labels described by a CSV file.

    Args:
        folder: folder prefix joined onto every file name.
        csv_file: CSV path handed to ``parse_csv_labels``.
        skip_header: forwarded to ``parse_csv_labels``.
        suffix: appended to each file name (e.g. an extension).
        continuous: when true, labels are converted to a float32 array as-is.

    Returns:
        (full_names, label_arr, all_labels). For non-continuous labels,
        ``label_arr`` is n-hot encoded, collapsed to class indices when every
        row has exactly one label.
    """
    fnames,csv_labels,all_labels,label2idx = parse_csv_labels(csv_file, skip_header)
    full_names = [os.path.join(folder,fn+suffix) for fn in fnames]
    if continuous:
        label_arr = np.array([csv_labels[i] for i in fnames]).astype(np.float32)
        return full_names, label_arr, all_labels

    label_arr = nhot_labels(label2idx, csv_labels, fnames, len(all_labels))
    if np.all(label_arr.sum(axis=1)==1):
        # Single-label data: use plain class indices instead of n-hot rows.
        label_arr = np.argmax(label_arr, axis=1)
    return full_names, label_arr, all_labels
class BaseDataset(Dataset):
    """Base class for datasets that pair an input ``x`` with a target ``y``.

    Subclasses supply ``get_n``, ``get_c``, ``get_sz``, ``get_x`` and
    ``get_y``. Their results for n, c and sz are cached on the instance at
    construction time. Indexing returns ``(x, y)``, passed through
    ``transform`` when one is set.
    """

    def __init__(self, transform=None):
        self.transform = transform
        self.n = self.get_n()
        self.c = self.get_c()
        self.sz = self.get_sz()

    def __getitem__(self, idx):
        x = self.get_x(idx)
        y = self.get_y(idx)
        return self.get(self.transform, x, y)

    def __len__(self):
        return self.n

    def get(self, tfm, x, y):
        """Apply ``tfm`` to the pair, or return it unchanged when ``tfm`` is None."""
        if tfm is None:
            return (x, y)
        return tfm(x, y)

    @abstractmethod
    def get_n(self):
        raise NotImplementedError

    @abstractmethod
    def get_c(self):
        raise NotImplementedError

    @abstractmethod
    def get_sz(self):
        raise NotImplementedError

    @abstractmethod
    def get_x(self, i):
        raise NotImplementedError

    @abstractmethod
    def get_y(self, i):
        raise NotImplementedError

    @property
    def is_multi(self):
        return False

    @property
    def is_reg(self):
        return False
def open_image(fn):
    """ Opens an image using OpenCV given the file path.

    Arguments:
        fn: the file path of the image

    Returns:
        The numpy array representation of the image in the RGB format, as
        float32 divided by 255. When the path is missing, is a directory, or
        reading fails, a message is printed and None is returned.
    """
    flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
    if not os.path.exists(fn):
        print('No such file or directory: {}'.format(fn))
        return None
    if os.path.isdir(fn):
        print('Is a directory: {}'.format(fn))
        return None
    try:
        bgr = cv2.imread(fn, flags)
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB).astype(np.float32)/255
    except Exception as e:
        print(fn, e)
        return None
class FilesDataset(BaseDataset):
    """Dataset whose inputs are image files under ``path`` (Justin modified get_x).

    ``get_n`` and ``resize_imgs`` read ``self.y``, which this class does not
    set; the subclasses in this module assign it before calling ``__init__``.
    """

    def __init__(self, fnames, transform, path):
        self.path = path
        self.fnames = fnames
        super().__init__(transform)

    def get_n(self):
        return len(self.y)

    def get_sz(self):
        return self.transform.sz

    def get_x(self, i):
        return open_image(os.path.join(self.path, self.fnames[i]))

    def resize_imgs(self, targ, new_path):
        """Resize all files and return a new dataset of the same class over them."""
        dest = resize_imgs(self.fnames, targ, self.path, new_path)
        return self.__class__(self.fnames, self.y, self.transform, dest)

    def denorm(self,arr):
        """Reverse the normalization done to a batch of images.

        Arguments:
            arr: of shape/size (N,3,sz,sz)
        """
        if type(arr) is not np.ndarray:
            arr = to_np(arr)
        if len(arr.shape) == 3:
            # Single image: add a leading batch axis.
            arr = arr[None]
        channels_last = np.rollaxis(arr,1,4)
        return self.transform.denorm(channels_last)
class FilesArrayDataset(FilesDataset):
    """File-backed dataset whose targets come from the array ``y``.

    ``y`` is stored before the parent constructor runs because the parent's
    ``get_n`` reads it. The class count is ``y.shape[1]``.
    """

    def __init__(self, fnames, y, transform, path):
        self.y = y
        assert len(fnames) == len(y)
        super().__init__(fnames, transform, path)

    def get_y(self, i):
        return self.y[i]

    def get_c(self):
        return self.y.shape[1]
class FilesIndexArrayDataset(FilesArrayDataset):
    """File-backed dataset whose targets are integer class indices."""

    def get_c(self):
        # Class count is the largest index plus one.
        highest = int(self.y.max())
        return highest + 1
class FilesNhotArrayDataset(FilesArrayDataset):
    """File-backed dataset that reports itself as multi-label."""

    @property
    def is_multi(self):
        return True
class FilesIndexArrayRegressionDataset(FilesArrayDataset):
    """File-backed dataset that reports itself as a regression dataset."""

    # Declared as a property to match BaseDataset.is_reg. As a plain method,
    # `ds.is_reg` evaluated to a bound method (truthy only by accident)
    # instead of the boolean True.
    @property
    def is_reg(self): return True
class ArraysDataset(BaseDataset):
    """Dataset backed by two in-memory arrays ``x`` and ``y`` of equal length.

    The size reported by ``get_sz`` is ``x.shape[1]``.
    """

    def __init__(self, x, y, transform):
        self.x = x
        self.y = y
        assert len(x) == len(y)
        super().__init__(transform)

    def get_x(self, i):
        return self.x[i]

    def get_y(self, i):
        return self.y[i]

    def get_n(self):
        return len(self.y)

    def get_sz(self):
        return self.x.shape[1]
class ArraysIndexDataset(ArraysDataset):
    """Array-backed dataset whose targets are integer class indices."""

    def get_c(self):
        # Class count is the largest index plus one.
        highest = int(self.y.max())
        return highest + 1

    def get_y(self, i):
        return self.y[i]
class ArraysNhotDataset(ArraysDataset):
    """Array-backed multi-label dataset; class count is ``y.shape[1]``."""

    def get_c(self):
        return self.y.shape[1]

    @property
    def is_multi(self):
        return True
class ModelData:
    """Bundle of a data path with training, validation and optional test loaders.

    The ``*_ds`` properties expose each loader's ``dataset``; ``trn_y`` and
    ``val_y`` expose the ``y`` attribute of those datasets.
    """

    def __init__(self, path, trn_dl, val_dl, test_dl=None):
        self.path = path
        self.trn_dl = trn_dl
        self.val_dl = val_dl
        self.test_dl = test_dl

    @classmethod
    def from_dls(cls, path,trn_dl,val_dl,test_dl=None):
        """Wrap the given loaders in ``ModelDataLoader`` and build an instance."""
        wrapped_trn = ModelDataLoader(trn_dl)
        wrapped_val = ModelDataLoader(val_dl)
        if test_dl:
            test_dl = ModelDataLoader(test_dl)
        return cls(path, wrapped_trn, wrapped_val, test_dl)

    @property
    def is_reg(self):
        return self.trn_ds.is_reg

    @property
    def trn_ds(self):
        return self.trn_dl.dataset

    @property
    def val_ds(self):
        return self.val_dl.dataset

    @property
    def test_ds(self):
        return self.test_dl.dataset

    @property
    def trn_y(self):
        return self.trn_ds.y

    @property
    def val_y(self):
        return self.val_ds.y
class ModelDataLoader:
    """Iterator wrapper around a data loader that stops after ``len(dl)`` items.

    The iteration state lives on the instance, so each call to ``iter()``
    restarts it.
    """

    def __init__(self, dl):
        self.dl = dl

    @classmethod
    def create_dl(cls, *args, **kwargs):
        """Build a ``DataLoader`` from the arguments and wrap it."""
        return cls(DataLoader(*args, **kwargs))

    def __iter__(self):
        self.it = iter(self.dl)
        self.i = 0
        return self

    def __len__(self):
        return len(self.dl)

    def __next__(self):
        if self.i < len(self.dl):
            self.i += 1
            return next(self.it)
        raise StopIteration

    @property
    def dataset(self):
        return self.dl.dataset
class ImageData(ModelData):
    """Model data for images, built from six datasets.

    ``datasets`` is unpacked as (train, val, fix, aug, test, test_aug). Each
    one that is not None gets its own data loader; only the training loader
    shuffles.
    """

    def __init__(self, path, datasets, bs, num_workers, classes):
        trn_ds,val_ds,fix_ds,aug_ds,test_ds,test_aug_ds = datasets
        self.path = path
        self.bs = bs
        self.num_workers = num_workers
        self.classes = classes
        self.trn_dl = self.get_dl(trn_ds, True)
        self.val_dl = self.get_dl(val_ds, False)
        self.fix_dl = self.get_dl(fix_ds, False)
        self.aug_dl = self.get_dl(aug_ds, False)
        self.test_dl = self.get_dl(test_ds, False)
        self.test_aug_dl = self.get_dl(test_aug_ds, False)

    def get_dl(self, ds, shuffle):
        """Return a loader for ``ds``, or None when ``ds`` is None."""
        if ds is None:
            return None
        return ModelDataLoader.create_dl(ds, batch_size=self.bs, shuffle=shuffle,
            num_workers=self.num_workers, pin_memory=False)

    @property
    def sz(self):
        return self.trn_ds.sz

    @property
    def c(self):
        return self.trn_ds.c

    def resized(self, dl, targ, new_path):
        """Return the resized dataset behind ``dl``, or None when ``dl`` is falsy."""
        if not dl:
            return None
        return dl.dataset.resize_imgs(targ,new_path)

    def resize(self, targ, new_path):
        """Resize every dataset to ``targ`` and return a new object of this class."""
        dls = [self.trn_dl,self.val_dl,self.fix_dl,self.aug_dl]
        if self.test_dl:
            dls += [self.test_dl, self.test_aug_dl]
        else:
            dls += [None,None]
        new_ds = []
        t = tqdm_notebook(dls)
        for dl in t:
            new_ds.append(self.resized(dl, targ, new_path))
        t.close()
        return self.__class__(new_ds[0].path, new_ds, self.bs, self.num_workers, self.classes)
class ImageClassifierData(ImageData):
    """Image data for classification, with constructors for arrays, folders and CSV labels."""

    @property
    def is_multi(self): return self.trn_dl.dataset.is_multi

    @staticmethod
    def get_ds(fn, trn, val, tfms, test=None, **kwargs):
        """Build the six datasets that ``ImageData`` expects.

        Arguments:
            fn: dataset class (or factory) called as ``fn(x, y, tfm, **kwargs)``
            trn: tuple whose first two items are the training inputs and labels
            val: tuple whose first two items are the validation inputs and labels
            tfms: pair of transforms; ``tfms[0]`` is used for train/aug,
                ``tfms[1]`` for val/fix/test
            test: optional test inputs; they get all-zero placeholder labels
        Returns:
            list ``[train, val, fix, aug, test, test_aug]``; the last two are
            None when ``test`` is None
        """
        res = [
            fn(trn[0], trn[1], tfms[0], **kwargs), # train
            fn(val[0], val[1], tfms[1], **kwargs), # val
            fn(trn[0], trn[1], tfms[1], **kwargs), # fix
            fn(val[0], val[1], tfms[0], **kwargs)  # aug
        ]
        if test is not None:
            test_lbls = np.zeros((len(test),1))
            res += [
                fn(test, test_lbls, tfms[1], **kwargs), # test
                fn(test, test_lbls, tfms[0], **kwargs)  # test_aug
            ]
        else: res += [None,None]
        return res

    @classmethod
    def from_arrays(cls, path, trn, val, bs=64, tfms=(None,None), classes=None, num_workers=4, test=None):
        """ Read in images and their labels given as numpy arrays

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            trn: a tuple of training data matrix and target label/classification array (e.g. `trn=(x,y)` where `x` has the
                shape of `(5000, 784)` and `y` has the shape of `(5000,)`)
            val: a tuple of validation data matrix and target label/classification array.
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            classes: a list of all labels/classifications
            num_workers: a number of workers
            test: a matrix of test data (the shape should match `trn[0]`)

        Returns:
            ImageClassifierData
        """
        datasets = cls.get_ds(ArraysIndexDataset, trn, val, tfms, test=test)
        return cls(path, datasets, bs, num_workers, classes=classes)

    @classmethod
    def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, num_workers=8):
        """ Read in images and their labels given as sub-folder names

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            trn_name: a name of the folder that contains training images.
            val_name:  a name of the folder that contains validation images.
            test_name:  a name of the folder that contains test images.
            num_workers: number of workers

        Returns:
            ImageClassifierData
        """
        trn,val = [folder_source(path, o) for o in (trn_name, val_name)]
        test_fnames = read_dir(path, test_name) if test_name else None
        datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test_fnames)
        return cls(path, datasets, bs, num_workers, classes=trn[2])

    @classmethod
    def from_csv(cls, path, folder, csv_fname, bs=64, tfms=(None,None),
               val_idxs=None, suffix='', test_name=None, continuous=False, skip_header=True, num_workers=8):
        """ Read in images and their labels given as a CSV file.

        This method should be used when training image labels are given in an CSV file as opposed to
        sub-directories with label names.

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            folder: a name of the folder in which training images are contained.
            csv_fname: a name of the CSV file which contains target labels.
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
            val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.
                When None, `get_cv_idxs(len(fnames))` is used.
            suffix: suffix to add to image names in CSV file (sometimes CSV only contains the file name without file
                    extension e.g. '.jpg' - in which case, you can set suffix as '.jpg')
            test_name: a name of the folder which contains test images.
            continuous: when True, labels are read as float32 values and a regression dataset is built.
            skip_header: skip the first row of the CSV file.
            num_workers: number of workers

        Returns:
            ImageClassifierData
        """
        fnames,y,classes = csv_source(folder, csv_fname, skip_header, suffix, continuous=continuous)
        # The default val_idxs=None used to reach split_by_idx unchanged, where
        # indexing the mask with np.array(None) fails; fall back to a CV split.
        if val_idxs is None: val_idxs = get_cv_idxs(len(fnames))
        ((val_fnames,trn_fnames),(val_y,trn_y)) = split_by_idx(val_idxs, np.array(fnames), y)

        test_fnames = read_dir(path, test_name) if test_name else None
        if continuous:
            f = FilesIndexArrayRegressionDataset
        else:
            # 1-d labels are class indices; 2-d labels are n-hot rows.
            f = FilesIndexArrayDataset if len(trn_y.shape)==1 else FilesNhotArrayDataset
        datasets = cls.get_ds(f, (trn_fnames,trn_y), (val_fnames,val_y), tfms,
                               path=path, test=test_fnames)
        return cls(path, datasets, bs, num_workers, classes=classes)
def split_by_idx(idxs, *a):
    """Split each array in ``a`` into (rows at ``idxs``, all other rows).

    All arrays are indexed with one boolean mask of length ``len(a[0])``.
    Returns a list with one ``(selected, rest)`` pair per input array.
    """
    selector = np.zeros(len(a[0]),dtype=bool)
    selector[np.array(idxs)] = True
    pairs = []
    for arr in a:
        pairs.append((arr[selector], arr[~selector]))
    return pairs
In [ ]:
arch=resnet34
data = ImageClassifierData.from_paths(PATH, tfms=tfms_from_model(arch, sz))
learn = ConvLearner.pretrained(arch, data, precompute=True)
In [ ]:
learn.fit(0.01, 3)
In [ ]:
learn.fit(0.01,3)
In [ ]:
learn.fit(0.01,2)
How good is this model? Well, as we mentioned, prior to this competition, the state of the art was 80% accuracy. But the competition resulted in a huge jump to 98.9% accuracy, with the author of a popular deep learning library winning the competition. Extraordinarily, less than 4 years later, we can now beat that result in seconds! Even last year in this same course, our initial model had 98.3% accuracy, which is nearly double the error we're getting just a year later, and that took around 10 minutes to compute.
As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
In [ ]:
# This is the label for a val data
data.val_y
In [ ]:
# from here we know that 'cats' is label 0 and 'dogs' is label 1.
data.classes
In [ ]:
# this gives prediction for validation set. Predictions are in log scale
log_preds = learn.predict()
log_preds.shape
In [ ]:
log_preds[:10]
In [ ]:
preds = np.argmax(log_preds, axis=1) # from log probabilities to 0 or 1
probs = np.exp(log_preds[:,1]) # pr(dog)
In [ ]:
def rand_by_mask(mask):
    """Return up to 4 distinct random indices at which ``mask`` is true.

    When fewer than 4 positions match, all of them are returned (in random
    order) instead of ``np.random.choice`` raising ValueError.
    """
    candidates = np.where(mask)[0]
    return np.random.choice(candidates, min(4, len(candidates)), replace=False)
# Random validation indices whose prediction correctness equals `is_correct`.
# Reads the notebook globals `preds` and `data.val_y`.
def rand_by_correct(is_correct): return rand_by_mask((preds == data.val_y)==is_correct)
In [ ]:
# def plot_val_with_title(idxs, title):
# imgs = np.stack([data.val_ds[x][0] for x in idxs])
# title_probs = [probs[x] for x in idxs]
# print(title)
# return plots(data.val_ds.denorm(imgs), rows=1, titles=title_probs)
In [ ]:
def plots(ims, figsize=(12,6), rows=1, titles=None):
    """Show the images in ``ims`` on a grid with ``rows`` rows.

    Args:
        ims: sequence of images accepted by ``plt.imshow``.
        figsize: figure size passed to ``plt.figure``.
        rows: number of grid rows.
        titles: optional sequence of per-image titles.

    The column count is rounded up. With floor division, a number of images
    that is not a multiple of ``rows`` produced too few grid cells and
    ``add_subplot`` raised for the last images.
    """
    f = plt.figure(figsize=figsize)
    cols = -(-len(ims) // rows)  # ceiling division
    for i in range(len(ims)):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None: sp.set_title(titles[i], fontsize=16)
        plt.imshow(ims[i])
In [ ]:
# Load image number `idx` of dataset `ds` from disk as a numpy array.
# The file path is the notebook global PATH concatenated with ds.fnames[idx].
def load_img_id(ds, idx): return np.array(PIL.Image.open(PATH+ds.fnames[idx]))
def plot_val_with_title(idxs, title):
    """Print ``title`` and plot the validation images at ``idxs`` in one row.

    Each image is titled with its entry in the notebook global ``probs``.
    """
    imgs = []
    for x in idxs:
        imgs.append(load_img_id(data.val_ds,x))
    title_probs = []
    for x in idxs:
        title_probs.append(probs[x])
    print(title)
    return plots(imgs, rows=1, titles=title_probs, figsize=(16,8))
In [ ]:
# 1. A few correct labels at random
plot_val_with_title(rand_by_correct(True), "Correctly classified")
In [ ]:
# 2. A few incorrect labels at random
plot_val_with_title(rand_by_correct(False), "Incorrectly classified")
In [ ]:
def most_by_mask(mask, mult):
    """Return the 4 masked indices with the smallest ``mult * probs``.

    ``probs`` is the notebook global; ``mult=-1`` therefore selects the
    largest probabilities and ``mult=1`` the smallest.
    """
    candidates = np.where(mask)[0]
    order = np.argsort(mult * probs[candidates])
    return candidates[order[:4]]
def most_by_correct(y, is_correct):
    """Return the 4 most extreme validation examples of class ``y``.

    Selects examples whose true label is ``y`` and whose prediction
    correctness equals ``is_correct``, ranked by the global ``probs``.

    The correctness test is parenthesised explicitly: ``&`` binds tighter than
    ``==``, so the previous expression compared ``preds == val_y`` against
    ``is_correct & (val_y == y)`` and selected the wrong examples.
    """
    mult = -1 if (y==1)==is_correct else 1
    mask = ((preds == data.val_y)==is_correct) & (data.val_y == y)
    return most_by_mask(mask, mult)
In [ ]:
plot_val_with_title(most_by_correct(0, True), "Most correct cats")
In [ ]:
plot_val_with_title(most_by_correct(1, True), "Most correct dogs")
In [ ]:
plot_val_with_title(most_by_correct(0, False), "Most incorrect cats")
In [ ]:
plot_val_with_title(most_by_correct(1, False), "Most incorrect dogs")
In [ ]:
most_uncertain = np.argsort(np.abs(probs -0.5))[:4]
plot_val_with_title(most_uncertain, "Most uncertain predictions")
The learning rate determines how quickly or how slowly you want to update the weights (or parameters). Learning rate is one of the most difficult parameters to set, because it significantly affects model performance.
The method learn.lr_find()
helps you find an optimal learning rate. It uses the technique developed in the 2015 paper Cyclical Learning Rates for Training Neural Networks, where we simply keep increasing the learning rate from a very small value, until the loss stops decreasing. We can plot the learning rate across batches to see what this looks like.
We first create a new learner, since we want to know how to set the learning rate for a new (untrained) model.
In [ ]:
learn = ConvLearner.pretrained(arch, data, precompute=True)
In [ ]:
lrf=learn.lr_find()
Our learn
object contains an attribute sched
that contains our learning rate scheduler, and has some convenient plotting functionality including this one:
In [ ]:
learn.sched.plot_lr()
Note that in the previous plot, an iteration is one iteration (or minibatch) of SGD. In one epoch there are (num_train_samples/batch_size) iterations of SGD.
We can see the plot of loss versus learning rate to see where our loss stops decreasing:
In [ ]:
learn.sched.plot()
The loss is still clearly improving at lr=1e-2 (0.01), so that's what we use. Note that the optimal learning rate can change as we train the model, so you may want to re-run this function from time to time.
If you try training for more epochs, you'll notice that we start to overfit, which means that our model is learning to recognize the specific images in the training set, rather than generalizing such that we also get good results on the validation set. One way to fix this is to effectively create more data, through data augmentation. This refers to randomly changing the images in ways that shouldn't impact their interpretation, such as horizontal flipping, zooming, and rotating.
We can do this by passing aug_tfms
(augmentation transforms) to tfms_from_model
, with a list of functions to apply that randomly change the image however we wish. For photos that are largely taken from the side (e.g. most photos of dogs and cats, as opposed to photos taken from the top down, such as satellite imagery) we can use the pre-defined list of functions transforms_side_on
. We can also specify random zooming of images up to specified scale by adding the max_zoom
parameter.
In [ ]:
tfms = tfms_from_model(resnet34, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
In [ ]:
def get_augs():
    """Return one augmented, de-normalized image from the ``aug_dl`` loader.

    Builds a fresh data object (batch size 2, one worker) using the notebook
    globals ``PATH`` and ``tfms``, takes the first batch from ``aug_dl`` and
    returns its second image after ``denorm``.
    """
    data = ImageClassifierData.from_paths(PATH, bs=2, tfms=tfms, num_workers=1)
    first_batch = next(iter(data.aug_dl))
    x, _ = first_batch
    return data.trn_ds.denorm(x)[1]
In [ ]:
ims = np.stack([get_augs() for i in range(6)])
In [ ]:
plots(ims, rows=2)
Let's create a new data
object that includes this augmentation in the transforms.
In [ ]:
data = ImageClassifierData.from_paths(PATH, tfms=tfms)
learn = ConvLearner.pretrained(arch, data, precompute=True)
In [ ]:
learn.fit(1e-2, 1)
In [ ]:
learn.precompute=False
By default when we create a learner, it sets all but the last layer to frozen. That means that it's still only updating the weights in the last layer when we call fit
.
In [ ]:
learn.fit(1e-2, 30, cycle_len=1)
What is that cycle_len
parameter? What we've done here is used a technique called stochastic gradient descent with restarts (SGDR), a variant of learning rate annealing, which gradually decreases the learning rate as training progresses. This is helpful because as we get closer to the optimal weights, we want to take smaller steps.
However, we may find ourselves in a part of the weight space that isn't very resilient - that is, small changes to the weights may result in big changes to the loss. We want to encourage our model to find parts of the weight space that are both accurate and stable. Therefore, from time to time we increase the learning rate (this is the 'restarts' in 'SGDR'), which will force the model to jump to a different part of the weight space if the current area is "spikey". Here's a picture of how that might look if we reset the learning rates 3 times (in this paper they call it a "cyclic LR schedule"):
The number of epochs between resetting the learning rate is set by cycle_len
, and the number of times this happens is referred to as the number of cycles, and is what we're actually passing as the 2nd parameter to fit()
. So here's what our actual learning rates looked like:
In [ ]:
learn.sched.plot_lr()
Our validation loss isn't improving much, so there's probably no point further training the last layer on its own.
Since we've got a pretty good model at this point, we might want to save it so we can load it again later without training it from scratch.
In [ ]:
learn.save('224_lastlayer')
In [ ]:
learn.load('224_lastlayer')
Now that we have a good final layer trained, we can try fine-tuning the other layers. To tell the learner that we want to unfreeze the remaining layers, just call (surprise surprise!) unfreeze()
.
In [ ]:
learn.unfreeze()
Note that the other layers have already been trained to recognize imagenet photos (whereas our final layers were randomly initialized), so we want to be careful not to destroy the carefully tuned weights that are already there.
Generally speaking, the earlier layers (as we've seen) have more general-purpose features. Therefore we would expect them to need less fine-tuning for new datasets. For this reason we will use different learning rates for different layers: the first few layers will be at 1e-4, the middle layers at 1e-3, and our FC layers we'll leave at 1e-2 as before. We refer to this as differential learning rates, although there's no standard name for this technique in the literature that we're aware of.
In [ ]:
lr=np.array([1e-4,1e-3,1e-2])
In [ ]:
learn.fit(lr, 5, cycle_len=1, cycle_mult=2)
Another trick we've used here is adding the cycle_mult
parameter. Take a look at the following chart, and see if you can figure out what the parameter is doing:
In [ ]:
learn.sched.plot_lr()
Note that what's being plotted above is the learning rate of the final layers. The learning rates of the earlier layers are fixed at the same multiples of the final layer rates as we initially requested (i.e. the first layers have 100x smaller, and middle layers 10x smaller learning rates, since we set lr=np.array([1e-4,1e-3,1e-2])).
In [ ]:
learn.save('224_all')
In [ ]:
learn.load('224_all')
There is something else we can do with data augmentation: use it at inference time (also known as test time). Not surprisingly, this is known as test time augmentation, or just TTA.
TTA simply makes predictions not just on the images in your validation set, but also makes predictions on a number of randomly augmented versions of them too (by default, it uses the original image along with 4 randomly augmented versions). It then takes the average prediction from these images, and uses that. To use TTA on the validation set, we can use the learner's TTA()
method.
In [ ]:
log_preds,y = learn.TTA()
probs = np.mean(np.exp(log_preds),0)
In [ ]:
np.exp(log_preds).shape
In [ ]:
probs.shape
In [ ]:
accuracy(probs, y)
I generally see about a 10-20% reduction in error on this dataset when using TTA at this point, which is an amazing result for such a quick and easy technique!
In [ ]:
preds = np.argmax(probs, axis=1)
probs = probs[:,1]
A common way to analyze the result of a classification model is to use a confusion matrix. Scikit-learn has a convenient function we can use for this purpose:
In [ ]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y, preds)
We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependent variables with a larger number of categories).
In [ ]:
plot_confusion_matrix(cm, data.classes)
In [ ]:
plot_val_with_title(most_by_correct(0, False), "Most incorrect cats")
In [ ]:
plot_val_with_title(most_by_correct(1, False), "Most incorrect dogs")
Use `lr_find()` to find the highest learning rate where loss is still clearly improving
Use `lr_find()` again
Let's look at the Dogs v Cats code line by line.
`tfms` stands for transformations. `tfms_from_model` takes care of resizing, image cropping, initial normalization (creating data with (mean,stdev) of (0,1)), and more.
In [ ]:
tfms = tfms_from_model(resnet34, sz)
We need a path that points to the dataset. In this path we will also store temporary data and final results. `ImageClassifierData.from_paths` reads data from a provided path and creates a dataset ready for training.
In [ ]:
data = ImageClassifierData.from_paths(PATH, tfms=tfms)
In [ ]:
ImageClassifierData.from_paths??
`ConvLearner.pretrained` builds a learner that contains a pre-trained model. The last layer of the model needs to be replaced with a layer of the right dimensions. The pretrained model was trained for 1000 classes, therefore the final layer predicts a vector of 1000 probabilities. The model for cats and dogs needs to output a two-dimensional vector. The diagram below shows an example of how this was done in one of the earliest successful CNNs. The layer "FC8" here would get replaced with a new layer with 2 outputs.
In [ ]:
learn = ConvLearner.pretrained(resnet34, data, precompute=True)
Parameters are learned by fitting a model to the data. Hyperparameters are another kind of parameter, that cannot be directly learned from the regular training process. These parameters express “higher-level” properties of the model such as its complexity or how fast it should learn. Two examples of hyperparameters are the learning rate and the number of epochs.
During iterative training of a neural network, a batch or mini-batch is a subset of training samples used in one iteration of Stochastic Gradient Descent (SGD). An epoch is a single pass through the entire training set which consists of multiple iterations of SGD.
We can now fit the model; that is, use gradient descent to find the best parameters for the fully connected layer we added, that can separate cat pictures from dog pictures. We need to pass two hyperparameters: the learning rate (generally 1e-2 or 1e-3 is a good starting point, we'll look more at this next) and the number of epochs (you can pass in a higher number and just stop training when you see it's no longer improving, then re-run it with the number of epochs you found works well.)
In [ ]:
learn.fit(1e-2, 1)
When we run `learn.fit` we print 3 performance values (see above.) Here 0.03 is the value of the loss in the training set, 0.0226 is the value of the loss in the validation set and 0.9927 is the validation accuracy. What is the loss? What is accuracy? Why not just show accuracy?
Accuracy is the ratio of correct prediction to the total number of predictions.
In machine learning the loss function or cost function is representing the price paid for inaccuracy of predictions.
The loss associated with one example in binary classification is given by:
-(y * log(p) + (1-y) * log (1-p))
where `y` is the true label of `x` and `p` is the probability predicted by our model that the label is 1.
In [ ]:
def binary_loss(y, p, eps=1e-15):
    """Return the mean binary cross-entropy loss of predictions `p` against labels `y`.

    Per example the loss is ``-(y * log(p) + (1 - y) * log(1 - p))``.

    Args:
        y: True labels (0 or 1); scalar, sequence, or NumPy array.
        p: Predicted probabilities that the label is 1; same shape as `y`
            (or broadcastable to it).
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns:
        The loss averaged over all examples, as a NumPy float.
    """
    y = np.asarray(y, dtype=float)
    # Without clipping, p == 0 or p == 1 evaluates 0 * log(0) = nan (or inf),
    # so even a perfect prediction would yield a nan loss.
    p = np.clip(np.asarray(p, dtype=float), eps, 1 - eps)
    return np.mean(-(y * np.log(p) + (1 - y) * np.log(1 - p)))
In [ ]:
# Toy example: four true labels and the predicted probabilities of label 1.
acts = np.array([1, 0, 0, 1])
preds = np.array([0.95, 0.1, 0.2, 0.8])
# Mean binary loss over the four examples (last expression, so it is the cell output).
binary_loss(acts, preds)
Note that in our toy example above our accuracy is 100% and our loss is about 0.15. Compare that to a loss of 0.03 that we are getting while predicting cats and dogs. Exercise: play with `preds` to get a lower loss for this example.
Example: Here is an example of how to compute the loss for one example of a binary classification problem. Suppose an image x has label 1 and your model gives it a prediction of 0.9. For this case the loss should be small because our model is predicting a label $1$ with high probability.
loss = -log(0.9) = 0.10
Now suppose x has label 0 but our model is predicting 0.9. In this case our loss should be much larger.
loss = -log(1-0.9) = 2.30
Exercise: how would you rewrite `binary_loss` using `if` instead of `*` and `+`?
Why not just maximize accuracy? The binary classification loss is an easier function to optimize.
In [ ]: