In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import os
from os import getcwd
from os import listdir
from os import mkdir
from os.path import isfile, join, isdir

import skimage
from skimage import measure
from skimage import io

from PIL import Image


from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from skimage.transform import resize

# from keras.models import Sequential
# from keras.layers import Dropout, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D, Dense, Activation
# from keras.optimizers import RMSprop, Adam
# from keras.callbacks import EarlyStopping
# from keras.utils import np_utils
# from keras import backend as K



In [2]:
import tensorflow as tf

In [3]:
import tflearn
from tflearn.data_utils import shuffle
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
from tflearn.metrics import Accuracy

In [4]:
def get_paths(foldNames):
    """Map each class folder name to the list of full paths of its image files.

    Relies on the module-level trainPath defined in the Setup cell below.
    """
    paths = dict.fromkeys(foldNames)

    for g in foldNames:
        fileNames = [f for f in listdir(join(trainPath, g)) if isfile(join(trainPath, g, f))]
        paths[g] = [join(trainPath, g, f) for f in fileNames]

    return paths
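
For example, once trainPath is set in the Setup cell below, get_paths maps each class folder to a list of full image paths (the class name and file names here are purely illustrative):

In [ ]:
# Hypothetical usage; 'ALB' and the file names are illustrative
paths = get_paths(['ALB'])
paths['ALB'][:2]   # e.g. ['../train/ALB/img_00003.jpg', '../train/ALB/img_00010.jpg']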

Setup


In [5]:
ROWS = 90   # resized image height (originals are ~720)
COLS = 160  # resized image width (originals are ~1280)
CHANNELS = 3
trainPath = '../train'
testPath = '../test_stg1'
fish_classes = [f for f in listdir(trainPath) if isdir(join(trainPath, f))]
groupData = pd.DataFrame({'group': fish_classes})
fish_paths = get_paths(fish_classes)
numFish = len(fish_classes)

Build X and Y arrays


In [6]:
for idx, fish in enumerate(fish_classes):
    groupData.loc[idx, 'num files'] = int(len(fish_paths[fish]))  # .ix is deprecated; use .loc
    
files = []
Y_cat = []

for fish in fish_classes:
    fish_files = fish_paths[fish]
    files.extend(fish_files)
    
    y_fish = np.tile(fish, len(fish_files))
    Y_cat.extend(y_fish)
  
Y_cat = np.array(Y_cat)
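
Note that X_all is never materialized by the cells above, which is why the finetuning cell below fails with a NameError unless the HDF5 path at the end of the notebook is used instead. A minimal in-memory sketch using the io and resize imports from the first cell (assuming every file in files is a readable RGB image):

In [ ]:
# Sketch: read each image and resize it to (ROWS, COLS, CHANNELS) to build X_all
X_all = np.zeros((len(files), ROWS, COLS, CHANNELS), dtype=np.float32)
for i, path in enumerate(files):
    img = io.imread(path)                                  # HxWx3 uint8 array
    X_all[i] = resize(img, (ROWS, COLS), mode='reflect')   # floats in [0, 1]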

Training data

  • One-hot encode the labels (a minimal sketch of to_categorical follows)
  • Split the data into training and validation sets
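
As a quick illustration of what the encoding produces, to_categorical maps integer labels to one-hot rows:

In [ ]:
# Minimal example: three classes; labels 0, 2, 1 become one-hot rows
tflearn.data_utils.to_categorical([0, 2, 1], 3)
# array([[ 1.,  0.,  0.],
#        [ 0.,  0.,  1.],
#        [ 0.,  1.,  0.]])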

In [ ]:
# One-Hot Encoding Labels
#    Transform the categorical array Y_cat into a matrix of the same height,
#    but with a boolean column for each category.
Y_all = LabelEncoder().fit_transform(Y_cat)
# Y_all = np_utils.to_categorical(Y_all)
Y_all = tflearn.data_utils.to_categorical(Y_all, len(fish_classes))

# test_size: between 0 and 1; the proportion of the dataset to include in the validation split.
# random_state: seed for the pseudo-random number generator; any fixed integer
# makes the split reproducible.
# stratify: ensures the split datasets are balanced, i.e. contain the same
# percentage of each class.

X_train, X_valid, Y_train, Y_valid = train_test_split(X_all, Y_all, 
                                                    test_size=0.2, random_state=23, 
                                                    stratify=Y_all)
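
A quick sanity check (a sketch, assuming the split above has run) that stratify preserved the class proportions:

In [ ]:
# Per-class fraction in each split; the two rows should be nearly identical
print(Y_train.sum(axis=0) / len(Y_train))
print(Y_valid.sum(axis=0) / len(Y_valid))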

TFLearn


In [ ]:
def dnn_test1():
    # Building the network and creating the model multiple times without resetting the
    # tf.Graph object produces errors, so wrap everything in a fresh default graph.
    # (Restarting the kernel also works around this, but that's annoying.)

    # Python with statement: the enclosed code uses this graph as the default, which is
    # automatically restored to the previous one on exit.
    with tf.Graph().as_default():
    
        # normalisation of images
        img_prep = ImagePreprocessing()
        img_prep.add_featurewise_zero_center()
        img_prep.add_featurewise_stdnorm()

        # Create extra synthetic training data by flipping & rotating images
        img_aug = ImageAugmentation()
        img_aug.add_random_flip_leftright()
        img_aug.add_random_rotation(max_angle=25.)

        
        # To pin the graph to a specific device or GPU:
        # with tf.device("/gpu:1"):
        
        # Input layer with the shape of the data: rows x cols x RGB channels. The leading
        # None is an unknown dimension for the batch size (the number of samples
        # processed in a batch).
        network = input_data(shape=[None, ROWS, COLS, 3],
                        data_preprocessing=img_prep,
                        data_augmentation=img_aug)
        
        # 1: Convolution layer with 32 filters, each 3x3x3
        conv_1 = conv_2d(network, 32, 3, activation='relu', name='conv_1')

        # 2: Max pooling layer
        network = max_pool_2d(conv_1, 2)

        # 3: Convolution layer with 64 filters
        conv_2 = conv_2d(network, 64, 3, activation='relu', name='conv_2')

        # 4: Convolution layer with 64 filters
        conv_3 = conv_2d(conv_2, 64, 3, activation='relu', name='conv_3')

        # 5: Max pooling layer
        network = max_pool_2d(conv_3, 2)

        # 6: Fully-connected 512 node layer
        network = fully_connected(network, 512, activation='relu')

        # 7: Dropout layer to combat overfitting
        network = dropout(network, 0.5)

        # Output layer: one softmax unit per fish class (numFish == 8 here)
        network = fully_connected(network, numFish, activation='softmax')
        
        # Configure how the network will be trained
        acc = Accuracy(name="Accuracy")
        
        network = regression(network, optimizer='adam',
                     loss='categorical_crossentropy',
                     learning_rate=0.0005, metric=acc)
        return tflearn.DNN(network)

In [ ]:
# Define the model

model_tf = dnn_test1()

# Start training (apply the gradient descent algorithm). You will typically want
# multiple epochs unless just testing.
model_tf.fit(X_train, Y_train, n_epoch=1,validation_set=(X_valid,Y_valid),
          show_metric=True, batch_size=15)
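
Since log_loss is already imported from sklearn, a natural follow-up (a sketch, assuming the model trained above) is to score the validation set with multi-class log loss:

In [ ]:
# Sketch: predicted class probabilities on the validation set, scored with log loss
preds = model_tf.predict(X_valid)
print('Validation log loss: {:.4f}'.format(log_loss(Y_valid, preds)))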

Model importing


In [7]:
""" Finetuning Example. Using weights from model trained in
convnet_cifar10.py to retrain network for a new task (your own dataset).
All weights are restored except last layer (softmax) that will be retrained
to match the new task (finetuning).
"""

from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression

# Data loading
# Note: substitute here any dataset you would like to finetune on.
# X_train must already exist (run the split cell above first); otherwise this
# cell raises the NameError shown below.

X = X_train
Y = Y_train

num_classes = numFish

# Redefinition of the convnet_cifar10 network
# (its input is 32x32x3, so our images would need resizing to match before
# the restored weights will fit)
network = input_data(shape=[None, 32, 32, 3])
network = conv_2d(network, 32, 3, activation='relu')
network = max_pool_2d(network, 2)
network = dropout(network, 0.75)
network = conv_2d(network, 64, 3, activation='relu')
network = conv_2d(network, 64, 3, activation='relu')
network = max_pool_2d(network, 2)
network = dropout(network, 0.5)
network = fully_connected(network, 512, activation='relu')
network = dropout(network, 0.5)
# Finetuning Softmax layer (Setting restore=False to not restore its weights)
softmax = fully_connected(network, num_classes, activation='softmax', restore=False)
# Assign to a new name so the imported regression function is not shadowed
network = regression(softmax, optimizer='adam',
                     loss='categorical_crossentropy',
                     learning_rate=0.001)

model = tflearn.DNN(network, checkpoint_path='model_finetuning',
                    max_checkpoints=3, tensorboard_verbose=0)
# Load pre-existing model, restoring all weights, except softmax layer ones
model.load('cifar10_cnn')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-f7f050806856> in <module>()
     15 # Note: You input here any dataset you would like to finetune
     16 
---> 17 X = X_train
     18 Y = Y_train
     19 

NameError: name 'X_train' is not defined

In [ ]:
# Start finetuning
model.fit(X, Y, n_epoch=1, validation_set=0.1, shuffle=True,
          show_metric=True, batch_size=4, snapshot_step=200,
          snapshot_epoch=False, run_id='model_finetuning')

model.save('model_finetuning')

Load images


In [ ]:
BUILD_HDF5_DATASET = False
IMAGE_SIZE         = 128
VALIDATION_SPLIT   = True
rawdataPath        = '..'  # assumption: the parent directory holding train/, matching trainPath above
output_path        = join(rawdataPath, 'fish_dataset_{}x{}.h5'.format(IMAGE_SIZE, IMAGE_SIZE))
input_path         = join(rawdataPath, 'train')

if BUILD_HDF5_DATASET:
    # Build an HDF5 dataset (only required once)
    from tflearn.data_utils import build_hdf5_image_dataset

    build_hdf5_image_dataset(target_path        =input_path,
                             image_shape        =(IMAGE_SIZE, IMAGE_SIZE), 
                             mode               ='folder', 
                             output_path        =output_path, 
                             categorical_labels =True, 
                             normalize          =True)
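
Note that mode='folder' infers labels from the subdirectory names under input_path (one folder per class, the same layout get_paths assumes). A quick sketch to confirm the layout before building:

In [ ]:
# Each class should appear as a subdirectory of input_path
print([d for d in listdir(input_path) if isdir(join(input_path, d))])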

In [ ]:
%%time

# Load HDF5 dataset
import h5py

h5f         = h5py.File(output_path, 'r')
X_all       = h5f['X'][()]
Y_all       = h5f['Y'][()]

# Split into training and validation sets
if VALIDATION_SPLIT:
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_all, Y_all, 
                                                          test_size    =0.2, 
                                                          random_state =23, 
                                                          stratify     =Y_all)
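
A final sanity check (a sketch): the arrays should come back as (N, 128, 128, 3) images with one-hot labels, and the HDF5 handle can be closed once they are in memory.

In [ ]:
print(X_all.shape, Y_all.shape)
h5f.close()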