In [1]:
#import tflearn packages

import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression

import tensorflow as tf

from tflearn.data_utils import shuffle
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation

#import other packages
import pandas as pd
import sklearn
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from skimage import measure
%matplotlib inline

from keras.utils import np_utils

from os import getcwd, listdir
from os.path import isfile, join, isdir
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from skimage.transform import resize


Using TensorFlow backend.

In [2]:
#things to set
trainPath = '../data/processed/gray'
testPath = '../data/processed/test/gray'
outname = 'submission_170225_gray.csv'

##NOTE: channels need adjusting because the images are grayscale now
##(see the channel fix after the ValueError below)

ROWS = 90   # original: 720
COLS = 160  # original: 1280
CHANNELS = 3

In [3]:
#import images

def get_paths(foldNames):
  
    paths = dict.fromkeys(foldNames)

    for idx,g in enumerate(foldNames):
        fileNames = [f for f in listdir(join(trainPath,g)) if isfile(join(trainPath,g, f))]
        for i,f in enumerate(fileNames):
            fileNames[i] = join(trainPath,g,f)     
        paths[g] = fileNames
        
    return paths

fish_classes = [f for f in listdir(trainPath) if isdir(join(trainPath, f))]
groupData = pd.DataFrame({'group': fish_classes})
fish_paths = get_paths(fish_classes)

#remove Mac-added files (.DS_Store) from each class's path list
for key in fish_paths:
    fish_paths[key] = [p for p in fish_paths[key] if '.DS_Store' not in p]

In [4]:
#label images by directory

for idx,fish in enumerate(fish_classes):
    groupData.loc[idx,'num files'] = int(len(fish_paths[fish]))
    
files = []
Y_cat = []

for fish in fish_classes:
    fish_files = fish_paths[fish]
    files.extend(fish_files)
    
    y_fish = np.tile(fish, len(fish_files))
    Y_cat.extend(y_fish)


#remove Mac-added files; delete from the end so earlier indices stay valid
is_to_remove = [i for i, f in enumerate(files) if '.DS_Store' in f]

for i in reversed(is_to_remove):
    del files[i]
    del Y_cat[i]
        
#change to numpy array
Y_cat = np.array(Y_cat)

In [5]:
#downsample images
def read_image(src):
    """Read and resize individual images"""
    im = io.imread(src)
    im = resize(im, (ROWS, COLS))
    return im

#preallocate the image array
X_all = np.ndarray((len(files), ROWS, COLS, CHANNELS), dtype=float)

In [6]:
for i, f in enumerate(files): 
    im = read_image(f)
    X_all[i] = im
    if i%1000 == 0: print('Processed {} of {}'.format(i, len(files)))

##view example image
image = X_all[0]
plt.figure(figsize=(5, 5))
plt.imshow(image, cmap='gray', interpolation='nearest')
plt.axis('off')
plt.tight_layout()
plt.show()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-13141d03264d> in <module>()
      1 for i, f in enumerate(files):
      2     im = read_image(f)
----> 3     X_all[i] = im
      4     if i%1000 == 0: print('Processed {} of {}'.format(i, len(files)))
      5 

ValueError: could not broadcast input array from shape (90,160,4) into shape (90,160,1)
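
The broadcast error is a channel mismatch: io.imread returns RGBA images as (rows, cols, 4) and pure grayscale as 2-D arrays, while X_all expects exactly CHANNELS channels (the traceback suggests CHANNELS was 1 on this run, even though the cell above now says 3). A channel-safe reader along these lines should fix it (a sketch; read_image_fixed is a hypothetical replacement for read_image):

In [ ]:
from skimage.color import gray2rgb, rgb2gray

def read_image_fixed(src):
    """Read an image and coerce it to (ROWS, COLS, CHANNELS)."""
    im = io.imread(src)
    if im.ndim == 3 and im.shape[2] == 4:
        im = im[:, :, :3]            #drop the alpha channel of RGBA images
    if CHANNELS == 3 and im.ndim == 2:
        im = gray2rgb(im)            #replicate gray into three channels
    elif CHANNELS == 1:
        if im.ndim == 3:
            im = rgb2gray(im)        #collapse RGB to a single gray channel
        im = im[:, :, np.newaxis]    #(rows, cols) -> (rows, cols, 1)
    return resize(im, (ROWS, COLS))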

In [ ]:
#split into train and validation sets for cross-validation
# One Hot Encoding Labels
#    Transform the categorical array Y_cat into a matrix of the same height,
#    but with a boolean column for each category.
Y_all = LabelEncoder().fit_transform(Y_cat)
Y_all = np_utils.to_categorical(Y_all)
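
# A quick illustration with hypothetical labels (not this dataset's classes):
#   LabelEncoder().fit_transform(['ALB', 'BET', 'ALB']) -> array([0, 1, 0])
#   np_utils.to_categorical([0, 1, 0]) -> [[1., 0.], [0., 1.], [1., 0.]]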

# test_size: between 0 and 1; the proportion of the dataset to include in the validation split
# random_state: seed for the pseudo-random sampler; any fixed integer makes the split
# reproducible, and the particular value doesn't matter
# stratify: ensures the split datasets are balanced, i.e. each contains the same
# percentage of every class

X_train, X_valid, Y_train, Y_valid = train_test_split(X_all, Y_all, 
                                                    test_size=0.2, random_state=23, 
                                                    stratify=Y_all)
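
#quick sanity check on stratify (a sketch): column means of the one-hot labels
#are per-class fractions, so the two splits should print nearly identical rows
print(Y_train.mean(axis=0))
print(Y_valid.mean(axis=0))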

In [ ]:
# read in the test photo set now so it's available regardless of whether I run dnn1 or dnn2
test_files = [im for im in os.listdir(testPath) if not im.startswith('.')]
test = np.ndarray((len(test_files), ROWS, COLS, CHANNELS), dtype=float)
for i, im in enumerate(test_files): 
    test[i] = read_image(join(testPath,im))

In [ ]:
#build a simple CNN

def dnn_test1():
    #this tf.Graph context is needed to build the network (and the model)
    #multiple times: rebuilding without resetting the tf.Graph object raises
    #errors. Restarting the kernel also works around it, but that's annoying.
    with tf.Graph().as_default():

        #input layer with the shape of the data: rows x cols x channels.
        #The initial 'None' is an unknown dimension for the number of samples
        #processed in a batch.

        # Building convolutional network

        net = input_data(shape=[None, ROWS, COLS, CHANNELS])
        net = conv_2d(net, 32, 3, activation='relu', regularizer="L2")
        net = max_pool_2d(net, 2)
        net = local_response_normalization(net)
        net = conv_2d(net, 64, 3, activation='relu', regularizer="L2")
        net = max_pool_2d(net, 2)
        net = local_response_normalization(net)
        net = fully_connected(net, 72, activation='relu')
        net = fully_connected(net, 8, activation='softmax')
        net = regression(net)
        return tflearn.DNN(net)

#test on the held-out training images; if it works well, do a full train/test run and submit
# Define model
model = dnn_test1()

# Start training (apply gradient descent algorithm). Will want to specify multiple epochs 
# typically unless just testing
model.fit(X_train, Y_train, n_epoch=10,
          show_metric=True, batch_size=16, validation_set = (X_valid, Y_valid))
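
#optional sanity check (a sketch): score the held-out split with sklearn's
#log_loss, the competition metric, before predicting on the test set
valid_preds = model.predict(X_valid)
print('validation log loss: {:.4f}'.format(log_loss(Y_valid, valid_preds)))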

#model predict

test_preds1 = model.predict(test)

submission = pd.DataFrame(test_preds1, columns=fish_classes)
submission.insert(0, 'image', test_files)
submission.head()

#index=False keeps the pandas index out of the submission file
submission.to_csv('../data/processed/tflearn/' + outname, index=False)
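
#log loss punishes confident mistakes hard, so Kaggle entries often clip
#predictions away from 0 and 1 (an optional tweak, sketched here; the
#0.02/0.98 bounds are guesses, not tuned values)
clipped = np.clip(test_preds1, 0.02, 0.98)
clipped = clipped / clipped.sum(axis=1, keepdims=True)  #renormalize each row to 1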