In [1]:
#import tflearn packages
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression
from tflearn.data_utils import shuffle
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
import tensorflow as tf
#import other packages
import pandas as pd
import numpy as np
import os
from os import getcwd, listdir
from os.path import isfile, join, isdir
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline
import skimage
from skimage import io, measure
from skimage.transform import resize
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
In [2]:
#things to set
trainPath = '../data/processed/gray'
testPath = '../data/processed/test/gray'
outname = 'submission_170225_gray.csv'
#the images are grayscale now, so the channel count may need adjusting (see the sketch after this cell)
ROWS = 90    #90 720
COLS = 160   #160 1280
CHANNELS = 3
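One way to handle the grayscale note above (a sketch, not part of the original pipeline; the helper name to_channels is made up): either switch CHANNELS to 1 and add an explicit channel axis, or keep CHANNELS = 3 and replicate the single gray plane before filling X_all.
def to_channels(gray_im, channels=CHANNELS):
    """Hypothetical helper: ensure a grayscale image has an explicit channel axis."""
    if gray_im.ndim == 2:
        # replicate the single gray plane so the image matches (ROWS, COLS, channels)
        gray_im = np.stack([gray_im] * channels, axis=-1)
    return gray_im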
In [3]:
#import images
def get_paths(foldNames):
    """Collect full file paths for each class folder under trainPath."""
    paths = dict.fromkeys(foldNames)
    for idx, g in enumerate(foldNames):
        fileNames = [f for f in listdir(join(trainPath, g)) if isfile(join(trainPath, g, f))]
        for i, f in enumerate(fileNames):
            fileNames[i] = join(trainPath, g, f)
        paths[g] = fileNames
    return paths

fish_classes = [f for f in listdir(trainPath) if isdir(join(trainPath, f))]
groupData = pd.DataFrame({'group': fish_classes})
fish_paths = get_paths(fish_classes)
#remove mac added files (.DS_Store) from each class's file list
for key in fish_paths:
    fish_paths[key] = [f for f in fish_paths[key] if '.DS_Store' not in f]
In [4]:
#label images by directory
for idx, fish in enumerate(fish_classes):
    groupData.loc[idx, 'num files'] = int(len(fish_paths[fish]))

files = []
Y_cat = []
for fish in fish_classes:
    fish_files = fish_paths[fish]
    files.extend(fish_files)
    y_fish = np.tile(fish, len(fish_files))
    Y_cat.extend(y_fish)

#remove mac added files (delete in reverse so earlier indices stay valid)
is_to_remove = [i for i in range(len(files)) if '.DS_Store' in files[i]]
for i in reversed(is_to_remove):
    del files[i]
    del Y_cat[i]

#change to numpy array
Y_cat = np.array(Y_cat)
In [5]:
#downsample images
def read_image(src):
    """Read and resize an individual image to ROWS x COLS"""
    im = io.imread(src)
    im = resize(im, (ROWS, COLS))
    return im

#preallocate a numpy array to hold every resized image
X_all = np.ndarray((len(files), ROWS, COLS, CHANNELS), dtype=float)
In [6]:
for i, f in enumerate(files):
    im = read_image(f)
    X_all[i] = im
    if i % 1000 == 0:
        print('Processed {} of {}'.format(i, len(files)))

##view example image
image = X_all[0]
plt.figure(figsize=(5, 5))
plt.imshow(image, cmap='gray', interpolation='nearest')
plt.axis('off')
plt.tight_layout()
plt.show()
In [ ]:
#split into train and test for cross-validation
# One-hot encode labels:
# transform the categorical array Y_cat into a matrix with one row per sample
# and a boolean column for each category.
Y_all = LabelEncoder().fit_transform(Y_cat)
Y_all = np_utils.to_categorical(Y_all)
# test_size: between 0 and 1, the proportion of the dataset to include in the validation split
# random_state: seed for the pseudo-random number generator used for sampling. How to choose this?
# stratify: ensures the split datasets are balanced, i.e. contain the same
#           percentage of each class
X_train, X_valid, Y_train, Y_valid = train_test_split(X_all, Y_all,
                                                      test_size=0.2, random_state=23,
                                                      stratify=Y_all)
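As a quick illustration of what the two encoding steps above produce (a toy sketch with example label strings, not run on the real data):
toy_labels = np.array(['ALB', 'BET', 'ALB', 'DOL'])   # example class strings
toy_int = LabelEncoder().fit_transform(toy_labels)    # -> array([0, 1, 0, 2])
toy_onehot = np_utils.to_categorical(toy_int)         # one boolean-style column per class:
# [[1., 0., 0.],
#  [0., 1., 0.],
#  [1., 0., 0.],
#  [0., 0., 1.]]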
In [ ]:
# read in the test photo set now so it's there regardless of whether I run dnn1 or dnn2
test_files = [im for im in os.listdir(testPath)]
test = np.ndarray((len(test_files), ROWS, COLS, CHANNELS), dtype=float)
for i, im in enumerate(test_files):
    test[i] = read_image(join(testPath, im))
In [ ]:
#build a simple CNN
def dnn_test1():
    #need to run this inside a fresh tf.Graph in order to build the network and subsequently
    #create the model, multiple times. Rebuilding without resetting the tf.Graph object produces
    #errors. Could also get around this issue by restarting the kernel, but that's annoying.
    with tf.Graph().as_default():
        #input layer with the shape of the data specified: in this case the dimensions of our images,
        #rows x cols x channels. The initial 'None' is for an unknown dimension reflecting the
        #number of samples processed in a batch.
        # Building convolutional network
        net = input_data(shape=[None, ROWS, COLS, CHANNELS])
        net = conv_2d(net, 32, 3, activation='relu', regularizer="L2")
        net = max_pool_2d(net, 2)
        net = local_response_normalization(net)
        net = conv_2d(net, 64, 3, activation='relu', regularizer="L2")
        net = max_pool_2d(net, 2)
        net = local_response_normalization(net)
        net = fully_connected(net, 72, activation='relu')
        net = fully_connected(net, 8, activation='softmax')
        net = regression(net)
        return tflearn.DNN(net)

#test on a subset of training images not used for fitting; if it works well, do a full train:test and submit
# Define model
model = dnn_test1()
# Start training (apply gradient descent). Will typically want to specify multiple epochs
# unless just testing
model.fit(X_train, Y_train, n_epoch=10,
          show_metric=True, batch_size=16, validation_set=(X_valid, Y_valid))
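# Optional sanity check before predicting on the test set (a sketch, not part of the
# original run): score the held-out split with sklearn's log_loss, which is imported above.
valid_preds = np.array(model.predict(X_valid))
print('validation log loss: {:.4f}'.format(
    log_loss(np.argmax(Y_valid, axis=1), valid_preds, labels=list(range(len(fish_classes))))))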
#model predict
test_preds1 = model.predict(test)
submission = pd.DataFrame(test_preds1, columns=fish_classes)
submission.insert(0, 'image', test_files)
submission.head()
submission.to_csv('../data/processed/tflearn/' + outname, index=False)  # index=False so only image and class-probability columns are written