In [1]:
# import tflearn packages
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression
from tflearn.data_utils import shuffle
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation
import tensorflow as tf
# import other packages
import os
from os import getcwd, listdir
from os.path import isfile, isdir, join
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from PIL import Image
import skimage
from skimage import io, measure
from skimage.transform import resize
import matplotlib.pyplot as plt
%matplotlib inline
from keras.utils import np_utils
In [24]:
# things to set
trainPath = '../data/processed/padded'
testPath = '../data/processed/test/padded'
outname = 'submission_170310_padded_clipped.csv'
# NOTE: the padded images are black and white now, so the channel handling may need adjusting
# (one possible approach is sketched after the read_image cell below)
ROWS = 90       # downsampled from the original 720
COLS = 160      # downsampled from the original 1280
CHANNELS = 3
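In [ ]:
# Small sanity check (an addition, not part of the original run): confirm the configured
# directories actually exist before any work is done, since every cell below assumes them.
for p in (trainPath, testPath):
    assert isdir(p), 'missing directory: {}'.format(p)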
In [25]:
# import images: build a dict mapping each class folder to its list of file paths
def get_paths(foldNames):
    paths = dict.fromkeys(foldNames)
    for idx, g in enumerate(foldNames):
        fileNames = [f for f in listdir(join(trainPath, g)) if isfile(join(trainPath, g, f))]
        for i, f in enumerate(fileNames):
            fileNames[i] = join(trainPath, g, f)
        paths[g] = fileNames
    return paths

fish_classes = [f for f in listdir(trainPath) if isdir(join(trainPath, f))]
groupData = pd.DataFrame({'group': fish_classes})
fish_paths = get_paths(fish_classes)
# remove mac added files: drop any .DS_Store entries from each class's file list
# (the original key-deletion check never matched, because the lists hold full joined paths)
for key in fish_paths:
    fish_paths[key] = [f for f in fish_paths[key] if '.DS_Store' not in f]
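In [ ]:
# An alternative sketch for the mac-file problem above: restrict the listing to image files by
# extension instead of filtering .DS_Store afterwards. get_image_paths is a hypothetical variant,
# not the function used elsewhere in this notebook.
def get_image_paths(foldNames, extensions=('.jpg', '.jpeg', '.png')):
    paths = {}
    for g in foldNames:
        paths[g] = [join(trainPath, g, f) for f in listdir(join(trainPath, g))
                    if isfile(join(trainPath, g, f)) and f.lower().endswith(extensions)]
    return paths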
In [26]:
# label images by directory
for idx, fish in enumerate(fish_classes):
    groupData.loc[idx, 'num files'] = int(len(fish_paths[fish]))
files = []
Y_cat = []
for fish in fish_classes:
    fish_files = fish_paths[fish]
    files.extend(fish_files)
    y_fish = np.tile(fish, len(fish_files))
    Y_cat.extend(y_fish)
# remove mac added files (delete in reverse order so earlier deletions don't shift later indices)
is_to_remove = [i for i in range(len(files)) if '.DS_Store' in files[i]]
for i in sorted(is_to_remove, reverse=True):
    del files[i]
    del Y_cat[i]
# change to numpy array
Y_cat = np.array(Y_cat)
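In [ ]:
# Quick sanity check (an addition, not in the original run): look at how many labelled files each
# class contributes, since unbalanced class counts are what the stratified split below has to cope with.
print(pd.Series(Y_cat).value_counts())
print(groupData)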
In [27]:
# downsample images
def read_image(src):
    """Read and resize individual images"""
    im = io.imread(src)
    im = resize(im, (ROWS, COLS))
    return im

# preallocate the array that will hold every downsampled image
X_all = np.ndarray((len(files), ROWS, COLS, CHANNELS), dtype=float)
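In [ ]:
# A rough sketch addressing the "black and white" note in the settings cell: if the padded images
# are single-channel, io.imread returns a 2-D array and the assignment into the (ROWS, COLS, CHANNELS)
# array would fail. read_image_grey is a hypothetical variant that stacks the grey channel three times
# so CHANNELS = 3 keeps working; it is not the function used in the runs recorded at the end.
def read_image_grey(src):
    """Read an image, resize it, and guarantee a (ROWS, COLS, 3) result."""
    im = resize(io.imread(src), (ROWS, COLS))
    if im.ndim == 2:                        # greyscale: duplicate the single channel
        im = np.stack([im, im, im], axis=-1)
    return im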
In [28]:
for i, f in enumerate(files):
    im = read_image(f)
    X_all[i] = im
    if i % 1000 == 0: print('Processed {} of {}'.format(i, len(files)))
##view example image
image = X_all[0]
plt.figure(figsize=(5, 5))
plt.imshow(image, cmap='gray', interpolation='nearest')
plt.axis('off')
plt.tight_layout()
plt.show()
In [29]:
# split into train and validation sets for cross-validation
# One-hot encode the labels:
# transform the categorical array Y_cat into a matrix with one boolean column per class.
Y_all = LabelEncoder().fit_transform(Y_cat)
Y_all = np_utils.to_categorical(Y_all)
# test_size: between 0 and 1; the proportion of the dataset to include in the validation split
# random_state: seed for the pseudo-random number generator used for sampling (any fixed value
#               makes the split reproducible)
# stratify: ensures the split datasets are balanced, i.e. contain the same percentage of each class
X_train, X_valid, Y_train, Y_valid = train_test_split(X_all, Y_all,
                                                      test_size=0.2, random_state=23,
                                                      stratify=Y_all)
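In [ ]:
# Quick check (an addition, not in the original run) that the stratified split preserved the class
# proportions: the per-class fraction of samples should be nearly identical in both splits.
print('train class fractions:', Y_train.mean(axis=0))
print('valid class fractions:', Y_valid.mean(axis=0))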
In [30]:
# read in the test photo set now so it's available regardless of whether I run dnn1 or dnn2
test_files = [im for im in os.listdir(testPath)]
test = np.ndarray((len(test_files), ROWS, COLS, CHANNELS), dtype=float)
for i, im in enumerate(test_files):
    test[i] = read_image(join(testPath, im))
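In [ ]:
# The test directory can contain the same mac-added hidden files as the training folders; a sketch
# of a safer listing (not the one used above) that keeps only regular, non-hidden files.
test_files_clean = [im for im in os.listdir(testPath)
                    if isfile(join(testPath, im)) and not im.startswith('.')]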
In [31]:
def dnn_test2():
    # The network has to be built inside a fresh tf.Graph each time; rebuilding the model without
    # resetting the graph produces errors. (Restarting the kernel also works, but that's annoying.)
    with tf.Graph().as_default():
        # Building convolutional network.
        # Input layer with the shape of the data: rows x cols x channels. The leading 'None' is
        # the unknown batch dimension, i.e. the number of samples processed per batch.
        network = input_data(shape=[None, ROWS, COLS, CHANNELS])
        network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
        network = max_pool_2d(network, 2)
        network = local_response_normalization(network)
        network = conv_2d(network, 64, 3, activation='relu', regularizer="L2")
        network = max_pool_2d(network, 2)
        network = local_response_normalization(network)
        network = fully_connected(network, 128, activation='tanh')
        network = dropout(network, 0.8)
        network = fully_connected(network, 256, activation='tanh')
        network = dropout(network, 0.8)
        network = fully_connected(network, 8, activation='softmax')
        network = regression(network, optimizer='adam', learning_rate=0.01,
                             loss='categorical_crossentropy', name='target')
        return tflearn.DNN(network)

# Test on the held-out validation images first (see the validation sketch after this cell);
# if it works well, do a full train/test run and submit.
# Define model
model2 = dnn_test2()
# Start training (applies the Adam optimizer). Typically you want more than one epoch unless
# you're just testing.
model2.fit(X_train, Y_train, n_epoch=10,
           show_metric=True, batch_size=16)
# model predict
test_preds2 = model2.predict(test)
submission2 = pd.DataFrame(test_preds2, columns=fish_classes)
submission2.insert(0, 'image', test_files)
print(submission2.head())
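In [ ]:
# A sketch of the "test on images you didn't use" idea from the cell above: score the model on the
# held-out validation split with sklearn's log_loss (imported at the top but otherwise unused),
# which is the metric the clipped submissions below are aimed at. Assumes every class appears in
# the validation split, which the stratified split should guarantee.
valid_preds2 = np.array(model2.predict(X_valid))
print('validation log loss: {:.4f}'.format(log_loss(Y_valid.argmax(axis=1), valid_preds2)))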
In [32]:
# clip the predicted probabilities to between 0.2 and 0.8
# (clip only the class columns; np.clip on the whole frame would also hit the 'image' column)
sub3 = submission2.copy()
sub3[fish_classes] = sub3[fish_classes].clip(0.2, 0.8)
print(sub3.head())
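In [ ]:
# A possible refinement of the clipping above (a sketch, not what was submitted): renormalise each
# row after clipping so the class probabilities still sum to 1, which may help the log-loss score
# compared with leaving the clipped rows un-normalised.
sub3_norm = sub3.copy()
sub3_norm[fish_classes] = sub3_norm[fish_classes].div(sub3_norm[fish_classes].sum(axis=1), axis=0)
print(sub3_norm.head())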
In [33]:
# write without the pandas index so the file contains only the image and class columns
sub3.to_csv('../data/processed/tflearn/' + outname, index=False)
In [16]:
# for some reason it keeps coming up with the same prediction for everything; did I break something somewhere?
df = pd.read_csv('../data/processed/tflearn/submission_170224_padded.csv')
class_cols = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
dfa = df[class_cols].values
dfa_clip = np.clip(dfa, 0.2, 0.8)
In [19]:
dfp = pd.DataFrame(dfa_clip, columns=class_cols)
dfp.to_csv('../data/processed/tflearn/clipped_submission_170224_padded.csv')
# hmm, something about using clip is making it the same prediction for everything
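In [ ]:
# A quick diagnostic sketch for the "same prediction for everything" problem noted above: if the
# raw (unclipped) probabilities already have near-zero spread per column, the model itself collapsed
# to a constant output and the clipping is not to blame.
print('per-class std of raw predictions:')
print(df[class_cols].std())
print('number of distinct prediction rows:', df[class_cols].round(4).drop_duplicates().shape[0])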
In [34]:
###Ten Epochs###
#Training Step: 1889 | total loss: 1.87054 | time: 116.244s
#| Adam | epoch: 010 | loss: 1.87054 - acc: 0.4134 -- iter: 3008/3021
#Training Step: 1890 | total loss: 1.83847 | time: 116.848s
#| Adam | epoch: 010 | loss: 1.83847 - acc: 0.4221 -- iter: 3021/3021
#--
###20 Epochs - why did it get worse?###
#Training Step: 3779 | total loss: 2.24260 | time: 139.508s
#| Adam | epoch: 020 | loss: 2.24260 - acc: 0.2229 -- iter: 3008/3021
#Training Step: 3780 | total loss: 2.22141 | time: 140.204s
#| Adam | epoch: 020 | loss: 2.22141 - acc: 0.2193 -- iter: 3021/3021
#--
###10 epochs - learning rate 0.001###
#Training Step: 1889 | total loss: 0.08010 | time: 121.254s
#| Adam | epoch: 010 | loss: 0.08010 - acc: 0.9808 -- iter: 3008/3021
#Training Step: 1890 | total loss: 0.07451 | time: 121.887s
#| Adam | epoch: 010 | loss: 0.07451 - acc: 0.9827 -- iter: 3021/3021
#--
###adding two extra layers, and upping nodes accordingly
#Training Step: 1889 | total loss: 5.01305 | time: 239.112s
#| Adam | epoch: 010 | loss: 5.01305 - acc: 0.3373 -- iter: 3008/3021
#Training Step: 1890 | total loss: 5.00190 | time: 240.249s
#| Adam | epoch: 010 | loss: 5.00190 - acc: 0.3348 -- iter: 3021/3021
#--
###ten epochs, padded hist matched
#Training Step: 1889 | total loss: 1.86504 | time: 130.465s
#| Adam | epoch: 010 | loss: 1.86504 - acc: 0.3812 -- iter: 3008/3021
#Training Step: 1890 | total loss: 1.88255 | time: 131.165s
#| Adam | epoch: 010 | loss: 1.88255 - acc: 0.3806 -- iter: 3021/3021
#--
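In [ ]:
# A sketch (not one of the runs logged above) aimed at the "why did it get worse?" question: pass
# the held-out split as validation_set so tflearn reports validation accuracy alongside training
# accuracy each epoch. Whether the 0.98 training accuracy from the learning_rate=0.001 run would
# generalise is exactly what the validation numbers would show (the rate itself is set inside
# dnn_test2 and would need changing there).
model3 = dnn_test2()
model3.fit(X_train, Y_train, n_epoch=10,
           validation_set=(X_valid, Y_valid),
           show_metric=True, batch_size=16)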