In [2]:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential, Model, load_model, model_from_yaml
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
import tensorflow as tf

import os
import matplotlib.pyplot as plt
from PIL import Image
from ipywidgets import interact
from scipy.misc import imresize, imsave
from shutil import copyfile
from skimage.segmentation import mark_boundaries
from sklearn.model_selection import train_test_split

import numpy as np
import random
import glob
import cv2
import datetime
import pandas as pd
import time
import h5py
import csv


Using TensorFlow backend.
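Note: scipy.misc.imresize and scipy.misc.imsave were removed in newer SciPy releases, so the import above only works with older SciPy versions. A minimal, hedged sketch of a Pillow-based replacement (the helper names here are my own, not part of the original notebook):

# Drop-in helpers using Pillow instead of the removed scipy.misc functions
from PIL import Image
import numpy as np

def resize_image(arr, size):
    """Resize an (H, W, 3) uint8 array to size = (width, height)."""
    return np.array(Image.fromarray(arr).resize(size))

def save_image(path, arr):
    """Save an (H, W, 3) uint8 array to disk."""
    Image.fromarray(arr).save(path)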

In [28]:
%matplotlib inline
df = pd.read_csv('/home/pramit/projects/data/whale_identification/train.csv')
print(df.head(4))
X = df.Image
y = df.Id
n_classes = len(np.unique(df.Id))
print("Number of unique classes: {}".format(n_classes))


           Image         Id
0  0000e88ab.jpg  w_f48451c
1  0001f9222.jpg  w_c3d896a
2  00029d126.jpg  w_20df2c5
3  00050a15a.jpg  new_whale
Number of unique classes: 5005
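With 5005 distinct IDs the labels are heavily imbalanced, and many whales appear to have only a handful of images (the new_whale placeholder is typically the most frequent label in this dataset). A quick check on the same DataFrame:

# Inspect how skewed the label distribution is
counts = df.Id.value_counts()
print(counts.head(5))                          # most frequent IDs
print((counts == 1).sum(), "IDs with a single image")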

In [37]:
for i in X:
    if '76b7c02d3' == str(i).split('.')[0]:
        print(i)


76b7c02d3.jpg
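The same lookup can be done without an explicit loop, using the pandas string accessor:

# Vectorized equivalent of the loop above
print(X[X.str.startswith('76b7c02d3')].tolist())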

In [4]:
def browse_images(dir_path, index_number=None, define_range=10):
    # Read the csv file to extract info about the images
    list_of_images = pd.read_csv("{}.csv".format(dir_path)).Image
    n = len(list_of_images)
    if define_range > n:
        raise ValueError("out of range")
    
    def view_image(index):
        # all images are of shape (700, 1050, 3)
        im = Image.open('{}/{}'.format(dir_path, list_of_images[index]))
        plt.imshow(im, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.show()
    # Browse up to index_number images if given, otherwise up to define_range
    upper_bound = index_number if index_number else define_range
    interact(view_image, index=(0, upper_bound))
    

def browse_img_dir(dir_path, define_range=10):
    list_of_images = os.listdir(dir_path)
    n = len(list_of_images)
    if define_range > n:
        raise ValueError("out of range")
    def view_image(index):
        im = Image.open('{}{}'.format(dir_path, list_of_images[index]))
        plt.imshow(im, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.show()
    interact(view_image, index=(0, define_range))

Quick Exploration of the images


In [5]:
browse_images('/home/pramit/projects/data/whale_identification/train', index_number=3, define_range=df.shape[0])



In [6]:
# dimensions of the input images
img_width, img_height = 700, 1050

if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)
print(input_shape)


(700, 1050, 3)
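The raw Kaggle images are not guaranteed to all share this shape, so the (700, 1050) assumption is worth verifying; a small sketch that samples a few files and prints their actual dimensions (same training path as above):

# Sample a few training images and report their true (width, height)
train_dir = '/home/pramit/projects/data/whale_identification/train'
for name in random.sample(os.listdir(train_dir), 5):
    with Image.open(os.path.join(train_dir, name)) as im:
        print(name, im.size)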

In [42]:
105*7


Out[42]:
735

Model Architecture


In [7]:
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# A single sigmoid output unit pairs with binary_crossentropy;
# categorical_crossentropy would need one softmax unit per class.
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
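With 700x1050 inputs, the Flatten layer sees roughly 85x129x64 activations, so the first Dense layer alone holds on the order of 45 million weights; printing the summary makes the layer shapes and parameter counts explicit:

# Print per-layer output shapes and parameter counts
model.summary()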

Data Preparation

1. Train set
2. Validation set


In [8]:
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.33, random_state=42)
X_train = X_train[0:1000]
y_train = y_train[0:1000]

X_validate = X_validate[0:500]
y_validate = y_validate[0:500]
print("Train shape: {}".format(X_train.shape[0]))
print("Validate shape: {}".format(X_validate.shape[0]))


Train shape: 1000
Validate shape: 500
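Because the split is random (not stratified) and only the first 1000/500 rows are kept, many validation IDs may have no example at all in the training subset; a quick check:

# How many validation labels also appear in the training subset?
train_ids = set(y_train)
validate_ids = set(y_validate)
print("{} of {} validation IDs appear in the training subset".format(
    len(validate_ids & train_ids), len(validate_ids)))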

In [14]:
# Create sub-directories and copy the respective files so that the Keras generators can read them
x_train_index = X_train.index
print("Number of training samples: {}".format(len(x_train_index)))
x_validate_index = X_validate.index
print("Number of validation samples: {}".format(len(x_validate_index)))

x_train_image_names = list(X.iloc[x_train_index])
x_validate_image_names = list(X.iloc[x_validate_index])


import errno
directory_train = "/home/pramit/projects/data/whale_identification/train_set"
directory_validate = "/home/pramit/projects/data/whale_identification/validate_set"

def create_folder(directory_name):
    try:
        os.makedirs(directory_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
            
# For train set
create_folder(directory_train)
# For validate set
create_folder(directory_validate)


Number of training samples: 1000
Number of validation samples: 500

In [15]:
# Copy respective files to train and validate location
def copy_files(image_name_list, src="/home/pramit/projects/data/whale_identification/train/", dst=None):
    for item in image_name_list:
        image_name = item
        copyfile(src+image_name, dst+image_name)
        
copy_files(x_train_image_names, dst="/home/pramit/projects/data/whale_identification/train_set/")
copy_files(x_validate_image_names, dst="/home/pramit/projects/data/whale_identification/validate_set/")
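Note that flow_from_directory infers class labels from subdirectory names, which is why the generators below report "belonging to 1 classes": every image sits directly in one folder. A hedged sketch of how the copy step could instead create one subfolder per whale Id (the helper name is mine; it reuses create_folder and the paths defined above):

# Copy each image into a per-Id subfolder so flow_from_directory can infer labels
def copy_files_by_class(image_names, labels,
                        src="/home/pramit/projects/data/whale_identification/train/",
                        dst=None):
    for image_name, label in zip(image_names, labels):
        class_dir = os.path.join(dst, label)
        create_folder(class_dir)
        copyfile(os.path.join(src, image_name), os.path.join(class_dir, image_name))

# Example: copy_files_by_class(x_train_image_names, list(y_train), dst=directory_train)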

In [16]:
create_folder("/home/pramit/projects/data/whale_identification/train_set/preview")
batch_size = 16

datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

img = load_img('/home/pramit/projects/data/whale_identification/train_set/983f6c9db.jpg')  # this is a PIL image
x = img_to_array(img)  # this is a NumPy array with shape (height, width, 3)
x = x.reshape((1,) + x.shape)  # this is a NumPy array with shape (1, height, width, 3)

# the .flow() command below generates batches of randomly transformed images
# and saves the results to the `preview/` directory
i = 0
for batch in datagen.flow(x, batch_size=1,
                          save_to_dir='/home/pramit/projects/data/whale_identification/train_set/preview', save_format='jpeg'):
    i += 1
    if i > 20:
        break

In [17]:
browse_img_dir("/home/pramit/projects/data/whale_identification/train_set/preview/")



In [20]:
generator = datagen.flow_from_directory(
        '/home/pramit/projects/data/whale_identification/train_set',
        target_size=(700, 1050),
        batch_size=batch_size,
        class_mode=None,  # this means our generator will only yield batches of data, no labels
        shuffle=False)    # keep a fixed order so the features can be matched back to filenames/labels


Found 1000 images belonging to 1 classes.

In [45]:
len(generator)


Out[45]:
63
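This 63 is just ceil(1000 / 16), the number of batches needed to cover every image once; it can be computed instead of hard-coded:

# Number of batches needed to see each image exactly once
import math
steps_train = int(math.ceil(generator.samples / float(batch_size)))
print(steps_train)  # 63 for 1000 images with batch_size 16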

In [ ]:
bottleneck_features_train = model.predict_generator(generator, steps=63, use_multiprocessing=True)

In [44]:
bottleneck_features_train[0]


Out[44]:
array([0.4965581], dtype=float32)

In [23]:
import numpy as np
np.save('/home/pramit/projects/data/whale_identification/bottleneck_features_train.npy', bottleneck_features_train)

In [24]:
generator_validate = datagen.flow_from_directory(
        '/home/pramit/projects/data/whale_identification/validate_set',
        target_size=(700, 1050),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)


Found 500 images belonging to 1 classes.

In [ ]:
# 500 validation images / batch_size 16 -> 32 batches; a larger steps value would loop the generator past the data
bottleneck_features_validate = model.predict_generator(generator_validate, steps=32, use_multiprocessing=True)

In [27]:
np.save('/home/pramit/projects/data/whale_identification/bottleneck_features_validate.npy', 
        bottleneck_features_validate)

In [40]:
# Load the training data
train_data = np.load('/home/pramit/projects/data/whale_identification/bottleneck_features_train.npy')
print(train_data.shape)


(15880, 1)
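A natural next step in this bottleneck-feature workflow is to train a small fully-connected model on the saved arrays; a minimal sketch, assuming label arrays (train_labels, validate_labels, not built above) have been prepared in the same order as the generators produced the images:

# Small top model trained on the saved bottleneck features (sketch only)
validation_data = np.load('/home/pramit/projects/data/whale_identification/bottleneck_features_validate.npy')

top_model = Sequential()
top_model.add(Dense(256, activation='relu', input_shape=train_data.shape[1:]))
top_model.add(Dropout(0.5))
top_model.add(Dense(n_classes, activation='softmax'))
top_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# train_labels / validate_labels are hypothetical one-hot arrays aligned with the feature rows
# top_model.fit(train_data, train_labels,
#               epochs=50, batch_size=batch_size,
#               validation_data=(validation_data, validate_labels))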

In [ ]: