In [2]:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential, Model, load_model, model_from_yaml
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from PIL import Image
from ipywidgets import interact
from scipy.misc import imresize, imsave  # note: deprecated and removed in newer SciPy releases; needs an older scipy (or use PIL/imageio instead)
from shutil import copyfile
from skimage.segmentation import mark_boundaries
from sklearn.model_selection import train_test_split
import numpy as np
import random
import glob
import cv2
import datetime
import pandas as pd
import time
import h5py
import csv
In [28]:
%matplotlib inline
df = pd.read_csv('/home/pramit/projects/data/whale_identification/train.csv')
print(df.head(4))
X = df.Image
y = df.Id
n_classes = len(np.unique(df.Id))
print("Number of unique classes: {}".format(n_classes))
In [37]:
# sanity check: look up one particular image name in the csv listing
for i in X:
    if '76b7c02d3' == str(i).split('.')[0]:
        print(i)
In [4]:
from random import randint
def browse_images(dir_path, index_number=None, define_range=10):
    # Read the csv file to extract info about the images
    list_of_images = pd.read_csv("{}.csv".format(dir_path)).Image
    n = len(list_of_images)
    if define_range > n:
        raise ValueError("out of range")

    def view_image(index):
        # all images are of shape (700, 1050, 3)
        im = Image.open('{}/{}'.format(dir_path, list_of_images[index]))
        plt.imshow(im, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.show()

    # slide up to index_number if given, otherwise up to define_range
    index_range = index_number if index_number else define_range
    interact(view_image, index=(0, index_range))
def browse_img_dir(dir_path, define_range=10):
    list_of_images = os.listdir(dir_path)
    n = len(list_of_images)
    if define_range > n:
        raise ValueError("out of range")

    def view_image(index):
        im = Image.open('{}{}'.format(dir_path, list_of_images[index]))
        plt.imshow(im, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.show()

    interact(view_image, index=(0, define_range))
In [5]:
browse_images('/home/pramit/projects/data/whale_identification/train', index_number=3, define_range=df.shape[0])
In [6]:
# image dimensions of the input images: 700 (height) x 1050 (width)
img_height, img_width = 700, 1050
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_height, img_width)
else:
    input_shape = (img_height, img_width, 3)
print(input_shape)
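The hard-coded 700 x 1050 assumption can be sanity-checked against a real file; a small sketch using the train directory from above (note PIL reports `size` as (width, height)):
In [ ]:
# verify the assumed dimensions against an actual training image
sample = Image.open('/home/pramit/projects/data/whale_identification/train/{}'.format(X.iloc[0]))
print(sample.size)  # expected (1050, 700) if the assumption holds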
In [42]:
105*7
Out[42]:
735
In [7]:
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
# a single sigmoid unit calls for binary_crossentropy;
# categorical_crossentropy would require Dense(n_classes) with a softmax
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
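Printing the architecture confirms the tensor shapes flowing through the stack; `summary()` is standard Keras:
In [ ]:
model.summary()  # layer-by-layer output shapes and parameter counts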
In [8]:
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.33, random_state=42)
# keep a small subset so that feature extraction stays tractable
X_train = X_train[0:1000]
y_train = y_train[0:1000]
X_validate = X_validate[0:500]
y_validate = y_validate[0:500]
print("Train shape: {}".format(X_train.shape[0]))
print("Validate shape: {}".format(X_validate.shape[0]))
In [14]:
# Create sub-directories and copy the respective files so that it's easier to compute classification
x_train_index = X_train.index
print("Number of training sample: {}".format(len(x_train_index)))
x_validate_index = X_validate.index
print("Number of training sample: {}".format(len(x_validate_index)))
x_train_image_names = list(X.iloc[x_train_index])
x_validate_image_names = list(X.iloc[x_validate_index])
import errno
directory_train = "/home/pramit/projects/data/whale_identification/train_set"
directory_validate = "/home/pramit/projects/data/whale_identification/validate_set"
def create_folder(directory_name):
    try:
        os.makedirs(directory_name)
    except OSError as e:
        # ignore "already exists"; re-raise anything else
        if e.errno != errno.EEXIST:
            raise
# For train set
create_folder(directory_train)
# For validate set
create_folder(directory_validate)
In [15]:
# Copy respective files to train and validate location
def copy_files(image_name_list, src="/home/pramit/projects/data/whale_identification/train/", dst=None):
    for image_name in image_name_list:
        copyfile(src + image_name, dst + image_name)
copy_files(x_train_image_names, dst="/home/pramit/projects/data/whale_identification/train_set/")
copy_files(x_validate_image_names, dst="/home/pramit/projects/data/whale_identification/validate_set/")
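A quick count of the copied files guards against silent path mistakes; a minimal sketch over the two destination folders defined above:
In [ ]:
# confirm that the copies landed where expected
print(len(os.listdir(directory_train)), "files in train_set")
print(len(os.listdir(directory_validate)), "files in validate_set")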
In [16]:
create_folder("/home/pramit/projects/data/whale_identification/train_set/preview")
batch_size = 16
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')
img = load_img('/home/pramit/projects/data/whale_identification/train_set/983f6c9db.jpg')  # this is a PIL image
x = img_to_array(img)  # a Numpy array with shape (height, width, 3)
x = x.reshape((1,) + x.shape)  # a Numpy array with shape (1, height, width, 3)
# the .flow() command below generates batches of randomly transformed images
# and saves the results to the `preview/` directory
i = 0
# the preview directory was already created above with create_folder
for batch in datagen.flow(x, batch_size=1,
                          save_to_dir='/home/pramit/projects/data/whale_identification/train_set/preview', save_format='jpeg'):
    i += 1
    if i > 20:
        break  # stop after 21 augmented images, otherwise the loop runs forever
In [17]:
browse_img_dir("/home/pramit/projects/data/whale_identification/train_set/preview/")
In [20]:
generator = datagen.flow_from_directory(
    '/home/pramit/projects/data/whale_identification/train_set',  # flow_from_directory looks for images inside sub-folders of this path
    target_size=(700, 1050),
    batch_size=batch_size,
    class_mode=None,  # this means our generator will only yield batches of data, no labels
    shuffle=False)    # keep a fixed order so the extracted features stay aligned with the file names
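The `steps` value used for feature extraction below follows from ceil(samples / batch_size), which is also what `len(generator)` reports in the next cell; a quick sketch using the iterator's `samples` attribute (standard on Keras directory iterators):
In [ ]:
import math
# batches needed to cover every image exactly once
steps = int(math.ceil(generator.samples / float(batch_size)))
print("{} images -> {} steps of {}".format(generator.samples, steps, batch_size))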
In [45]:
len(generator)
Out[45]:
In [ ]:
bottleneck_features_train = model.predict_generator(generator, steps=63, use_multiprocessing=True)  # 63 = ceil(1000 / 16)
In [44]:
bottleneck_features_train[0]
Out[44]:
In [23]:
np.save('/home/pramit/projects/data/whale_identification/bottleneck_features_train.npy', bottleneck_features_train)
In [24]:
generator_validate = datagen.flow_from_directory(
    '/home/pramit/projects/data/whale_identification/validate_set',
    target_size=(700, 1050),
    batch_size=batch_size,
    class_mode=None,
    shuffle=False)
In [ ]:
bottleneck_features_validate = model.predict_generator(generator_validate, steps=len(generator_validate), use_multiprocessing=True)  # ceil(500 / 16) = 32 batches, not 500
In [27]:
np.save('/home/pramit/projects/data/whale_identification/bottleneck_features_validate.npy',
        bottleneck_features_validate)
In [40]:
# Load the training data
train_data = np.load('/home/pramit/projects/data/whale_identification/bottleneck_features_train.npy')
print(train_data.shape)
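The usual next step in the bottleneck-features recipe is to train a small fully-connected classifier on the saved arrays. The sketch below is an outline, not a drop-in cell: it assumes a hypothetical `train_labels` array, one-hot encoded and aligned row-for-row with `train_data` (which the unshuffled generator order makes possible):
In [ ]:
# hypothetical top model over the saved bottleneck features;
# `train_labels` must be a one-hot array aligned with train_data rows
top_model = Sequential()
top_model.add(Flatten(input_shape=train_data.shape[1:]))
top_model.add(Dense(256, activation='relu'))
top_model.add(Dropout(0.5))
top_model.add(Dense(n_classes, activation='softmax'))
top_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# top_model.fit(train_data, train_labels, epochs=10, batch_size=batch_size)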
In [ ]: