In this session we explored the Human Protein Atlas Image Classification competition, building on the Kaggle kernel Pretrained InceptionResNetV2 base classifier.
In this competition, Kagglers will develop models capable of classifying mixed patterns of proteins in microscope images. The Human Protein Atlas will use these models to build a tool integrated with their smart-microscopy system to identify a protein's location(s) from a high-throughput image.
When running this notebook on Colab, use a GPU instance to ensure ample disk space when unzipping the dataset. Go to Runtime --> Change runtime type --> GPU.
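Before installing anything, it is worth confirming that a GPU is actually attached; a minimal check using TensorFlow (which Colab preinstalls):
In [0]:
import tensorflow as tf
# Prints a device string such as '/device:GPU:0' on a GPU runtime,
# and an empty string on a CPU-only runtime.
print(tf.test.gpu_device_name())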
In [0]:
!pip3 install kaggle
!pip3 install google
In [0]:
from google.colab import files
upload = files.upload()
In [0]:
!mkdir -p ~/.kaggle
In [0]:
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
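As a quick sanity check, the token can be verified before the large download; listing competitions should print a table rather than an authentication error:
In [0]:
# Any competition listing (rather than a 401 error) means the token works.
!kaggle competitions list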
In [0]:
!kaggle competitions download -c human-protein-atlas-image-classification
In [0]:
!mkdir -p ./human_protein_atlas/
!mkdir -p ./human_protein_atlas/train
!mkdir -p ./human_protein_atlas/test
In [0]:
!mv train.csv ./human_protein_atlas/train.csv
In [0]:
!unzip -q ./train.zip -d ./human_protein_atlas/train
In [0]:
!unzip -q ./test.zip -d ./human_protein_atlas/test
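A quick count of the extracted files helps catch a truncated download; every Id contributes four PNGs (_red, _green, _blue, _yellow), so the train file count should be four times the number of rows in train.csv:
In [0]:
!ls ./human_protein_atlas/train | wc -l
!ls ./human_protein_atlas/test | wc -l
!wc -l ./human_protein_atlas/train.csv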
This notebook uses an InceptionResNetV2 model from the kernel Pretrained InceptionResNetV2 base classifier. We start from a pre-designed architecture that has already been trained and performs well on general image classification, then add a few layers customized to our specific task.
In [0]:
import os, sys, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from imgaug import augmenters as iaa
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
In [0]:
INPUT_SHAPE = (299,299,3)
BATCH_SIZE = 10
In [0]:
path_to_train = './human_protein_atlas/train/'
data = pd.read_csv('human_protein_atlas/train.csv')

train_dataset_info = []
for name, labels in zip(data['Id'], data['Target'].str.split(' ')):
    train_dataset_info.append({
        'path': os.path.join(path_to_train, name),
        'labels': np.array([int(label) for label in labels])})
train_dataset_info = np.array(train_dataset_info)
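Inspecting the first record confirms the structure: a path prefix (the channel suffix is appended at load time) plus the multi-hot target as an array of class indices:
In [0]:
print(train_dataset_info[0])
print('total samples:', len(train_dataset_info))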
In [0]:
from sklearn.model_selection import train_test_split
train_ids, test_ids, train_targets, test_target = train_test_split(
    data['Id'], data['Target'], test_size=0.2, random_state=42)
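The 28 classes in this dataset are far from balanced, which is worth keeping in mind when reading the F1 scores later; a minimal frequency count with collections.Counter:
In [0]:
from collections import Counter

# How often each of the 28 classes appears across all training targets.
label_counts = Counter(
    int(label)
    for target in data['Target']
    for label in target.split(' '))
print(sorted(label_counts.items(), key=lambda kv: -kv[1]))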
In [0]:
class data_generator:

    def create_train(dataset_info, batch_size, shape, augment=True):
        assert shape[2] == 3
        while True:
            random_indexes = np.random.choice(len(dataset_info), batch_size)
            batch_images = np.empty((batch_size, shape[0], shape[1], shape[2]))
            batch_labels = np.zeros((batch_size, 28))
            for i, idx in enumerate(random_indexes):
                image = data_generator.load_image(
                    dataset_info[idx]['path'], shape)
                if augment:
                    image = data_generator.augment(image)
                batch_images[i] = image
                batch_labels[i][dataset_info[idx]['labels']] = 1
            yield batch_images, batch_labels

    def load_image(path, shape):
        # Each sample is stored as four greyscale PNGs, one per filter.
        R = np.array(Image.open(path + '_red.png'))
        G = np.array(Image.open(path + '_green.png'))
        B = np.array(Image.open(path + '_blue.png'))
        Y = np.array(Image.open(path + '_yellow.png'))
        # Fold the yellow channel into red and green to get a 3-channel image.
        image = np.stack((
            R/2 + Y/2,
            G/2 + Y/2,
            B), -1)
        image = cv2.resize(image, (shape[0], shape[1]))
        image = np.divide(image, 255)
        return image

    def augment(image):
        augment_img = iaa.Sequential([
            iaa.OneOf([
                iaa.Affine(rotate=0),
                iaa.Affine(rotate=90),
                iaa.Affine(rotate=180),
                iaa.Affine(rotate=270),
                iaa.Fliplr(0.5),
                iaa.Flipud(0.5),
            ])], random_order=True)
        image_aug = augment_img.augment_image(image)
        return image_aug
In [0]:
# create train datagen
input_shape = (299, 299, 3)
train_datagen = data_generator.create_train(
    train_dataset_info, 5, input_shape, augment=True)
In [0]:
images, labels = next(train_datagen)
fig, ax = plt.subplots(1,5,figsize=(25,5))
for i in range(5):
    ax[i].imshow(images[i])
print('min: {0}, max: {1}'.format(images.min(), images.max()))
In [0]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, load_model
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Input
from keras.layers import BatchNormalization
from keras.layers import Conv2D
from keras.models import Model
from keras.applications import InceptionResNetV2
from keras.callbacks import ModelCheckpoint
from keras.callbacks import LambdaCallback
from keras.callbacks import Callback
from keras import metrics
from keras.optimizers import Adam
from keras import backend as K
import tensorflow as tf
import keras
def create_model(input_shape, n_out):
    pretrain_model = InceptionResNetV2(
        include_top=False,
        weights='imagenet',
        input_shape=input_shape)

    input_tensor = Input(shape=input_shape)
    bn = BatchNormalization()(input_tensor)
    x = pretrain_model(bn)
    x = Conv2D(128, kernel_size=(1, 1), activation='relu')(x)
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(n_out, activation='sigmoid')(x)
    model = Model(input_tensor, output)
    return model
In [0]:
def f1(y_true, y_pred):
    # Batch-wise macro F1 over the 28 classes, computed on the raw
    # sigmoid outputs (a "soft" F1, since y_pred is not thresholded).
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)
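As a sanity check on the metric's semantics, the same formula in NumPy reproduces scikit-learn's macro F1 once predictions are binary (a toy example with made-up labels; during training the Keras version runs on raw sigmoid scores instead):
In [0]:
import numpy as np
from sklearn.metrics import f1_score

# Two samples, three classes, already-thresholded predictions.
y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 1]])

tp = (y_true * y_pred).sum(axis=0)
fp = ((1 - y_true) * y_pred).sum(axis=0)
fn = (y_true * (1 - y_pred)).sum(axis=0)
p = tp / (tp + fp + 1e-7)
r = tp / (tp + fn + 1e-7)
print(np.mean(2 * p * r / (p + r + 1e-7)))        # ~0.667
print(f1_score(y_true, y_pred, average='macro'))  # 0.667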
In [0]:
def show_history(history):
    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    ax[0].set_title('loss')
    ax[0].plot(history.epoch, history.history["loss"], label="Train loss")
    ax[0].plot(history.epoch, history.history["val_loss"], label="Validation loss")
    ax[1].set_title('f1')
    ax[1].plot(history.epoch, history.history["f1"], label="Train f1")
    ax[1].plot(history.epoch, history.history["val_f1"], label="Validation f1")
    ax[2].set_title('acc')
    ax[2].plot(history.epoch, history.history["acc"], label="Train acc")
    ax[2].plot(history.epoch, history.history["val_acc"], label="Validation acc")
    ax[0].legend()
    ax[1].legend()
    ax[2].legend()
In [0]:
keras.backend.clear_session()
model = create_model(
    input_shape=(299, 299, 3),
    n_out=28)
model.summary()
In [0]:
checkpointer = ModelCheckpoint(
    './InceptionResNetV2.model', monitor='val_f1', mode='max',  # F1 is maximized
    verbose=2, save_best_only=True)

# no data augmentation training
train_generator = data_generator.create_train(
    train_dataset_info[train_ids.index], BATCH_SIZE, INPUT_SHAPE, augment=False)
validation_generator = data_generator.create_train(
    train_dataset_info[test_ids.index], 256, INPUT_SHAPE, augment=False)

# Freeze the pretrained InceptionResNetV2 submodel; only the new head trains.
model.layers[2].trainable = False

model.compile(
    loss='binary_crossentropy',  # multi-label targets with sigmoid outputs
    optimizer=Adam(1e-3),
    metrics=['acc', f1])

history = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    validation_data=next(validation_generator),
    epochs=1,
    verbose=1,
    callbacks=[checkpointer])
In [0]:
# Register the custom f1 metric so Keras can deserialize checkpoints that reference it
from keras.utils.generic_utils import get_custom_objects
get_custom_objects().update({'f1': f1})
In [0]:
# Load the best checkpointed model back into Keras
from keras.models import load_model
checkpointer_savepath = './InceptionResNetV2.model'
model = load_model(checkpointer_savepath)
In [0]:
show_history(history)
In [0]:
!pip install scikit-optimize
In [0]:
from skopt.space import Real
from skopt.utils import use_named_args
from skopt import gp_minimize
def create_model_and_compile(input_shape, n_out, lr):
    """
    Args:
        input_shape: shape of the input images, e.g. (299, 299, 3)
        n_out: number of output classes
        lr: learning rate for the Adam optimizer
    """
    pretrain_model = InceptionResNetV2(
        include_top=False,
        weights='imagenet',
        input_shape=input_shape)

    input_tensor = Input(shape=input_shape)
    bn = BatchNormalization()(input_tensor)
    x = pretrain_model(bn)
    x = Conv2D(128, kernel_size=(1, 1), activation='relu')(x)
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(n_out, activation='sigmoid')(x)
    model = Model(input_tensor, output)

    model.layers[2].trainable = False
    model.compile(
        loss='binary_crossentropy',  # multi-label targets with sigmoid outputs
        optimizer=Adam(lr=lr),
        metrics=['acc', f1])
    return model
In [0]:
path_best_model = '/content/'
best_f1 = history.history['val_f1'][-1]
dimensions = [Real(name='learning_rate', low=1e-6, high=1e-3, prior='log-uniform'),]
# Integer(name='num_nodes', low=10, high=256),
# Categorical(name='activation', categories=['relu', 'sigmoid'])]
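The log-uniform prior spreads samples evenly across orders of magnitude rather than linearly between the bounds; drawing a few values from the dimension illustrates this (rvs is skopt's built-in sampler):
In [0]:
# Samples land roughly evenly across 1e-6 .. 1e-3 in log space.
print(dimensions[0].rvs(n_samples=5, random_state=42))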
In [0]:
@use_named_args(dimensions=dimensions)
def fitness(learning_rate):
    model = create_model_and_compile(input_shape=(299, 299, 3), n_out=28, lr=learning_rate)
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=100,
        validation_data=next(validation_generator),
        epochs=1,
        verbose=1,
        callbacks=[checkpointer])
    val_f1 = history.history['val_f1'][-1]

    global best_f1
    if val_f1 > best_f1:
        best_f1 = val_f1

    del model
    K.clear_session()

    # gp_minimize minimizes its objective, so return the negated F1 score.
    return -val_f1
In [0]:
history.history
In [0]:
# run the hyperparameter search: gp_minimize repeatedly calls the fitness function
from skopt import gp_minimize

search_result = gp_minimize(func=fitness,
                            dimensions=dimensions,
                            acq_func='EI',
                            n_calls=10)
                            # x0=default_parameters)
In [0]:
search_result.x
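The result object also records the full search history; since fitness returns the negated F1, negate the stored values to recover validation F1 (x, fun, x_iters, and func_vals are skopt's standard OptimizeResult fields):
In [0]:
print('best val_f1:', -search_result.fun)  # undo the negation in fitness
# Every evaluated learning rate alongside its validation F1:
for lr, score in zip(search_result.x_iters, search_result.func_vals):
    print(lr, -score)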
In [0]:
# with data augmentation
train_generator = data_generator.create_train(
    train_dataset_info[train_ids.index], BATCH_SIZE, INPUT_SHAPE, augment=True)
validation_generator = data_generator.create_train(
    train_dataset_info[test_ids.index], 256, INPUT_SHAPE, augment=False)

# Unfreeze the pretrained backbone and fine-tune end to end at a lower rate.
model.layers[2].trainable = True
model.compile(
    loss='binary_crossentropy',  # multi-label targets with sigmoid outputs
    optimizer=Adam(1e-4),
    metrics=['acc', f1])

history = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    validation_data=next(validation_generator),
    epochs=180,
    verbose=1,
    callbacks=[checkpointer])
In [0]:
show_history(history)
In [0]:
model = load_model(
    './InceptionResNetV2.model',
    custom_objects={'f1': f1})
In [0]:
submit = pd.read_csv('./sample_submission.csv')
In [0]:
%%time
predicted = []
for name in tqdm(submit['Id']):
    path = os.path.join('./human_protein_atlas/test/', name)
    image = data_generator.load_image(path, INPUT_SHAPE)
    score_predict = model.predict(image[np.newaxis])[0]
    # Predict every class whose score clears a fixed 0.2 threshold.
    label_predict = np.arange(28)[score_predict >= 0.2]
    str_predict_label = ' '.join(str(l) for l in label_predict)
    predicted.append(str_predict_label)
In [0]:
submit['Predicted'] = predicted
submit.to_csv('submission.csv', index=False)
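The finished submission.csv can be submitted directly from the notebook with the Kaggle CLI (the message after -m is just an example):
In [0]:
!kaggle competitions submit -c human-protein-atlas-image-classification -f submission.csv -m "InceptionResNetV2 baseline"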