In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import math
import multiprocessing as mp
import os
import keras
import keras.backend as K
from keras.applications.resnet50 import ResNet50
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializers import VarianceScaling
from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, Lambda, merge
from keras.models import Model, load_model
from keras.optimizers import SGD
# from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf
# After move to Keras 2.0 API, need to check if this can still be used.
from preprocessing.image_eval import ImageDataGenerator # multiprocessing ImageDataGenerator
plt.rcParams['figure.figsize'] = (10, 10)
In [ ]:
# NOTE: Need to update the following for each model
# 1. train & val data dirs
# 2. train & val data percentages
# 3. experiment directory
# 4. model file
# 5. preprocessing channel means
In [ ]:
#os.environ['CUDA_VISIBLE_DEVICES'] = ""
size = 224
channels = 3
classes = 3
p = 0.01 # 0.01
val_p = 0.01 #0.01
num_gpus = 4
batch_size = 32 * num_gpus # for 2 GPUs, 32/GPU has 1.2x systems speedup over 16/GPU
train_dir = "train_updated_norm_v3"
val_dir = "val_updated_norm_v3"
run = 13
# exp_dir = "experiments/keras/resnet50-1%-4-gpu-128-batch-size-updated-norm-v3-data-1%-val-sanity/4"
experiment_template = "resnet50-{p}%-{num_gpus}-gpu-{batch_size}-batch-size-{train_dir}-data-{val_p}%-val-sanity/{run}"
experiment = experiment_template.format(p=int(p*100), val_p=int(val_p*100), num_gpus=num_gpus,
                                        batch_size=batch_size, train_dir=train_dir, run=run)
model_file = "0.38936_acc_0.27847_loss_model.hdf5"
exp_dir = os.path.join("experiments", "keras", experiment)
# experiment_name = model_file.replace("/", "_")[:-5]
print(exp_dir)
In [ ]:
# os.makedirs(os.path.join("results", experiment_name), exist_ok=True)
In [ ]:
model = load_model(os.path.join(exp_dir, model_file))
In [ ]:
model.summary()
model.get_layer("resnet50").summary()
In [ ]:
# Visualize Model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model.get_layer("resnet50")).create(prog='dot', format='svg'))
In [ ]:
# Note: previous `model` is already compiled and ready to go.
# However, it may have been built for multi-GPU training, so it
# would still require multiple parallel inputs at eval time.
# Even worse, the device settings will not be retained, so all
# towers would be run on one device. To fix this, we can extract
# a single tower, rewrap in a multi-GPU block, and recompile.
# Extract single tower
resnet50 = model.get_layer("resnet50")
#model.save("resnet50-100%-4-gpu-128-batch-size-updated-norm-v3-data-1%-val-dropout_0_1.56-19_NO_GPU_TOWERS.hdf5")
# Multi-GPU exploitation via a linear combination of GPU loss functions.
ins = []
outs = []
for i in range(num_gpus):
    with tf.device("/gpu:{}".format(i)):
        x = Input(shape=(size, size, channels))  # split of batch
        out = resnet50(x)  # run split on shared model
        ins.append(x)
        outs.append(out)
model = Model(inputs=ins, outputs=outs) # multi-GPU, data-parallel model
# Compile model.
metrics = ['accuracy']
model.compile(optimizer="sgd", loss="categorical_crossentropy",
              loss_weights=[1/num_gpus]*num_gpus, metrics=metrics)
In [ ]:
# # Explore model
for x in model.inputs + model.outputs + model.metrics_tensors + model.targets:
    print(x.name, x.device)  # check that tensor devices exploit multi-GPU
# print(model.summary())
# print(resnet50.summary())
In [ ]:
train_save_dir = "images/{stage}/{p}".format(stage=train_dir, p=p)
val_save_dir = "images/{stage}/{p}".format(stage=val_dir, p=val_p)
print(train_save_dir, val_save_dir)
In [ ]:
def preprocess_input(x):
    """
    Preprocesses a tensor encoding a batch of images.
    Adapted from keras/applications/imagenet_utils.py

    # Arguments
        x: input Numpy tensor, 4D of shape (N, H, W, C).

    # Returns
        Preprocessed tensor.
    """
    # Zero-center by subtracting mean pixel value per channel
    # based on means from a 50%, evenly-distributed sample.
    # Means: updated-data norm v3, norm, no-norm original
    x[:, :, :, 0] -= 183.36777842  #189.54944625  #194.27633667
    x[:, :, :, 1] -= 138.81743141  #152.73427159  #145.3067627
    x[:, :, :, 2] -= 166.07406199  #176.89543273  #181.27861023
    x = x[:, :, :, ::-1]  # 'RGB'->'BGR'
    return x
# Multi-GPU exploitation
def split(x, num_splits):
    """Split a batch into `num_splits` equal-sized batches."""
    # Split tensors evenly, even if it means throwing away a few examples.
    samples = math.floor(len(x) / num_splits)
    x_splits = [arr[:samples] for arr in np.array_split(x, num_splits)]
    return x_splits
def gen_preprocessed_batch(batch_generator, num_gpus):
    """Yield preprocessed batches of x, y, and filename data, each split across GPUs."""
    # for xs, ys in batch_generator:
    #     yield split(preprocess_input(xs), num_gpus), split(ys, num_gpus)
    #     # yield split(xs, num_gpus), split(ys, num_gpus)  # for tf aug experiments
    for xs, ys, filenames in batch_generator:
        yield split(preprocess_input(xs), num_gpus), split(ys, num_gpus), split(filenames, num_gpus)
In [ ]:
# Create train & val image generators
try:
    # For interactive work, kill any existing pool.
    pool.terminate()
except:
    pass
pool = mp.Pool(processes=8)
train_datagen = ImageDataGenerator(pool=pool) #, horizontal_flip=True, vertical_flip=True,
# rotation_range=180, shear_range=0.1, fill_mode='reflect')
val_datagen = ImageDataGenerator(pool=pool)
#train_datagen = ImageDataGenerator()
#val_datagen = ImageDataGenerator()
train_generator_orig = train_datagen.flow_from_directory(train_save_dir, batch_size=batch_size, target_size=(size, size))
val_generator_orig = val_datagen.flow_from_directory(val_save_dir, batch_size=batch_size, target_size=(size, size))
In [ ]:
# Create train & val preprocessed generators
train_generator = gen_preprocessed_batch(train_generator_orig, num_gpus)
val_generator = gen_preprocessed_batch(val_generator_orig, num_gpus)
In [ ]:
# Number of examples.
tc = train_generator_orig.nb_sample
vc = val_generator_orig.nb_sample
#tc = train_generator_orig.samples
#vc = val_generator_orig.samples
# Number of batches for multi-GPU exploitation.
# Note: Multi-GPU data parallelism splits each mini-batch into a set of
# micro-batches run in parallel, one per GPU, but Keras views that set of
# micro-batches as a single batch with multiple input sources (i.e., the
# examples run in parallel across GPUs look to Keras like one batch drawn
# from multiple inputs; see the sanity-check cell below).
train_batches = int(math.ceil(tc/batch_size))
val_batches = int(math.ceil(vc/batch_size))
# Class counts (just for information)
train_class_counts = np.bincount(train_generator_orig.classes)
val_class_counts = np.bincount(val_generator_orig.classes)
print(tc, vc)
print(train_batches, val_batches)
print(train_class_counts / np.sum(train_class_counts), val_class_counts / np.sum(val_class_counts))
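In [ ]:
# Illustrative sanity check (not part of the original run): `split` should turn
# one mini-batch into `num_gpus` equal micro-batches, which the multi-input
# model above then consumes as a single Keras "batch" with multiple input
# sources. `dummy_batch` is a made-up stand-in for a batch of examples.
dummy_batch = np.arange(batch_size)
micro_batches = split(dummy_batch, num_gpus)
print(len(micro_batches), [len(mb) for mb in micro_batches])  # expect num_gpus splits of batch_size // num_gpus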
In [ ]:
class_counts = np.bincount(train_generator_orig.classes)
class_weights = dict(zip(range(classes), min(class_counts) / class_counts))
print(class_counts)
print(class_weights)
In [ ]:
def show_random_image(save_dir):
    c = np.random.randint(1, 4)
    class_dir = os.path.join(save_dir, str(c))
    files = os.listdir(class_dir)
    i = np.random.randint(0, len(files))
    fname = os.path.join(class_dir, files[i])
    print(fname)
    img = Image.open(fname)
    plt.imshow(img)
# show_random_image(train_save_dir)
In [ ]:
def plot(gen):
    r, c = 6, 6
    fig, ax = plt.subplots(r, c)
    plt.setp(ax, xticks=[], yticks=[])
    plt.tight_layout()
    x, y, fname = next(gen)
    batch_size = x.shape[0]
    for i in range(r):
        for j in range(c):
            if i*c + j < batch_size:
                im = x[i*c + j].astype(np.uint8)
                if K.image_data_format() == 'channels_first':
                    im = im.transpose(1, 2, 0)  # (C,H,W) -> (H,W,C)
                ax[i][j].imshow(im)
                ax[i][j].set_xlabel(y[i*c + j])
plot(train_generator_orig)
plot(val_generator_orig)
In [ ]:
# NOTE: We could call the `model.evaluate*` methods,
# but that would not allow us to create contingency
# matrices. Instead, we repeatedly loop over batches
# of data, collecting both the true labels and
# predictions. Then, we can compute any metrics
# desired, including 3x3 contingency matrices.
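In [ ]:
# Minimal sketch of the metric computation described above, using made-up toy
# class vectors rather than model output: given aligned actual and predicted
# classes, `pd.crosstab` yields the contingency matrix and the mean of
# elementwise equality gives accuracy. The cells below collect the real
# vectors from the generator batches.
toy_actual = pd.Series([1, 2, 3, 2, 1, 3], name="actual")
toy_predicted = pd.Series([1, 2, 2, 2, 1, 3], name="predicted")
print(pd.crosstab(toy_actual, toy_predicted))
print("Accuracy: {}".format(np.mean(np.equal(toy_actual, toy_predicted))))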
In [ ]:
# def extract_metrics(model, raw_metrics):
#     labeled_metrics = list(zip(model.metrics_names, raw_metrics))
#     losses = [v for k, v in labeled_metrics if k == "loss"]
#     accuracies = [v for k, v in labeled_metrics if k.endswith("acc")]
#     loss = sum(losses) / num_gpus
#     acc = sum(accuracies) / num_gpus
#     metrics = {"loss": loss, "acc": acc}
#     return labeled_metrics, metrics
# raw_metrics = model.evaluate_generator(val_generator, val_samples=32,
#                                        max_q_size=8, nb_worker=1, pickle_safe=False)
# labeled_metrics, metrics = extract_metrics(model, raw_metrics)
# print(labeled_metrics)
# print(metrics)
In [ ]:
# Get predictions
for dataset in [("train", p, tc, val_generator)]: #, ("val", val_p, vc, val_generator)]:
name, perc, count, gen = dataset
ys = []
preds = []
fnames = []
batches = math.floor(count / batch_size)
for i in range(batches):
# Get batch.
# x, y = next(gen)
x, y, fname = next(gen)
# Get predictions
pred = model.predict(x)
# Store y and predictions
ys.extend(y) # y is always a list of parallel batches, even if only 1 batch
if isinstance(pred, list):
preds.extend(pred)
else:
preds.append(pred)
fnames.extend(fname)
In [ ]:
# Create DataFrames
y = np.concatenate(ys)
pred = np.concatenate(preds)
fname = np.concatenate(fnames)
y_df = pd.DataFrame(y, columns=[1,2,3])
pred_df = pd.DataFrame(pred, columns=[1,2,3])
fname_df = pd.DataFrame(np.atleast_2d(fname).T, columns=["filenames"])
# Create class, prediction, slide_num DataFrames
y_class_df = y_df.idxmax(axis=1)
pred_class_df = pred_df.idxmax(axis=1)
y_class_df.name = "actual"
pred_class_df.name = "predicted"
slide_info_df = fname_df.filenames.str.extract(r'(?P<class>\d)/\d+_(?P<slide_num>\d+)_\d+\.jpeg', expand=True)
slide_info_df["class"] = slide_info_df["class"].astype(int)
slide_info_df["slide_num"] = slide_info_df["slide_num"].astype(int)
df = pd.concat([fname_df, slide_info_df, y_class_df, pred_class_df], axis=1)
# sanity check
assert np.allclose(df["class"], df.actual)
# Create Contingency matrix
contingency_mat = pd.crosstab(df.actual, df.predicted)
# # Save DataFrames
# y_df.to_csv(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-y_df.csv".format(model_ck=model_file[:-5], perc=100*perc, data=name)), header=True)
# pred_df.to_csv(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-pred_df.csv".format(model_ck=model_file[:-5], perc=100*perc, data=name)), header=True)
# df.to_csv(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-df.csv".format(model_ck=model_file[:-5], perc=100*perc, data=name)), header=True)
# # Save results
# with open(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-results.txt".format(model_ck=model_file[:-5], perc=100*perc, data=name)), 'w') as f:
#     print("Dataset: {}".format(name), file=f)
#     print("Number of samples: {}".format(len(y_df)), file=f)
#     print(contingency_mat, file=f)
#     print("Accuracy: {}".format(np.mean(np.equal(y_class_df, pred_class_df))), file=f)
print("Number of samples: {}".format(len(y_df)))
print(contingency_mat)
print("Accuracy: {}".format(np.mean(np.equal(y_class_df, pred_class_df))))
In [ ]:
len(y_df), len(pred_df), len(fname_df), len(df)
In [ ]:
df
In [ ]:
df2 = df.loc[:, ["slide_num", "actual", "predicted"]]
df2
In [ ]:
df3 = df2.groupby("slide_num").mean()
df3["predicted_round"] = df3.predicted.map(round)
df3
In [ ]:
sum(df3.actual == df3.predicted_round) / len(df3)
In [ ]:
pd.crosstab(df3.actual, df3.predicted_round)
In [ ]:
gb = df2.groupby(["slide_num"]) #, "predicted"])
gb.describe()
In [ ]:
# # Read DataFrames
# y_df = pd.read_csv(os.path.join(exp_dir, "{}-y_df.csv".format(model_file[:-5])), index_col=0)
# pred_df = pd.read_csv(os.path.join(exp_dir, "{}-pred_df.csv".format(model_file[:-5])), index_col=0)
In [ ]:
# # Create Contingency matrix
# y_class = y_df.idxmax(axis=1)
# pred_class = pred_df.idxmax(axis=1)
# y_class.name = "Actual"
# pred_class.name = "Predicted"
# contingency_mat = pd.crosstab(y_class, pred_class)
# print("Number of samples: {}".format(len(y_df)))
# print(contingency_mat)
# print("Accuracy: {}".format(np.mean(np.equal(y_class, pred_class))))
In [ ]:
# # --- Alternate approach with NumPy arrays only
# y_c = np.argmax(y, axis=1) + 1
# pred_c = np.argmax(pred, axis=1) + 1
# y_actu = pd.Series(y_c, name="Actual")
# y_pred = pd.Series(pred_c, name="Predicted")
# contingency_mat = pd.crosstab(y_actu, y_pred)
# print("Number of samples: {}".format(len(y_c)))
# print(contingency_mat)
# print("Accuracy: {}".format(np.mean(np.equal(y_c, pred_c))))
In [ ]:
# path_template = os.path.join("visualize", "{dataset}", "Pred_{pred}-Actual_{actual}")
# for dataset in ["train", "val"]:
# for i in range(3):
# for j in range(3):
# os.makedirs(path_template.format(dataset=dataset, pred=i+1, actual=j+1), exist_ok=True)
In [ ]:
# filename_template = os.path.join(path_template, "{hash}.jpeg")
# batches = 8
# for dataset in [("train", train_generator_orig), ("val", val_generator_orig)]:
#     name, gen = dataset
#     print(name)
#     for i in range(batches):
#         # Get batch.
#         x_orig, y_orig = next(gen)
#         x = preprocess_input(np.copy(x_orig))
#         y = y_orig
#         # Get predictions.
#         raw_preds = model.predict(x)
#         raw_metrics = model.evaluate(x, y)
#         labeled_metrics, metrics = extract_metrics(model, raw_metrics)
#         # Create contingency matrix.
#         y = np.argmax(y, axis=1) + 1
#         preds = np.argmax(raw_preds, axis=1) + 1
#         y_actu = pd.Series(y, name="Actual")
#         y_pred = pd.Series(preds, name="Predicted")
#         contingency_mat = pd.crosstab(y_actu, y_pred)
#         # # Output images in directories based on misclassification.
#         # def plot(x, y):
#         #     r, c = 6, 6
#         #     fig, ax = plt.subplots(r, c)
#         #     plt.setp(ax, xticks=[], yticks=[])
#         #     plt.tight_layout()
#         #     batch_size = x.shape[0]
#         #     for i in range(r):
#         #         for j in range(c):
#         #             if i*c + j < batch_size:
#         #                 ax[i][j].imshow(x[i*c + j].astype(np.uint8))
#         #                 ax[i][j].set_xlabel("{preds}-{y}".format(y=y[i*c + j], preds=preds[i*c + j]))
#         # plot(x_orig, y)
#         # plt.show()
#         for n in range(x_orig.shape[0]):
#             img = Image.fromarray(x_orig[n].astype(np.uint8), 'RGB')
#             filename = filename_template.format(dataset=name, pred=preds[n], actual=y[n], hash=np.random.randint(1e6))
#             img.save(filename)
#         print(contingency_mat)
#         print(np.mean(y == preds))
#         print(labeled_metrics)
#         print(metrics)
In [ ]:
x, label, _ = next(train_generator_orig)
Image.fromarray((x[0]).astype(np.uint8))
In [ ]:
# Copy the example so preprocess_input's in-place mean subtraction doesn't modify x.
preds = resnet50.predict(preprocess_input(np.copy(x[0].reshape(1, 224, 224, 3))))
In [ ]:
print("Actual: {}".format(label[0]))
print("Pred: {}".format(preds[0]))
In [ ]:
# Stop processes cleanly. Otherwise, zombie processes will
# persist and hold onto GPU memory.
try:
    pool.terminate()
except:
    pass
for proc in mp.active_children():
    proc.terminate()
mp.active_children()