In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import math
import multiprocessing as mp
import os
import keras
import keras.backend as K
from keras.applications.resnet50 import ResNet50
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializers import VarianceScaling
from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, Lambda, merge
from keras.models import Model, load_model
from keras.optimizers import SGD
# from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf
# After move to Keras 2.0 API, need to check if this can still be used.
from preprocessing.image_eval import ImageDataGenerator # multiprocessing ImageDataGenerator
plt.rcParams['figure.figsize'] = (10, 10)
In [ ]:
# NOTE: Need to update the following for each model
# 1. train & val data dirs
# 2. train & val data percentages
# 3. experiment directory
# 4. model file
# 5. preprocessing channel means
In [ ]:
#os.environ['CUDA_VISIBLE_DEVICES'] = ""
size = 224
channels = 3
classes = 3
p = 0.01 # 0.01
val_p = 0.01 #0.01
num_gpus = 4
batch_size = 32 * num_gpus # for 2 GPUs, 32/GPU has 1.2x systems speedup over 16/GPU
train_dir = "train_updated_norm_v3"
val_dir = "val_updated_norm_v3"
run = 13
# exp_dir = "experiments/keras/resnet50-1%-4-gpu-128-batch-size-updated-norm-v3-data-1%-val-sanity/4"
experiment_template = "resnet50-{p}%-{num_gpus}-gpu-{batch_size}-batch-size-{train_dir}-data-{val_p}%-val-sanity/{run}"
experiment = experiment_template.format(p=int(p*100), val_p=int(val_p*100), num_gpus=num_gpus,
                                        batch_size=batch_size, train_dir=train_dir, run=run)
model_file = "0.38936_acc_0.27847_loss_model.hdf5"
exp_dir = os.path.join("experiments", "keras", experiment)
# experiment_name = model_file.replace("/", "_")[:-5]
print(exp_dir)
In [ ]:
# os.makedirs(os.path.join("results", experiment_name), exist_ok=True)
In [ ]:
model = load_model(os.path.join(exp_dir, model_file))
In [ ]:
model.summary()
model.get_layer("resnet50").summary()
In [ ]:
# Visualize Model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model.get_layer("resnet50")).create(prog='dot', format='svg'))
In [ ]:
# Note: previous `model` is already compiled and ready to go.
# However, it may have been built for multi-GPU training, so it
# would still require multiple parallel inputs at eval time.
# Even worse, the device settings will not be retained, so all
# towers would be run on one device. To fix this, we can extract
# a single tower, rewrap in a multi-GPU block, and recompile.
# Extract single tower
resnet50 = model.get_layer("resnet50")
#model.save("resnet50-100%-4-gpu-128-batch-size-updated-norm-v3-data-1%-val-dropout_0_1.56-19_NO_GPU_TOWERS.hdf5")
# Multi-GPU exploitation via a linear combination of GPU loss functions.
ins = []
outs = []
for i in range(num_gpus):
    with tf.device("/gpu:{}".format(i)):
        x = Input(shape=(size, size, channels))  # split of batch
        out = resnet50(x)  # run split on shared model
        ins.append(x)
        outs.append(out)
model = Model(inputs=ins, outputs=outs) # multi-GPU, data-parallel model
# Compile model.
metrics = ['accuracy']
model.compile(optimizer="sgd", loss="categorical_crossentropy",
              loss_weights=[1/num_gpus]*num_gpus, metrics=metrics)
In [ ]:
# # Explore model
for x in model.inputs + model.outputs + model.metrics_tensors + model.targets:
    print(x.name, x.device)  # check that tensor devices exploit multi-GPU
# print(model.summary())
# print(resnet50.summary())
In [ ]:
train_save_dir = "images/{stage}/{p}".format(stage=train_dir, p=p)
val_save_dir = "images/{stage}/{p}".format(stage=val_dir, p=val_p)
print(train_save_dir, val_save_dir)
In [ ]:
def preprocess_input(x):
    """
    Preprocesses a tensor encoding a batch of images.
    Adapted from keras/applications/imagenet_utils.py

    # Arguments
        x: input Numpy tensor, 4D of shape (N, H, W, C).

    # Returns
        Preprocessed tensor.
    """
    # Zero-center by subtracting mean pixel value per channel
    # based on means from a 50%, evenly-distributed sample.
    # Means: updated-data norm v3, norm, no-norm original
    x[:, :, :, 0] -= 183.36777842  #189.54944625  #194.27633667
    x[:, :, :, 1] -= 138.81743141  #152.73427159  #145.3067627
    x[:, :, :, 2] -= 166.07406199  #176.89543273  #181.27861023
    x = x[:, :, :, ::-1]  # 'RGB'->'BGR'
    return x
# Multi-GPU exploitation
def split(x, num_splits):
    """Split a batch into `num_splits` equal-sized batches."""
    # Split tensors evenly, even if it means throwing away a few examples.
    samples = math.floor(len(x) / num_splits)
    x_splits = [arr[:samples] for arr in np.array_split(x, num_splits)]
    return x_splits
def gen_preprocessed_batch(batch_generator, num_gpus):
    """Yield preprocessed batches of x, y, and filename data, each split across GPUs."""
    # for xs, ys in batch_generator:
    #     yield split(preprocess_input(xs), num_gpus), split(ys, num_gpus)
    #     # yield split(xs, num_gpus), split(ys, num_gpus)  # for tf aug experiments
    for xs, ys, filenames in batch_generator:
        yield split(preprocess_input(xs), num_gpus), split(ys, num_gpus), split(filenames, num_gpus)
In [ ]:
# Create train & val image generators
try:
    # For interactive work, kill any existing pool.
    pool.terminate()
except:
    pass
pool = mp.Pool(processes=8)
train_datagen = ImageDataGenerator(pool=pool) #, horizontal_flip=True, vertical_flip=True,
# rotation_range=180, shear_range=0.1, fill_mode='reflect')
val_datagen = ImageDataGenerator(pool=pool)
#train_datagen = ImageDataGenerator()
#val_datagen = ImageDataGenerator()
train_generator_orig = train_datagen.flow_from_directory(train_save_dir, batch_size=batch_size, target_size=(size, size))
val_generator_orig = val_datagen.flow_from_directory(val_save_dir, batch_size=batch_size, target_size=(size, size))
In [ ]:
# Create train & val preprocessed generators
train_generator = gen_preprocessed_batch(train_generator_orig, num_gpus)
val_generator = gen_preprocessed_batch(val_generator_orig, num_gpus)
In [ ]:
# Number of examples.
tc = train_generator_orig.nb_sample
vc = val_generator_orig.nb_sample
#tc = train_generator_orig.samples
#vc = val_generator_orig.samples
# Number of batches for multi-GPU exploitation.
# Note: Multi-GPU data parallelism splits each mini-batch into a set of
# micro-batches run in parallel, one per GPU, but Keras views that set of
# micro-batches as a single batch with multiple input sources (i.e., the
# examples run in parallel across GPUs look to Keras like one batch drawn
# from multiple inputs; see the sanity-check cell below).
train_batches = int(math.ceil(tc/batch_size))
val_batches = int(math.ceil(vc/batch_size))
# Class counts (just for information)
train_class_counts = np.bincount(train_generator_orig.classes)
val_class_counts = np.bincount(val_generator_orig.classes)
print(tc, vc)
print(train_batches, val_batches)
print(train_class_counts / np.sum(train_class_counts), val_class_counts / np.sum(val_class_counts))
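In [ ]:
# Illustrative sanity check (not part of the original run): `split` should turn
# one mini-batch into `num_gpus` equal micro-batches, which the multi-input
# model above then consumes as a single Keras "batch" with multiple input
# sources. `dummy_batch` is a made-up stand-in for a batch of examples.
dummy_batch = np.arange(batch_size)
micro_batches = split(dummy_batch, num_gpus)
print(len(micro_batches), [len(mb) for mb in micro_batches])  # expect num_gpus splits of batch_size // num_gpus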
In [ ]:
class_counts = np.bincount(train_generator_orig.classes)
class_weights = dict(zip(range(classes), min(class_counts) / class_counts))
print(class_counts)
print(class_weights)
In [ ]:
def show_random_image(save_dir):
    c = np.random.randint(1, 4)
    class_dir = os.path.join(save_dir, str(c))
    files = os.listdir(class_dir)
    i = np.random.randint(0, len(files))
    fname = os.path.join(class_dir, files[i])
    print(fname)
    img = Image.open(fname)
    plt.imshow(img)
# show_random_image(train_save_dir)
In [ ]:
def plot(gen):
    r, c = 6, 6
    fig, ax = plt.subplots(r, c)
    plt.setp(ax, xticks=[], yticks=[])
    plt.tight_layout()
    x, y, fname = next(gen)
    batch_size = x.shape[0]
    for i in range(r):
        for j in range(c):
            if i*c + j < batch_size:
                im = x[i*c + j].astype(np.uint8)
                if K.image_data_format() == 'channels_first':
                    im = im.transpose(1, 2, 0)  # (C,H,W) -> (H,W,C)
                ax[i][j].imshow(im)
                ax[i][j].set_xlabel(y[i*c + j])
plot(train_generator_orig)
plot(val_generator_orig)
In [ ]:
# NOTE: We could call the `model.evaluate*` methods,
# but that would not allow us to create contingency
# matrices. Instead, we repeatedly loop over batches
# of data, collecting both the true labels and
# predictions. Then, we can compute any metrics
# desired, including 3x3 contingency matrices.
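In [ ]:
# Minimal sketch of the metric computation described above, using made-up toy
# class vectors rather than model output: given aligned actual and predicted
# classes, `pd.crosstab` yields the contingency matrix and the mean of
# elementwise equality gives accuracy. The cells below collect the real
# vectors from the generator batches.
toy_actual = pd.Series([1, 2, 3, 2, 1, 3], name="actual")
toy_predicted = pd.Series([1, 2, 2, 2, 1, 3], name="predicted")
print(pd.crosstab(toy_actual, toy_predicted))
print("Accuracy: {}".format(np.mean(np.equal(toy_actual, toy_predicted))))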
In [ ]:
# def extract_metrics(model, raw_metrics):
#     labeled_metrics = list(zip(model.metrics_names, raw_metrics))
#     losses = [v for k, v in labeled_metrics if k == "loss"]
#     accuracies = [v for k, v in labeled_metrics if k.endswith("acc")]
#     loss = sum(losses) / num_gpus
#     acc = sum(accuracies) / num_gpus
#     metrics = {"loss": loss, "acc": acc}
#     return labeled_metrics, metrics
# raw_metrics = model.evaluate_generator(val_generator, val_samples=32,
#                                        max_q_size=8, nb_worker=1, pickle_safe=False)
# labeled_metrics, metrics = extract_metrics(model, raw_metrics)
# print(labeled_metrics)
# print(metrics)
In [ ]:
# Get predictions
for dataset in [("train", p, tc, val_generator)]: #, ("val", val_p, vc, val_generator)]:
name, perc, count, gen = dataset
ys = []
preds = []
fnames = []
batches = math.floor(count / batch_size)
for i in range(batches):
# Get batch.
# x, y = next(gen)
x, y, fname = next(gen)
# Get predictions
pred = model.predict(x)
# Store y and predictions
ys.extend(y) # y is always a list of parallel batches, even if only 1 batch
if isinstance(pred, list):
preds.extend(pred)
else:
preds.append(pred)
fnames.extend(fname)
In [ ]:
# Create DataFrames
y = np.concatenate(ys)
pred = np.concatenate(preds)
fname = np.concatenate(fnames)
y_df = pd.DataFrame(y, columns=[1,2,3])
pred_df = pd.DataFrame(pred, columns=[1,2,3])
fname_df = pd.DataFrame(np.atleast_2d(fname).T, columns=["filenames"])
# Create class, prediction, slide_num DataFrames
y_class_df = y_df.idxmax(axis=1)
pred_class_df = pred_df.idxmax(axis=1)
y_class_df.name = "actual"
pred_class_df.name = "predicted"
slide_info_df = fname_df.filenames.str.extract(r'(?P<class>\d)/\d+_(?P<slide_num>\d+)_\d+\.jpeg', expand=True)
slide_info_df["class"] = slide_info_df["class"].astype(int)
slide_info_df["slide_num"] = slide_info_df["slide_num"].astype(int)
df = pd.concat([fname_df, slide_info_df, y_class_df, pred_class_df], axis=1)
# sanity check
assert np.allclose(df["class"], df.actual)
# Create Contingency matrix
contingency_mat = pd.crosstab(df.actual, df.predicted)
# # Save DataFrames
# y_df.to_csv(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-y_df.csv".format(model_ck=model_file[:-5], perc=100*perc, data=name)), header=True)
# pred_df.to_csv(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-pred_df.csv".format(model_ck=model_file[:-5], perc=100*perc, data=name)), header=True)
# df.to_csv(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-df.csv".format(model_ck=model_file[:-5], perc=100*perc, data=name)), header=True)
# # Save results
# with open(os.path.join(exp_dir, "{model_ck}-{perc}%-{data}-results.txt".format(model_ck=model_file[:-5], perc=100*perc, data=name)), 'w') as f:
#     print("Dataset: {}".format(name), file=f)
#     print("Number of samples: {}".format(len(y_df)), file=f)
#     print(contingency_mat, file=f)
#     print("Accuracy: {}".format(np.mean(np.equal(y_class_df, pred_class_df))), file=f)
print("Number of samples: {}".format(len(y_df)))
print(contingency_mat)
print("Accuracy: {}".format(np.mean(np.equal(y_class_df, pred_class_df))))
In [ ]:
len(y_df), len(pred_df), len(fname_df), len(df)
In [ ]:
df
In [ ]:
df2 = df.loc[:, ["slide_num", "actual", "predicted"]]
df2
In [ ]:
df3 = df2.groupby("slide_num").mean()
df3["predicted_round"] = df3.predicted.map(round)
df3
In [ ]:
sum(df3.actual == df3.predicted_round) / len(df3)
In [ ]:
pd.crosstab(df3.actual, df3.predicted_round)
In [ ]:
gb = df2.groupby(["slide_num"]) #, "predicted"])
gb.describe()
In [ ]:
# # Read DataFrames
# y_df = pd.read_csv(os.path.join(exp_dir, "{}-y_df.csv".format(model_file[:-5])), index_col=0)
# pred_df = pd.read_csv(os.path.join(exp_dir, "{}-pred_df.csv".format(model_file[:-5])), index_col=0)
In [ ]:
# # Create Contingency matrix
# y_class = y_df.idxmax(axis=1)
# pred_class = pred_df.idxmax(axis=1)
# y_class.name = "Actual"
# pred_class.name = "Predicted"
# contingency_mat = pd.crosstab(y_class, pred_class)
# print("Number of samples: {}".format(len(y_df)))
# print(contingency_mat)
# print("Accuracy: {}".format(np.mean(np.equal(y_class, pred_class))))
In [ ]:
# # --- Alternate approach with NumPy arrays only
# y_c = np.argmax(y, axis=1) + 1
# pred_c = np.argmax(pred, axis=1) + 1
# y_actu = pd.Series(y_c, name="Actual")
# y_pred = pd.Series(pred_c, name="Predicted")
# contingency_mat = pd.crosstab(y_actu, y_pred)
# print("Number of samples: {}".format(len(y_c)))
# print(contingency_mat)
# print("Accuracy: {}".format(np.mean(np.equal(y_c, pred_c))))
In [ ]:
# path_template = os.path.join("visualize", "{dataset}", "Pred_{pred}-Actual_{actual}")
# for dataset in ["train", "val"]:
# for i in range(3):
# for j in range(3):
# os.makedirs(path_template.format(dataset=dataset, pred=i+1, actual=j+1), exist_ok=True)
In [ ]:
# filename_template = os.path.join(path_template, "{hash}.jpeg")
# batches = 8
# for dataset in [("train", train_generator_orig), ("val", val_generator_orig)]:
#     name, gen = dataset
#     print(name)
#     for i in range(batches):
#         # Get batch.
#         x_orig, y_orig = next(gen)
#         x = preprocess_input(np.copy(x_orig))
#         y = y_orig
#         # Get predictions.
#         raw_preds = model.predict(x)
#         raw_metrics = model.evaluate(x, y)
#         labeled_metrics, metrics = extract_metrics(model, raw_metrics)
#         # Create contingency matrix.
#         y = np.argmax(y, axis=1) + 1
#         preds = np.argmax(raw_preds, axis=1) + 1
#         y_actu = pd.Series(y, name="Actual")
#         y_pred = pd.Series(preds, name="Predicted")
#         contingency_mat = pd.crosstab(y_actu, y_pred)
#         # # Output images in directories based on misclassification.
#         # def plot(x, y):
#         #     r, c = 6, 6
#         #     fig, ax = plt.subplots(r, c)
#         #     plt.setp(ax, xticks=[], yticks=[])
#         #     plt.tight_layout()
#         #     batch_size = x.shape[0]
#         #     for i in range(r):
#         #         for j in range(c):
#         #             if i*c + j < batch_size:
#         #                 ax[i][j].imshow(x[i*c + j].astype(np.uint8))
#         #                 ax[i][j].set_xlabel("{preds}-{y}".format(y=y[i*c + j], preds=preds[i*c + j]))
#         # plot(x_orig, y)
#         # plt.show()
#         for n in range(x_orig.shape[0]):
#             img = Image.fromarray(x_orig[n].astype(np.uint8), 'RGB')
#             filename = filename_template.format(dataset=name, pred=preds[n], actual=y[n], hash=np.random.randint(1e6))
#             img.save(filename)
#         print(contingency_mat)
#         print(np.mean(y == preds))
#         print(labeled_metrics)
#         print(metrics)
In [ ]:
x, label, _ = next(train_generator_orig)
Image.fromarray((x[0]).astype(np.uint8))
In [ ]:
# Copy the example so preprocess_input's in-place mean subtraction doesn't modify x.
preds = resnet50.predict(preprocess_input(np.copy(x[0].reshape(1, 224, 224, 3))))
In [ ]:
print("Actual: {}".format(label[0]))
print("Pred: {}".format(preds[0]))
In [ ]:
# Stop processes cleanly. Otherwise, zombie processes will
# persist and hold onto GPU memory.
try:
    pool.terminate()
except:
    pass
for proc in mp.active_children():
    proc.terminate()
mp.active_children()