Generate splits of ImageNet and save them in sharded TFRecord files

Licensed under the Apache License, Version 2.0


In [0]:
import os

import string
import random
import pickle
import zipfile
import io
import itertools

import time
import tensorflow as tf
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from matplotlib import pyplot as plt
import tqdm
import tensorflow_datasets as tfds

tf.enable_eager_execution()


from distutils.version import LooseVersion
if LooseVersion(tf.__version__) < LooseVersion('1.14'):
    raise Exception('This notebook requires TensorFlow 1.14 or higher.')

print('TensorFlow version {}'.format(tf.__version__))

Paths to the ImageNet TFRecord source files and the output destination


In [0]:
# EDIT THESE
IMAGENET_TFRECORDS_SOURCE_PATH = r'<source path in here>'
OUT_SUBSET_SHARDS_PATH = r'<destination path in here>'
IMAGENET_SIZE = 1281167

Description of the features in the ImageNet TFRecord files


In [0]:
feature_description = {
    'label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'image': tf.io.FixedLenFeature([], tf.string),
    'file_name': tf.io.FixedLenFeature([], tf.string),
}

Load the TFRecord dataset


In [0]:
train_files = [f for f in tf.io.gfile.listdir(IMAGENET_TFRECORDS_SOURCE_PATH)
               if f.startswith('imagenet2012-train.tfrecord')]
train_paths = [os.path.join(IMAGENET_TFRECORDS_SOURCE_PATH, f) for f in train_files]
ds = tf.data.TFRecordDataset(train_paths)
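
As an optional sanity check, the sketch below parses a single raw record with the feature description above and prints its label and file name; it assumes the records really follow that schema.


In [0]:
# Optional sanity check: parse one raw record and inspect its label and file name.
# Assumes the records follow feature_description above.
for raw_record in ds.take(1):
  parsed = tf.io.parse_single_example(raw_record, feature_description)
  print('label:', int(parsed['label'].numpy()))
  print('file_name:', parsed['file_name'].numpy())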

Get ground-truth labels and file names for all samples in the training set


In [0]:
def get_labels(ds, N):
  # Iterate once over the TFDS imagenet2012 training set, collecting the
  # integer label and file name of every example.
  it = iter(ds.prefetch(tf.data.experimental.AUTOTUNE))

  all_ys = []
  all_fns = []
  for _ in tqdm.tqdm(range(N)):
    sample = next(it)
    all_ys.append(int(sample['label'].numpy()))
    all_fns.append(sample['file_name'].numpy())

  return np.array(all_ys), all_fns

all_y, all_filenames = get_labels(tfds.load(name='imagenet2012', split=tfds.Split.TRAIN), IMAGENET_SIZE)
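
A quick consistency check (a sketch, not part of the original pipeline): the number of collected labels and file names should match IMAGENET_SIZE and cover 1000 classes.


In [0]:
# Sanity check (optional): we expect IMAGENET_SIZE labels covering 1000 classes.
assert len(all_y) == IMAGENET_SIZE
assert len(all_filenames) == IMAGENET_SIZE
print('Classes found: {}'.format(len(np.unique(all_y))))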

Define functions for getting ImageNet subsets


In [0]:
def random_name():
  # Random 8-character name to avoid op-name collisions between hash tables.
  allchar = string.ascii_letters
  return "".join([random.choice(allchar) for _ in range(8)])

def dataset_subset_by_file_name(in_ds, subset_filenames):
  # Build a static hash table mapping the selected file names to 1 (default 0),
  # then keep only the serialized examples whose 'file_name' feature is in the table.
  kv_init = tf.lookup.KeyValueTensorInitializer(
      np.array(subset_filenames), np.ones((len(subset_filenames),), dtype=np.int64),
      key_dtype=tf.string, value_dtype=tf.int64, name=random_name())
  ht = tf.lookup.StaticHashTable(kv_init, 0, name=random_name())

  def pred_fn(x):
    f = tf.io.parse_single_example(x, feature_description)
    return tf.equal(ht.lookup(f['file_name']), 1)

  return in_ds.filter(pred_fn)


def imagenet_subset(n_samples, seed):
  # Draw a class-stratified sample of `n_samples` training examples.
  splitter = StratifiedShuffleSplit(1, test_size=n_samples, random_state=seed)
  _, ndx = next(splitter.split(all_y, all_y))

  sub_fn = [all_filenames[int(i)] for i in ndx]

  return dataset_subset_by_file_name(ds, sub_fn)



def write_imagenet_subset_by_fn_sharded(out_dir, name, filenames, num_shards, group='brain-ams'):
  # Filter the training set down to `filenames` and write it to the directory
  # `out_dir/name` as `num_shards` TFRecord shards.
  out_path = os.path.join(out_dir, name)
  if tf.io.gfile.exists(out_path):
    print('Skipping already existing {}'.format(out_path))
    return
  print('Generating {} ...'.format(out_path))
  tf.io.gfile.mkdir(out_path)
  t1 = time.time()
  sub_ds = dataset_subset_by_file_name(ds, filenames)

  # Write the subset round-robin across `num_shards` TFRecord shards: each
  # example is assigned to shard (index % num_shards), and `reduce_func`
  # writes each group to its own file.
  shard_base_path = os.path.join(out_path, '{}.tfrecord-'.format(name))
  def reduce_func(key, dataset):
    filename = tf.strings.join([shard_base_path, tf.strings.as_string(key)])
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(dataset.map(lambda _, x: x))
    return tf.data.Dataset.from_tensors(filename)

  write_ds = sub_ds.enumerate()
  write_ds = write_ds.apply(tf.data.experimental.group_by_window(
      lambda i, _: i % num_shards, reduce_func, tf.int64.max))
  # Iterating the dataset drives the actual writing of the shard files.
  for _ in write_ds:
    pass

  t2 = time.time()
  print('Built subset {} in {:.2f}s'.format(
      name, t2 - t1
  ))


def write_imagenet_subset_sharded(out_dir, name, ds_filenames, ds_y, n_samples, seed, num_shards, group='brain-ams'):
  # Draw a class-stratified subset of `n_samples` examples and write it as shards.
  splitter = StratifiedShuffleSplit(1, test_size=n_samples, random_state=seed)
  _, ndx = next(splitter.split(ds_y, ds_y))

  sub_fn = [ds_filenames[int(i)] for i in ndx]

  write_imagenet_subset_by_fn_sharded(out_dir, name, sub_fn, num_shards, group=group)
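
To illustrate how these helpers fit together, here is a minimal usage sketch (not part of the original pipeline) that draws a small stratified subset and counts the records that survive the file-name filter. Note that the filter scans the full training set, so this pass is slow.


In [0]:
# Example usage sketch: draw a 1000-example stratified subset and count the
# records that pass the file-name filter. Scanning the full training set is slow.
small_ds = imagenet_subset(n_samples=1000, seed=0)
count = sum(1 for _ in small_ds)
print('Records in subset: {}'.format(count))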

Build our subsets

Split into train and val


In [0]:
n_train_val = len(all_filenames)
N_VAL = 50000
VAL_SEED = 131

# Hold out a stratified validation set from the training data
trainval_splitter = StratifiedShuffleSplit(1, test_size=N_VAL, random_state=VAL_SEED)
train_ndx, val_ndx = next(trainval_splitter.split(all_y, all_y))

train_fn = [all_filenames[int(i)] for i in train_ndx]
train_y = all_y[train_ndx]
val_fn = [all_filenames[int(i)] for i in val_ndx]
val_y = all_y[val_ndx]

print('Split train-val set of {} into {} train and {} val'.format(
    n_train_val, len(train_fn), len(val_fn)
))
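
As an optional check (a sketch), the two folds should be disjoint and the validation fold should contain all 1000 classes.


In [0]:
# Optional check: the train and val file-name sets should be disjoint and
# the val fold should cover all 1000 classes.
assert len(set(train_fn) & set(val_fn)) == 0
print('Val classes: {}'.format(len(np.unique(val_y))))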

In [0]:
split_path = os.path.join(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_split.pkl'.format(N_VAL, VAL_SEED))
with tf.io.gfile.GFile(split_path, mode='wb') as f_split:
  split_data = dict(train_fn=train_fn, train_y=train_y, val_fn=val_fn, val_y=val_y)
  pickle.dump(split_data, f_split)
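
For later runs, the pickled split can be read back as in this sketch; the path and dictionary keys match what was written above.


In [0]:
# Sketch: reload the pickled train/val split written above.
with tf.io.gfile.GFile(split_path, mode='rb') as f_split:
  split_data = pickle.load(f_split)
print('Reloaded {} train / {} val file names'.format(
    len(split_data['train_fn']), len(split_data['val_fn'])))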

In [0]:
# Val
write_imagenet_subset_by_fn_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_val'.format(N_VAL, VAL_SEED),
                                    val_fn, num_shards=256)


# 1% subsets
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//100, 12345),
                              train_fn, train_y, len(train_fn)//100, 12345, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//100, 23456),
                              train_fn, train_y, len(train_fn)//100, 23456, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//100, 34567),
                              train_fn, train_y, len(train_fn)//100, 34567, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//100, 45678),
                              train_fn, train_y, len(train_fn)//100, 45678, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//100, 56789),
                              train_fn, train_y, len(train_fn)//100, 56789, num_shards=256)


# 10% subsets
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//10, 12345),
                              train_fn, train_y, len(train_fn)//10, 12345, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//10, 23456),
                              train_fn, train_y, len(train_fn)//10, 23456, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//10, 34567),
                              train_fn, train_y, len(train_fn)//10, 34567, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//10, 45678),
                              train_fn, train_y, len(train_fn)//10, 45678, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//10, 56789),
                              train_fn, train_y, len(train_fn)//10, 56789, num_shards=256)
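
An optional verification sketch: count the records in one of the subsets generated above by reading its shards back. The directory name below matches one of the 1% subsets written in this cell.


In [0]:
# Optional verification sketch: count the records in one generated subset
# by reading its shards back.
subset_name = 'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, len(train_fn)//100, 12345)
shard_paths = tf.io.gfile.glob(os.path.join(OUT_SUBSET_SHARDS_PATH, subset_name, '*.tfrecord-*'))
n_records = sum(1 for _ in tf.data.TFRecordDataset(shard_paths))
print('{}: {} shards, {} records'.format(subset_name, len(shard_paths), n_records))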

Write train subsets (no validation split; use ImageNet validation as evaluation set)


In [0]:
# 1% subsets
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//100, 12345),
                              all_filenames, all_y, len(all_filenames)//100, 12345, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//100, 23456),
                              all_filenames, all_y, len(all_filenames)//100, 23456, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//100, 34567),
                              all_filenames, all_y, len(all_filenames)//100, 34567, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//100, 45678),
                              all_filenames, all_y, len(all_filenames)//100, 45678, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//100, 56789),
                              all_filenames, all_y, len(all_filenames)//100, 56789, num_shards=256)


# 10% subsets
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//10, 12345),
                              all_filenames, all_y, len(all_filenames)//10, 12345, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//10, 23456),
                              all_filenames, all_y, len(all_filenames)//10, 23456, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//10, 34567),
                              all_filenames, all_y, len(all_filenames)//10, 34567, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//10, 45678),
                              all_filenames, all_y, len(all_filenames)//10, 45678, num_shards=256)
write_imagenet_subset_sharded(OUT_SUBSET_SHARDS_PATH, 'imagenet_{}_seed{}'.format(IMAGENET_SIZE//10, 56789),
                              all_filenames, all_y, len(all_filenames)//10, 56789, num_shards=256)
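
Finally, a sketch of how a generated subset directory can be consumed for training. This assumes the 'image' feature holds an encoded JPEG string, consistent with the feature description above; adjust the decoding step if your records differ.


In [0]:
# Sketch: load a generated subset for training. Assumes 'image' holds an
# encoded JPEG string, as suggested by feature_description above.
def load_subset(subset_dir):
  shard_paths = tf.io.gfile.glob(os.path.join(subset_dir, '*.tfrecord-*'))
  def parse_fn(raw):
    f = tf.io.parse_single_example(raw, feature_description)
    image = tf.image.decode_jpeg(f['image'], channels=3)
    return image, f['label']
  return tf.data.TFRecordDataset(shard_paths).map(
      parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

example_dir = os.path.join(OUT_SUBSET_SHARDS_PATH,
                           'imagenet_{}_seed{}'.format(IMAGENET_SIZE//100, 12345))
subset_ds = load_subset(example_dir)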