In [0]:
import os
import string
import random
import pickle
import time
import tensorflow as tf
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import tqdm
import tensorflow_datasets as tfds
tf.enable_eager_execution()
from distutils.version import LooseVersion
if LooseVersion(tf.__version__) < '1.14':
    raise Exception('This notebook is compatible with TensorFlow 1.14 or higher; for TensorFlow 1.13 or lower please use the previous version at https://github.com/tensorflow/tpu/blob/r1.13/tools/colab/fashion_mnist.ipynb')
print('TensorFlow version {}'.format(tf.__version__))
In [0]:
# EDIT THESE
IMAGENET_TFRECORDS_SOURCE_PATH = r'<source path in here>'
OUT_SUBSET_SHARDS_PATH = r'<destination path in here>'
IMAGENET_SIZE = 1281167
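As an optional sanity check before doing any work, the sketch below (not part of the pipeline itself) confirms the source directory is readable and counts the training shards it contains; it assumes the shard files follow the `imagenet2012-train.tfrecord*` naming used later in this notebook.
In [0]:
# Optional sanity check: confirm the source path is readable and count shards.
assert tf.io.gfile.exists(IMAGENET_TFRECORDS_SOURCE_PATH)
found_shards = [f for f in tf.io.gfile.listdir(IMAGENET_TFRECORDS_SOURCE_PATH)
                if f.startswith('imagenet2012-train.tfrecord')]
print('Found {} training shard(s)'.format(len(found_shards)))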
In [0]:
feature_description = {
    'label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'image': tf.io.FixedLenFeature([], tf.string),
    'file_name': tf.io.FixedLenFeature([], tf.string),
}
In [0]:
train_files = [f for f in tf.io.gfile.listdir(IMAGENET_TFRECORDS_SOURCE_PATH)
               if f.startswith('imagenet2012-train.tfrecord')]
train_paths = [os.path.join(IMAGENET_TFRECORDS_SOURCE_PATH, f) for f in train_files]
ds = tf.data.TFRecordDataset(train_paths)
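To verify that `feature_description` matches the records on disk, one can parse a single serialized example. This is an optional check, and it assumes the images are JPEG-encoded (as in the standard ImageNet TFRecords):
In [0]:
# Optional: parse the first serialized record to verify the schema.
for raw in ds.take(1):
    parsed = tf.io.parse_single_example(raw, feature_description)
    image = tf.io.decode_jpeg(parsed['image'])  # assumes JPEG-encoded images
    print(parsed['file_name'].numpy(), int(parsed['label']), image.shape)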
In [0]:
def get_labels(ds, N):
    # Iterate the dataset eagerly, collecting labels and file names for all
    # N training examples.
    it = iter(ds.prefetch(tf.data.experimental.AUTOTUNE))
    all_ys = []
    all_fns = []
    for _ in tqdm.tqdm(range(N)):
        sample = next(it)
        all_ys.append(sample['label'].numpy())
        all_fns.append(sample['file_name'].numpy())
    return np.array(all_ys), all_fns
all_y, all_filenames = get_labels(tfds.load(name='imagenet2012', split=tfds.Split.TRAIN), IMAGENET_SIZE)
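Since everything below stratifies on these labels, it can be worth checking that all 1000 classes are present before splitting; a minimal sketch:
In [0]:
# Optional: inspect the label histogram before stratified splitting.
counts = np.bincount(all_y)
print('{} classes, min {} / max {} images per class'.format(
    len(counts), counts.min(), counts.max()))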
In [0]:
def random_name():
    allchar = string.ascii_letters
    return ''.join(random.choice(allchar) for _ in range(8))

def dataset_subset_by_file_name(in_ds, subset_filenames):
    # Build a static hash table mapping the selected file names to 1, then
    # keep only the records whose 'file_name' feature is in the table.
    kv_init = tf.lookup.KeyValueTensorInitializer(
        np.array(subset_filenames), np.ones((len(subset_filenames),), dtype=np.int64),
        key_dtype=tf.string, value_dtype=tf.int64, name=random_name())
    ht = tf.lookup.StaticHashTable(kv_init, 0, name=random_name())
    def pred_fn(x):
        f = tf.io.parse_single_example(x, feature_description)
        return tf.equal(ht.lookup(f['file_name']), 1)
    return in_ds.filter(pred_fn)

def imagenet_subset(n_samples, seed):
    # Stratified sampling keeps the subset's class distribution close to
    # that of the full training set.
    splitter = StratifiedShuffleSplit(1, test_size=n_samples, random_state=seed)
    _, ndx = next(splitter.split(all_y, all_y))
    sub_fn = [all_filenames[int(i)] for i in ndx]
    return dataset_subset_by_file_name(ds, sub_fn)
def write_imagenet_subset_by_fn_sharded(out_dir, name, filenames, num_shards):
    # `out_path` is a directory that will hold `num_shards` TFRecord shards.
    out_path = os.path.join(out_dir, name)
    if tf.io.gfile.exists(out_path):
        print('Skipping already existing {}'.format(out_path))
        return
    print('Generating {} ...'.format(out_path))
    tf.io.gfile.mkdir(out_path)
    t1 = time.time()
    sub_ds = dataset_subset_by_file_name(ds, filenames)
    shard_base_path = os.path.join(out_path, '{}.tfrecord-'.format(name))
    def reduce_func(key, dataset):
        # Write all records assigned to shard `key` into one TFRecord file.
        filename = tf.strings.join([shard_base_path, tf.strings.as_string(key)])
        writer = tf.data.experimental.TFRecordWriter(filename)
        writer.write(dataset.map(lambda _, x: x))
        return tf.data.Dataset.from_tensors(filename)
    # Assign record i to shard i % num_shards and reduce each group into its
    # own file; iterating the resulting dataset drives the writes.
    write_ds = sub_ds.enumerate()
    write_ds = write_ds.apply(tf.data.experimental.group_by_window(
        lambda i, _: i % num_shards, reduce_func, tf.int64.max))
    for _ in write_ds:
        pass
    t2 = time.time()
    print('Built subset {} in {:.2f}s'.format(name, t2 - t1))

def write_imagenet_subset_sharded(out_dir, name, ds_filenames, ds_y, n_samples, seed, num_shards):
    # Draw a stratified subset of n_samples file names and write it out.
    splitter = StratifiedShuffleSplit(1, test_size=n_samples, random_state=seed)
    _, ndx = next(splitter.split(ds_y, ds_y))
    sub_fn = [ds_filenames[int(i)] for i in ndx]
    write_imagenet_subset_by_fn_sharded(out_dir, name, sub_fn, num_shards)
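The sharding above relies on `tf.data.experimental.group_by_window` round-robining enumerated elements by `i % num_shards`. A toy illustration of that grouping, batching instead of writing files just to make the shard contents visible:
In [0]:
# Toy illustration of the sharding scheme: elements of range(10) grouped
# into 3 "shards" by index modulo 3.
toy = tf.data.Dataset.range(10).enumerate()
grouped = toy.apply(tf.data.experimental.group_by_window(
    lambda i, _: i % 3,
    lambda key, window: window.batch(100).map(lambda _, x: x),
    tf.int64.max))
for shard in grouped:
    print(shard.numpy())  # e.g. [0 3 6 9], [1 4 7], [2 5 8]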
In [0]:
n_train_val = len(all_filenames)
N_VAL = 50000
VAL_SEED = 131
# Validation set
trainval_splitter = StratifiedShuffleSplit(1, test_size=N_VAL, random_state=VAL_SEED)
train_ndx, val_ndx = next(trainval_splitter.split(all_y, all_y))
train_fn = [all_filenames[int(i)] for i in train_ndx]
train_y = all_y[train_ndx]
val_fn = [all_filenames[int(i)] for i in val_ndx]
val_y = all_y[val_ndx]
print('Split train-val set of {} into {} train and {} val'.format(
    n_train_val, len(train_fn), len(val_fn)))
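To confirm the split really is stratified, one can check the per-class counts; with N_VAL = 50000 over 1000 classes, each class should contribute roughly 50 validation images. A quick check:
In [0]:
# Optional: verify the validation set is class-balanced (about 50 per class).
val_counts = np.bincount(val_y, minlength=1000)
print('val per-class counts: min {}, max {}'.format(
    val_counts.min(), val_counts.max()))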
In [0]:
split_path = os.path.join(OUT_SUBSET_SHARDS_PATH, 'imagenet_tv{}s{}_split.pkl'.format(N_VAL, VAL_SEED))
with tf.io.gfile.GFile(split_path, mode='wb') as f_split:
    split_data = dict(train_fn=train_fn, train_y=train_y, val_fn=val_fn, val_y=val_y)
    pickle.dump(split_data, f_split)
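Later jobs can restore the exact same split by unpickling that file; a minimal sketch:
In [0]:
# Restore the persisted split in a later session.
with tf.io.gfile.GFile(split_path, mode='rb') as f_split:
    split_data = pickle.load(f_split)
print(len(split_data['train_fn']), len(split_data['val_fn']))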
In [0]:
# Held-out validation set
write_imagenet_subset_by_fn_sharded(OUT_SUBSET_SHARDS_PATH,
                                    'imagenet_tv{}s{}_val'.format(N_VAL, VAL_SEED),
                                    val_fn, num_shards=256)
# 1% and 10% stratified subsets of the train split, five seeds each
for divisor in (100, 10):
    n_sub = len(train_fn) // divisor
    for seed in (12345, 23456, 34567, 45678, 56789):
        write_imagenet_subset_sharded(
            OUT_SUBSET_SHARDS_PATH,
            'imagenet_tv{}s{}_{}_seed{}'.format(N_VAL, VAL_SEED, n_sub, seed),
            train_fn, train_y, n_sub, seed, num_shards=256)
In [0]:
# 1% and 10% stratified subsets of the full training set, five seeds each
for divisor in (100, 10):
    n_sub = len(all_filenames) // divisor
    for seed in (12345, 23456, 34567, 45678, 56789):
        write_imagenet_subset_sharded(
            OUT_SUBSET_SHARDS_PATH,
            'imagenet_{}_seed{}'.format(n_sub, seed),
            all_filenames, all_y, n_sub, seed, num_shards=256)
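After the writers finish, it is worth confirming a subset round-trips with the expected number of records. This sketch counts the records of the held-out validation subset; the directory name follows the pattern used above, so adjust it if you changed N_VAL or VAL_SEED:
In [0]:
# Optional: count records in one generated subset to confirm it round-trips.
subset_name = 'imagenet_tv{}s{}_val'.format(N_VAL, VAL_SEED)
subset_dir = os.path.join(OUT_SUBSET_SHARDS_PATH, subset_name)
shard_paths = [os.path.join(subset_dir, f) for f in tf.io.gfile.listdir(subset_dir)]
n_records = sum(1 for _ in tf.data.TFRecordDataset(shard_paths))
print('{}: {} records (expected {})'.format(subset_name, n_records, N_VAL))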