In [1]:
# Initial setup following http://docs.chainer.org/en/stable/tutorial/basic.html
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
import chainer.dataset
import chainer.datasets
In [2]:
from chainer.datasets import TupleDataset
x = np.arange(10)
t = x * x
data = TupleDataset(x, t)
print('data type: {}, len: {}'.format(type(data), len(data)))
In [3]:
# Unlike a numpy array, TupleDataset does not have a shape attribute; this raises AttributeError.
data.shape
The i-th data can be accessed by data[i], which is a tuple of the format $(x_i, t_i, ...)$.
In [4]:
# Get the 4th data -> x=3, t=9
data[3]
Out[4]:
Slice accessing
When TupleDataset is accessed by slice indexing, e.g. data[i:j], the returned value is a list of tuples
$[(x_i, t_i), ..., (x_{j-1}, t_{j-1})]$.
In [5]:
# Get the 1st, 2nd, 3rd and 4th data at the same time.
examples = data[0:4]
print(examples)
print('examples type: {}, len: {}'
.format(type(examples), len(examples)))
To convert examples into minibatch format, you can use the concat_examples function in chainer.dataset.
Its return value is in the format ([x_array], [t_array], ...).
In [6]:
from chainer.dataset import concat_examples
data_minibatch = concat_examples(examples)
#print(data_minibatch)
#print('data_minibatch type: {}, len: {}'
# .format(type(data_minibatch), len(data_minibatch)))
x_minibatch, t_minibatch = data_minibatch
# Now it is in array format, which has a shape attribute.
print('x_minibatch = {}, type: {}, shape: {}'.format(x_minibatch, type(x_minibatch), x_minibatch.shape))
print('t_minibatch = {}, type: {}, shape: {}'.format(t_minibatch, type(t_minibatch), t_minibatch.shape))
In [10]:
from chainer.datasets import DictDataset
x = np.arange(10)
t = x * x
# To construct `DictDataset`, you can specify each key-value pair by passing "key=value" in kwargs.
data = DictDataset(x=x, t=t)
print('data type: {}, len: {}'.format(type(data), len(data)))
In [16]:
# Get the 3rd data.
example = data[2]
print(example)
print('example type: {}, len: {}'
      .format(type(example), len(example)))
# You can access each value via key
print('x: {}, t: {}'.format(example['x'], example['t']))
ImageDataset is a utility class for image datasets.
When a dataset becomes very big (for example the ImageNet dataset), it is not practical to load all the images into memory, unlike CIFAR-10 or CIFAR-100.
In this case, the ImageDataset class can be used to load each image from storage every time a minibatch is created.
[Note] ImageDataset holds only the images; if you also need label information (for example when working on an image classification task), use LabeledImageDataset instead.
To use ImageDataset, you need to create a text file which contains the list of image paths.
See data/images.dat for what the paths text file looks like.
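As a rough, hypothetical sketch of that format (the file names below are made up for illustration, not the actual contents of data/images.dat): the file simply lists one image path per line, resolved relative to the root argument of ImageDataset.
In [ ]:
# Hypothetical example: write a paths file in the format ImageDataset expects.
# One image path per line, relative to the `root` argument (file names are made up).
sample_paths = ['sample1.png', 'sample2.png', 'sample3.png']
with open('./data/images_example.dat', 'w') as f:
    f.write('\n'.join(sample_paths))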
In [28]:
import os
from chainer.datasets import ImageDataset
# print('Current directory: ', os.path.abspath(os.curdir))
filepath = './data/images.dat'
image_dataset = ImageDataset(filepath, root='./data/images')
print('image_dataset type: {}, len: {}'.format(type(image_dataset), len(image_dataset)))
We have created image_dataset above; however, the images are not loaded into memory yet.
The image data is loaded from storage every time you access the dataset by index, for efficient memory usage.
In [31]:
# Access the i-th image by image_dataset[i].
# The image data is loaded here, only for the 0-th image.
img = image_dataset[0]
# img is a numpy array, already aligned as (channels, height, width),
# which is the standard shape format to feed into a convolutional layer.
print('img', type(img), img.shape)
To use LabeledImageDataset, you need to create a text file which contains the list of image paths and labels.
See data/images_labels.dat for what the text file looks like.
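As with ImageDataset, here is a hypothetical sketch of the format (the file names and labels below are made up, not the actual contents of data/images_labels.dat): each line holds an image path and its integer label, separated by whitespace.
In [ ]:
# Hypothetical example: write a "path label" file in the format LabeledImageDataset expects.
# Each line is an image path and an integer label separated by whitespace (values are made up).
sample_lines = ['sample1.png 0', 'sample2.png 1', 'sample3.png 0']
with open('./data/images_labels_example.dat', 'w') as f:
    f.write('\n'.join(sample_lines))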
In [32]:
import os
from chainer.datasets import LabeledImageDataset
# print('Current directory: ', os.path.abspath(os.curdir))
filepath = './data/images_labels.dat'
labeled_image_dataset = LabeledImageDataset(filepath, root='./data/images')
print('labeled_image_dataset type: {}, len: {}'.format(type(labeled_image_dataset), len(labeled_image_dataset)))
We have created labeled_image_dataset above; however, the images are not loaded into memory yet.
The image data is loaded from storage every time you access the dataset by index, for efficient memory usage.
In [34]:
# Access the i-th image and label by labeled_image_dataset[i].
# The image data is loaded here, only for the 0-th image.
img, label = labeled_image_dataset[0]
print('img', type(img), img.shape)
print('label', type(label), label)
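chainer.datasets also provides utility functions to split a dataset, for example split_dataset_n_random, which splits a dataset into n subsets of (roughly) equal size chosen at random. The next cell is a minimal sketch of its usage; splitting into 2 subsets is an arbitrary choice for illustration.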
In [9]:
# Split `data` into 2 random subsets of (roughly) equal size.
subsets = datasets.split_dataset_n_random(data, 2)
If you want to define a custom dataset, DatasetMixin provides the base functionality to make it compatible with the other dataset formats.
Another important usage of DatasetMixin is to preprocess the input data, including data augmentation.
To implement a subclass of DatasetMixin, you usually need to implement these 3 functions:
- __init__(self, *args) function: It is not compulsory, but it is usually used to store the data that the dataset holds.
- __len__(self) function: Iterators need to know the length of the dataset to determine the end of an epoch.
- get_example(self, i) function: Returns the i-th example; preprocessing and data augmentation are usually written here.
In [10]:
from chainer.dataset import DatasetMixin
print_debug = True
class SimpleDataset(DatasetMixin):
    def __init__(self, values):
        self.values = values

    def __len__(self):
        return len(self.values)

    def get_example(self, i):
        if print_debug:
            print('get_example, i = {}'.format(i))
        return self.values[i]
The important function in DatasetMixin is get_example(self, i).
This function is called when you access data[i].
In [11]:
simple_data = SimpleDataset([0, 1, 4, 9, 16, 25])
In [12]:
# get_example(self, i) is called when data is accessed by data[i]
simple_data[3]
Out[12]:
In [13]:
# data can be accessed using slice indexing as well
simple_data[1:3]
Out[13]:
The important point is that the get_example function is called every time the data is accessed by [] indexing.
Thus you may put random value generation for data augmentation in get_example.
In [14]:
import numpy as np
from chainer.dataset import DatasetMixin
print_debug = False
def calc(x):
    return x * x

class SquareNoiseDataset(DatasetMixin):
    def __init__(self, values):
        self.values = values

    def __len__(self):
        return len(self.values)

    def get_example(self, i):
        if print_debug:
            print('get_example, i = {}'.format(i))
        x = self.values[i]
        t = calc(x)
        # Add small Gaussian noise to the target value every time it is accessed.
        t_noise = t + np.random.normal(0, 0.1)
        return x, t_noise
In [15]:
square_noise_data = SquareNoiseDataset(np.arange(10))
SquareNoiseDataset defined above adds small Gaussian noise to the original value:
every time a value is accessed, get_example is called and different noise is added, even if you access the data with the same index.
In [16]:
# Accessing to the same index, but the value is different!
print('Accessing square_noise_data[3]', )
print('1st: ', square_noise_data[3])
print('2nd: ', square_noise_data[3])
print('3rd: ', square_noise_data[3])
In [17]:
# Same applies for slice index accessing.
print('Accessing square_noise_data[0:4]')
print('1st: ', square_noise_data[0:4])
print('2nd: ', square_noise_data[0:4])
print('3rd: ', square_noise_data[0:4])
To convert examples into minibatch format, you can use the concat_examples function in chainer.dataset, in the same way as explained for TupleDataset.
In [19]:
from chainer.dataset import concat_examples
examples = square_noise_data[0:4]
print('examples = {}'.format(examples))
data_minibatch = concat_examples(examples)
x_minibatch, t_minibatch = data_minibatch
# Now it is in array format, which has a shape attribute.
print('x_minibatch = {}, type: {}, shape: {}'.format(x_minibatch, type(x_minibatch), x_minibatch.shape))
print('t_minibatch = {}, type: {}, shape: {}'.format(t_minibatch, type(t_minibatch), t_minibatch.shape))
Let's see a concrete example that uses TransformDataset to create a new dataset from an original TupleDataset by adding small noise.
In [23]:
from chainer.datasets import TransformDataset
x = np.arange(10)
t = x * x - x
original_dataset = TupleDataset(x, t)
def transform_function(in_data):
    x_i, t_i = in_data
    # Add small Gaussian noise to the target value.
    new_t_i = t_i + np.random.normal(0, 0.1)
    return x_i, new_t_i
transformed_dataset = TransformDataset(original_dataset, transform_function)
In [24]:
original_dataset[:3]
Out[24]:
In [26]:
# Now Gaussian noise is added (in transform_function) to the original_dataset.
transformed_dataset[:3]
Out[26]: