In [13]:
import torch
import torch.utils.data as Data

torch.manual_seed(1)    # reproducible


Out[13]:
<torch._C.Generator at 0x7f515c4d6258>

Create Dataset


In [15]:
x = torch.linspace(1, 10, 10)       # this is x data (torch tensor)
y = torch.linspace(10, 1, 10)       # this is y data (torch tensor)
torch.cat((x.view(-1, 1), y.view(-1, 1)), 1)    # stack x and y side by side to show the (x, y) pairs


Out[15]:
    1    10
    2     9
    3     8
    4     7
    5     6
    6     5
    7     4
    8     3
    9     2
   10     1
[torch.FloatTensor of size 10x2]

In [ ]:
dataset = Data.TensorDataset(x, y)    # positional args; the old data_tensor=/target_tensor= keywords were removed in later PyTorch versions
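
A TensorDataset supports len() and integer indexing: dataset[i] returns the i-th (x, y) pair. A quick sanity check (the exact printed form of the pair depends on the PyTorch version):

In [ ]:
print(len(dataset))    # 10
print(dataset[0])      # first (x, y) pair: (1.0, 10.0)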

DataLoader (splits the entire dataset and loads it in batch-size units)


In [29]:
BATCH_SIZE = 5

loader = Data.DataLoader(
    dataset=dataset,            # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # random shuffle for training
    num_workers=1,              # subprocesses for loading data
)

In [34]:
# EPOCH: one full pass over the entire dataset is called one epoch.

for epoch in range(3):   # train entire dataset 3 times
    for step, (batch_x, batch_y) in enumerate(loader):  # for each training step
        
        print('Epoch: ', epoch, '| Step: ', step, '| batch x: ',
              batch_x.numpy(), '| batch y: ', batch_y.numpy())

        # train your model


Epoch:  0 | Step:  0 | batch x:  [  3.   8.   2.   9.   4.  10.   7.   6.] | batch y:  [ 8.  3.  9.  2.  7.  1.  4.  5.]
Epoch:  0 | Step:  1 | batch x:  [ 5.  1.] | batch y:  [  6.  10.]
Epoch:  1 | Step:  0 | batch x:  [  7.   5.   9.   3.   6.   2.  10.   8.] | batch y:  [ 4.  6.  2.  8.  5.  9.  1.  3.]
Epoch:  1 | Step:  1 | batch x:  [ 4.  1.] | batch y:  [  7.  10.]
Epoch:  2 | Step:  0 | batch x:  [ 2.  1.  7.  3.  6.  4.  8.  9.] | batch y:  [  9.  10.   4.   8.   5.   7.   3.   2.]
Epoch:  2 | Step:  1 | batch x:  [ 10.   5.] | batch y:  [ 1.  6.]

What if the dataset is not evenly divisible by the batch size?


In [35]:
BATCH_SIZE = 8

loader = Data.DataLoader(
    dataset=dataset,      # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # random shuffle for training
    num_workers=1,              # subprocesses for loading data
)

for epoch in range(3):   # train entire dataset 3 times
    for step, (batch_x, batch_y) in enumerate(loader):  # for each training step
        
        # train your model here...
        
        print('Epoch: ', epoch, '| Step: ', step, '| batch x: ',
              batch_x.numpy(), '| batch y: ', batch_y.numpy())


Epoch:  0 | Step:  0 | batch x:  [  4.   9.   7.  10.   8.   5.   3.   1.] | batch y:  [  7.   2.   4.   1.   3.   6.   8.  10.]
Epoch:  0 | Step:  1 | batch x:  [ 2.  6.] | batch y:  [ 9.  5.]
Epoch:  1 | Step:  0 | batch x:  [  2.  10.   5.   8.   4.   9.   7.   3.] | batch y:  [ 9.  1.  6.  3.  7.  2.  4.  8.]
Epoch:  1 | Step:  1 | batch x:  [ 1.  6.] | batch y:  [ 10.   5.]
Epoch:  2 | Step:  0 | batch x:  [  4.   6.   5.   8.  10.   1.   7.   3.] | batch y:  [  7.   5.   6.   3.   1.  10.   4.   8.]
Epoch:  2 | Step:  1 | batch x:  [ 2.  9.] | batch y:  [ 9.  2.]
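
The leftover samples simply form a smaller final batch. If a short last batch is undesirable (e.g. for layers that need full batch statistics), DataLoader can discard it via the drop_last flag; a minimal sketch:

In [ ]:
loader = Data.DataLoader(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    num_workers=1,
    drop_last=True,    # drop the final incomplete batch (here, the 2 leftover samples)
)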

Create Dataset from an Image Folder


In [45]:
import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms

# Input pipeline from a folder containing multiple subfolders of images.
# We can inspect the classes, class_to_idx, and the (filename, index) pairs.

img_dir = "./images"
img_data = dsets.ImageFolder(img_dir, transforms.Compose([
    transforms.RandomSizedCrop(224),    # deprecated alias of RandomResizedCrop in newer torchvision
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
]))

print(img_data.classes)
print(img_data.class_to_idx)
print(img_data.imgs)


['dogs', 'onepun']
{'dogs': 0, 'onepun': 1}
[('./images/onepun/5320.jpg', 1), ('./images/onepun/5720.jpg', 1), ('./images/onepun/1454.jpg', 1), ('./images/dogs/3b1a8d3fca2adb68a7a908b73005764a.jpg', 0), ('./images/dogs/dog-650299.jpg', 0)]
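
ImageFolder expects one subdirectory per class under the root; the subdirectory names become the class labels, assigned indices in sorted order (hence dogs -> 0, onepun -> 1). The listing above corresponds to a layout like:

./images/
    dogs/
        3b1a8d3fca2adb68a7a908b73005764a.jpg
        dog-650299.jpg
    onepun/
        1454.jpg
        5320.jpg
        5720.jpg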

DataLoader


In [50]:
# After we get the list of images, we can turn the list into batches of images
# with torch.utils.data.DataLoader()

img_batch = Data.DataLoader(img_data, batch_size=3,
                            shuffle=True, num_workers=2)

for img,label in img_batch:
    print(img.size())
    print(label)


torch.Size([3, 3, 224, 224])

 1
 0
 1
[torch.LongTensor of size 3]

torch.Size([2, 3, 224, 224])

 0
 1
[torch.LongTensor of size 2]
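
The final batch again comes up short (2 images instead of 3) because the folder holds 5 images in total.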

Pre-Defined Datasets in the Torchvision Library

  • MNIST
  • COCO
  • Captions
  • Detection
  • LSUN
  • Imagenet-12
  • CIFAR
  • STL10
  • SVHN
  • PhotoTour
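
Most of these share a similar constructor (root, train, transform, download). As a sketch, CIFAR-10 could be loaded the same way as the MNIST example below (not run here):

In [ ]:
cifar_train = dsets.CIFAR10(root='./data',
                            train=True,
                            transform=transforms.ToTensor(),
                            download=True)
# Each sample is a (3, 32, 32) image tensor paired with an integer label.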

In [57]:
# MNIST Dataset 
train_dataset = dsets.MNIST(root='./data', 
                            train=True, 
                            transform=transforms.ToTensor(),  
                            download=True)

# Select one data pair (read data from disk).
image, label = train_dataset[0]
print(image.size())
print(label)


torch.Size([1, 28, 28])
5

DataLoader


In [60]:
# Data loader (this provides queueing and threading in a very simple way).
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=100, 
                                           shuffle=True,
                                           num_workers=2)

# When iteration starts, the queue and worker threads begin loading data from disk.
data_iter = iter(train_loader)

# Mini-batch of images and labels.
images, labels = next(data_iter)    # Python 3 style; data_iter.next() works only in Python 2

# # Actual usage of data loader is as below.
# for images, labels in train_loader:
#     # Your training code will be written here
#     pass

images.size()


Out[60]:
torch.Size([100, 1, 28, 28])

Define Custom Dataset


In [56]:
# Build a custom dataset by subclassing Data.Dataset as below.
class CustomDataset(Data.Dataset):
    def __init__(self):
        # TODO
        # 1. Initialize file paths or a list of file names.
        pass
    def __getitem__(self, index):
        # TODO
        # 1. Read one sample from file (e.g. using numpy.fromfile, PIL.Image.open).
        # 2. Preprocess the sample (e.g. with torchvision.transforms).
        # 3. Return a data pair (e.g. image and label).
        pass
    def __len__(self):
        # Change 0 to the total size of your dataset.
        return 0

# Then you can simply use torch's prebuilt data loader.
custom_dataset = CustomDataset()
train_loader = Data.DataLoader(dataset=custom_dataset,
                               batch_size=100,
                               shuffle=True,
                               num_workers=2)
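
As a concrete illustration, here is a minimal sketch of a dataset over a list of (image path, label) pairs; the paths in the usage comment are placeholders from the ImageFolder example above:

In [ ]:
from PIL import Image

class ImageListDataset(Data.Dataset):
    def __init__(self, samples, transform=None):
        # samples: list of (file_path, label) tuples
        self.samples = samples
        self.transform = transform
    def __getitem__(self, index):
        path, label = self.samples[index]
        image = Image.open(path).convert('RGB')    # 1. read one sample from disk
        if self.transform is not None:
            image = self.transform(image)          # 2. preprocess
        return image, label                        # 3. return a data pair
    def __len__(self):
        return len(self.samples)

# Hypothetical usage -- the file paths are placeholders:
# samples = [('./images/dogs/dog-650299.jpg', 0), ('./images/onepun/5320.jpg', 1)]
# ds = ImageListDataset(samples, transform=transforms.ToTensor())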

Pretrained Model


In [ ]:
#========================== Using pretrained model ==========================#
# Download and load pretrained resnet.
resnet = torchvision.models.resnet18(pretrained=True)

# To fine-tune only the top layer of the model, freeze all parameters first.
for param in resnet.parameters():
    param.requires_grad = False
    
# Replace top layer for finetuning.
resnet.fc = torch.nn.Linear(resnet.fc.in_features, 100)  # 100 is for example.

# For testing (Variable is the old autograd wrapper; plain tensors work in PyTorch >= 0.4).
from torch.autograd import Variable
images = Variable(torch.randn(10, 3, 256, 256))
outputs = resnet(images)
print(outputs.size())    # torch.Size([10, 100])
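
Since only the replaced fc layer has requires_grad=True, the optimizer should receive just those parameters; a minimal sketch (the learning rate and momentum are arbitrary example values):

In [ ]:
optimizer = torch.optim.SGD(resnet.fc.parameters(), lr=0.001, momentum=0.9)
# Only resnet.fc's weight and bias get updated; the frozen backbone
# acts as a fixed feature extractor.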