1. 数据预处理

主要是将图片数据解压,然后复制到ImageDataGenerator需要的文件目录中。

导入包


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import os
import zipfile
import pickle
from PIL import Image
from shutil import copy2

解压zip文件


In [2]:
def Unzip(data_path, zip_name):
    """Extract the archive ``zip_name`` into ``data_path`` unless already done.

    The expected extraction result is ``zip_name`` without its last four
    characters (the ``.zip`` suffix), resolved against ``data_path``. If a
    file or a directory already exists at that location, the archive is not
    opened and nothing is extracted.

    Args:
        data_path: Directory that receives the extracted members.
        zip_name: Archive name; it is joined onto ``data_path`` to locate
            the archive on disk.
    """
    archive_path = os.path.join(data_path, zip_name)
    # Drop the trailing four characters, e.g. 'train.zip' -> 'train'.
    target_path = os.path.join(data_path, zip_name[:-4])

    already_extracted = os.path.isdir(target_path) or os.path.isfile(target_path)
    if already_extracted:
        return

    with zipfile.ZipFile(archive_path) as archive:
        # Extract every member of the archive below data_path.
        archive.extractall(data_path)

In [3]:
# The dataset archives live in the 'input' folder below the working directory.
cwd = os.getcwd()
data_path = os.path.join(cwd, 'input')

# Unzip() joins data_path and zip_name itself, so hand it the bare archive
# names. Passing pre-joined absolute paths only worked because os.path.join
# discards data_path when its second argument is absolute.
for zip_name in ('labels.csv.zip',
                 'sample_submission.csv.zip',
                 'test.zip',
                 'train.zip'):
    Unzip(data_path, zip_name)

预览labels.csv


In [4]:
# Read labels.csv from the input folder into a DataFrame.
labels_path = os.path.join(data_path, 'labels.csv')
labels = pd.read_csv(labels_path)

# Report the table size, then preview the first rows.
shape_message = 'labels.shape is {0}.'.format(labels.shape)
print(shape_message)
display(labels.head())


labels.shape is (10222, 2).
id breed
0 000bec180eb18c7604dcecc8fe0dba07 boston_bull
1 001513dfcb2ffafc82cccf4d8bbaba97 dingo
2 001cdf01b096e06d78e9e5112d419397 pekinese
3 00214f311d5d2247d5dfe4fe24b2303d bluetick
4 0021f9ceb3235effd7fcde7f7538ed62 golden_retriever

获取狗的品种和品种数量


In [5]:
# Distinct values of the second column of `labels`, as a sorted list.
label_classes = sorted(labels.iloc[:, 1].unique())

# Show the number of breeds, then the full list so it can be checked by eye.
breed_count = len(label_classes)
display('The breeds of dogs is {0}'.format(breed_count))
display(label_classes)


'The breeds of dogs is 120'
['affenpinscher',
 'afghan_hound',
 'african_hunting_dog',
 'airedale',
 'american_staffordshire_terrier',
 'appenzeller',
 'australian_terrier',
 'basenji',
 'basset',
 'beagle',
 'bedlington_terrier',
 'bernese_mountain_dog',
 'black-and-tan_coonhound',
 'blenheim_spaniel',
 'bloodhound',
 'bluetick',
 'border_collie',
 'border_terrier',
 'borzoi',
 'boston_bull',
 'bouvier_des_flandres',
 'boxer',
 'brabancon_griffon',
 'briard',
 'brittany_spaniel',
 'bull_mastiff',
 'cairn',
 'cardigan',
 'chesapeake_bay_retriever',
 'chihuahua',
 'chow',
 'clumber',
 'cocker_spaniel',
 'collie',
 'curly-coated_retriever',
 'dandie_dinmont',
 'dhole',
 'dingo',
 'doberman',
 'english_foxhound',
 'english_setter',
 'english_springer',
 'entlebucher',
 'eskimo_dog',
 'flat-coated_retriever',
 'french_bulldog',
 'german_shepherd',
 'german_short-haired_pointer',
 'giant_schnauzer',
 'golden_retriever',
 'gordon_setter',
 'great_dane',
 'great_pyrenees',
 'greater_swiss_mountain_dog',
 'groenendael',
 'ibizan_hound',
 'irish_setter',
 'irish_terrier',
 'irish_water_spaniel',
 'irish_wolfhound',
 'italian_greyhound',
 'japanese_spaniel',
 'keeshond',
 'kelpie',
 'kerry_blue_terrier',
 'komondor',
 'kuvasz',
 'labrador_retriever',
 'lakeland_terrier',
 'leonberg',
 'lhasa',
 'malamute',
 'malinois',
 'maltese_dog',
 'mexican_hairless',
 'miniature_pinscher',
 'miniature_poodle',
 'miniature_schnauzer',
 'newfoundland',
 'norfolk_terrier',
 'norwegian_elkhound',
 'norwich_terrier',
 'old_english_sheepdog',
 'otterhound',
 'papillon',
 'pekinese',
 'pembroke',
 'pomeranian',
 'pug',
 'redbone',
 'rhodesian_ridgeback',
 'rottweiler',
 'saint_bernard',
 'saluki',
 'samoyed',
 'schipperke',
 'scotch_terrier',
 'scottish_deerhound',
 'sealyham_terrier',
 'shetland_sheepdog',
 'shih-tzu',
 'siberian_husky',
 'silky_terrier',
 'soft-coated_wheaten_terrier',
 'staffordshire_bullterrier',
 'standard_poodle',
 'standard_schnauzer',
 'sussex_spaniel',
 'tibetan_mastiff',
 'tibetan_terrier',
 'toy_poodle',
 'toy_terrier',
 'vizsla',
 'walker_hound',
 'weimaraner',
 'welsh_springer_spaniel',
 'west_highland_white_terrier',
 'whippet',
 'wire-haired_fox_terrier',
 'yorkshire_terrier']

创建data_train文件夹,然后,在data_train文件夹下面给每个品种创建一个子文件夹


In [6]:
## Create data_train folder
data_train_path = os.path.join(data_path, 'data_train')
data_train_existed = os.path.isdir(data_train_path)
if data_train_existed:
    print('{0} is existed!'.format(data_train_path))
else:
    os.mkdir(data_train_path)
    print('{0} created!'.format(data_train_path))

## Create subfolders of data_train folder
# This runs even when data_train already exists, so a folder tree left
# incomplete by an interrupted earlier run is repaired instead of skipped.
for label in label_classes:
    class_dir = os.path.join(data_train_path, label)
    os.makedirs(class_dir, exist_ok=True)

# List the breed folders only when the tree was freshly created.
if not data_train_existed:
    print(os.listdir(data_train_path))


D:\Udacity\MachineLearning(Advanced)\p6_graduation_project\input\data_train created!
['affenpinscher', 'afghan_hound', 'african_hunting_dog', 'airedale', 'american_staffordshire_terrier', 'appenzeller', 'australian_terrier', 'basenji', 'basset', 'beagle', 'bedlington_terrier', 'bernese_mountain_dog', 'black-and-tan_coonhound', 'blenheim_spaniel', 'bloodhound', 'bluetick', 'border_collie', 'border_terrier', 'borzoi', 'boston_bull', 'bouvier_des_flandres', 'boxer', 'brabancon_griffon', 'briard', 'brittany_spaniel', 'bull_mastiff', 'cairn', 'cardigan', 'chesapeake_bay_retriever', 'chihuahua', 'chow', 'clumber', 'cocker_spaniel', 'collie', 'curly-coated_retriever', 'dandie_dinmont', 'dhole', 'dingo', 'doberman', 'english_foxhound', 'english_setter', 'english_springer', 'entlebucher', 'eskimo_dog', 'flat-coated_retriever', 'french_bulldog', 'german_shepherd', 'german_short-haired_pointer', 'giant_schnauzer', 'golden_retriever', 'gordon_setter', 'greater_swiss_mountain_dog', 'great_dane', 'great_pyrenees', 'groenendael', 'ibizan_hound', 'irish_setter', 'irish_terrier', 'irish_water_spaniel', 'irish_wolfhound', 'italian_greyhound', 'japanese_spaniel', 'keeshond', 'kelpie', 'kerry_blue_terrier', 'komondor', 'kuvasz', 'labrador_retriever', 'lakeland_terrier', 'leonberg', 'lhasa', 'malamute', 'malinois', 'maltese_dog', 'mexican_hairless', 'miniature_pinscher', 'miniature_poodle', 'miniature_schnauzer', 'newfoundland', 'norfolk_terrier', 'norwegian_elkhound', 'norwich_terrier', 'old_english_sheepdog', 'otterhound', 'papillon', 'pekinese', 'pembroke', 'pomeranian', 'pug', 'redbone', 'rhodesian_ridgeback', 'rottweiler', 'saint_bernard', 'saluki', 'samoyed', 'schipperke', 'scotch_terrier', 'scottish_deerhound', 'sealyham_terrier', 'shetland_sheepdog', 'shih-tzu', 'siberian_husky', 'silky_terrier', 'soft-coated_wheaten_terrier', 'staffordshire_bullterrier', 'standard_poodle', 'standard_schnauzer', 'sussex_spaniel', 'tibetan_mastiff', 'tibetan_terrier', 'toy_poodle', 'toy_terrier', 'vizsla', 'walker_hound', 'weimaraner', 'welsh_springer_spaniel', 
'west_highland_white_terrier', 'whippet', 'wire-haired_fox_terrier', 'yorkshire_terrier']

创建data_val文件夹,然后,在data_val文件夹下面给每个品种创建一个子文件夹


In [7]:
## Create data_val folder
data_val_path = os.path.join(data_path, 'data_val')
data_val_existed = os.path.isdir(data_val_path)
if data_val_existed:
    print('{0} is existed!'.format(data_val_path))
else:
    os.mkdir(data_val_path)
    print('{0} created!'.format(data_val_path))

## Create subfolders of data_val folder
# This runs even when data_val already exists, so a folder tree left
# incomplete by an interrupted earlier run is repaired instead of skipped.
for label in label_classes:
    class_dir = os.path.join(data_val_path, label)
    os.makedirs(class_dir, exist_ok=True)

# List the breed folders only when the tree was freshly created.
if not data_val_existed:
    print(os.listdir(data_val_path))


D:\Udacity\MachineLearning(Advanced)\p6_graduation_project\input\data_val created!
['affenpinscher', 'afghan_hound', 'african_hunting_dog', 'airedale', 'american_staffordshire_terrier', 'appenzeller', 'australian_terrier', 'basenji', 'basset', 'beagle', 'bedlington_terrier', 'bernese_mountain_dog', 'black-and-tan_coonhound', 'blenheim_spaniel', 'bloodhound', 'bluetick', 'border_collie', 'border_terrier', 'borzoi', 'boston_bull', 'bouvier_des_flandres', 'boxer', 'brabancon_griffon', 'briard', 'brittany_spaniel', 'bull_mastiff', 'cairn', 'cardigan', 'chesapeake_bay_retriever', 'chihuahua', 'chow', 'clumber', 'cocker_spaniel', 'collie', 'curly-coated_retriever', 'dandie_dinmont', 'dhole', 'dingo', 'doberman', 'english_foxhound', 'english_setter', 'english_springer', 'entlebucher', 'eskimo_dog', 'flat-coated_retriever', 'french_bulldog', 'german_shepherd', 'german_short-haired_pointer', 'giant_schnauzer', 'golden_retriever', 'gordon_setter', 'greater_swiss_mountain_dog', 'great_dane', 'great_pyrenees', 'groenendael', 'ibizan_hound', 'irish_setter', 'irish_terrier', 'irish_water_spaniel', 'irish_wolfhound', 'italian_greyhound', 'japanese_spaniel', 'keeshond', 'kelpie', 'kerry_blue_terrier', 'komondor', 'kuvasz', 'labrador_retriever', 'lakeland_terrier', 'leonberg', 'lhasa', 'malamute', 'malinois', 'maltese_dog', 'mexican_hairless', 'miniature_pinscher', 'miniature_poodle', 'miniature_schnauzer', 'newfoundland', 'norfolk_terrier', 'norwegian_elkhound', 'norwich_terrier', 'old_english_sheepdog', 'otterhound', 'papillon', 'pekinese', 'pembroke', 'pomeranian', 'pug', 'redbone', 'rhodesian_ridgeback', 'rottweiler', 'saint_bernard', 'saluki', 'samoyed', 'schipperke', 'scotch_terrier', 'scottish_deerhound', 'sealyham_terrier', 'shetland_sheepdog', 'shih-tzu', 'siberian_husky', 'silky_terrier', 'soft-coated_wheaten_terrier', 'staffordshire_bullterrier', 'standard_poodle', 'standard_schnauzer', 'sussex_spaniel', 'tibetan_mastiff', 'tibetan_terrier', 'toy_poodle', 'toy_terrier', 'vizsla', 'walker_hound', 'weimaraner', 'welsh_springer_spaniel', 
'west_highland_white_terrier', 'whippet', 'wire-haired_fox_terrier', 'yorkshire_terrier']

创建data_test文件夹,然后,在data_test文件夹下面创建一个名为test的子文件夹


In [8]:
## Create folder for data_test folder
data_test_path = os.path.join(data_path, 'data_test')
data_test_sub_path = os.path.join(data_test_path, 'test')
if os.path.isdir(data_test_path):
    print('{0} is existed!'.format(data_test_path))
else:
    os.mkdir(data_test_path)
    print('{0} created!'.format(data_test_path))

## Create subfolder for data_test folder
# Checked independently of the parent folder: data_test may already exist
# without its 'test' subfolder, and the later copy step needs that subfolder.
if not os.path.isdir(data_test_sub_path):
    os.mkdir(data_test_sub_path)
    print('{0} created!'.format(data_test_sub_path))
else:
    print('{0} is existed!'.format(data_test_sub_path))


D:\Udacity\MachineLearning(Advanced)\p6_graduation_project\input\data_test created!
D:\Udacity\MachineLearning(Advanced)\p6_graduation_project\input\data_test\test created!

分割train_labels和val_labels


In [9]:
# Row counts for a 95% / 5% train / validation split of `labels`.
label_amount = len(labels)
train_amount = int(label_amount*0.95)
val_amount = label_amount - train_amount
print(label_amount, train_amount, val_amount)

# NOTE(review): the split is purely positional (first rows -> train, remaining
# rows -> validation); the rows are neither shuffled nor stratified here.
label_train = labels.iloc[:train_amount]
label_val = labels.iloc[train_amount:]
for subset in (label_train, label_val):
    print(subset.shape)


10222 9710 512
(9710, 2)
(512, 2)

从train文件夹复制图片到data_train下对应品种的子文件夹内


In [10]:
# Copy every training image into the data_train subfolder of its breed.
# Each file is checked individually, so re-running the cell copies only what
# is still missing (e.g. after an interrupted run) instead of trusting the
# content of a single breed folder.
train_image_dir = os.path.join(data_path, 'train')
target_dir = os.path.join(data_path, 'data_train')

# (source file, destination folder) pairs whose copy does not exist yet.
# Columns are read by position (0 = image id, 1 = breed) through .iloc, which
# avoids the deprecated integer-key lookup on a label-indexed row Series.
pending_copies = []
for image_id, breed in zip(label_train.iloc[:, 0], label_train.iloc[:, 1]):
    image_name = '{0}.jpg'.format(image_id)
    breed_dir = os.path.join(target_dir, breed)
    if not os.path.isfile(os.path.join(breed_dir, image_name)):
        pending_copies.append((os.path.join(train_image_dir, image_name), breed_dir))

if not pending_copies:
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to copy images into data_train.')
    for image_path, breed_dir in pending_copies:
        copy2(image_path, breed_dir)


start to copy images into data_train.

从train文件夹复制图片到data_val下对应品种的子文件夹内


In [11]:
# Copy every validation image into the data_val subfolder of its breed.
# Each file is checked individually: with few validation rows a breed folder
# can legitimately stay empty, so the emptiness of one folder says nothing
# about whether the copy already happened.
val_image_dir = os.path.join(data_path, 'train')
target_dir = os.path.join(data_path, 'data_val')

# (source file, destination folder) pairs whose copy does not exist yet.
# Columns are read by position (0 = image id, 1 = breed) through .iloc, which
# avoids the deprecated integer-key lookup on a label-indexed row Series.
pending_copies = []
for image_id, breed in zip(label_val.iloc[:, 0], label_val.iloc[:, 1]):
    image_name = '{0}.jpg'.format(image_id)
    breed_dir = os.path.join(target_dir, breed)
    if not os.path.isfile(os.path.join(breed_dir, image_name)):
        pending_copies.append((os.path.join(val_image_dir, image_name), breed_dir))

if not pending_copies:
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to copy images into data_val.')
    for image_path, breed_dir in pending_copies:
        copy2(image_path, breed_dir)


start to copy images into data_val.

从test文件夹复制图片到data_test下的test子文件夹内


In [12]:
# Copy every file of the extracted 'test' folder into data_test/test.
# Files are checked one by one, so a re-run only copies what is still missing.
test_image_dir = os.path.join(data_path, 'test')
target_dir = os.path.join(data_path, 'data_test', 'test')

test_image_pathes = os.listdir(test_image_dir)
# Names from the source folder that have no copy in the target folder yet.
pending_names = [
    name for name in test_image_pathes
    if not os.path.isfile(os.path.join(target_dir, name))
]

if not pending_names:
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to move images into data_test.')
    for name in pending_names:
        # target_dir is computed in this cell, so the copy does not depend on
        # a variable defined by the folder-creation cell.
        copy2(os.path.join(test_image_dir, name), target_dir)


start to move images into data_test.

In [13]:
# Print a completion marker at the end of the preprocessing notebook.
print('Done!')


Done!

In [ ]: