In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import os
import zipfile
import pickle
from PIL import Image
from shutil import copy2
In [2]:
def Unzip(data_path, zip_name):
    """Extract a zip archive into ``data_path`` unless it was extracted before.

    The name of the archive minus its last four characters (the ``.zip``
    suffix) is used as a marker: when a file or folder with that name already
    exists under ``data_path``, the archive is not opened at all.

    Args:
        data_path: Folder that contains the archive and receives its members.
        zip_name: Archive file name, expected to end in ``.zip``. It is joined
            onto ``data_path`` to locate the archive.

    Raises:
        FileNotFoundError: Propagated from ``zipfile.ZipFile`` when the
            archive is missing and no marker file/folder exists.
    """
    # Dropping four characters only yields the right marker for "*.zip" names.
    extract_name = zip_name[:-4]
    extract_path = os.path.join(data_path, extract_name)
    if os.path.isdir(extract_path) or os.path.isfile(extract_path):
        # Already extracted on an earlier run; nothing to do.
        return

    zip_path = os.path.join(data_path, zip_name)
    with zipfile.ZipFile(zip_path) as archive:
        # extractall unpacks every member in one call, replacing the manual
        # loop over namelist().
        archive.extractall(data_path)
In [3]:
# The dataset archives live in an "input" folder next to the notebook's
# working directory.
cwd = os.getcwd()
data_path = os.path.join(cwd, 'input')

# Unzip() joins data_path and the archive name itself, so only the bare file
# name is passed. Passing an already-joined absolute path would only work
# because os.path.join drops earlier components when a later one is absolute.
for zip_name in ('labels.csv.zip', 'sample_submission.csv.zip', 'test.zip', 'train.zip'):
    Unzip(data_path, zip_name)
In [4]:
# Read the extracted labels table from the input folder.
labels_path = os.path.join(data_path, 'labels.csv')
labels = pd.read_csv(labels_path)

# Report the table dimensions, then preview the first rows.
labels_shape = labels.shape
print('labels.shape is {0}.'.format(labels_shape))
display(labels.head())
In [5]:
# Distinct values of the second labels column (treated as the dog breed),
# sorted so the class order is stable between runs.
label_classes = sorted(labels.iloc[:, 1].unique())

class_count = len(label_classes)
display('The breeds of dogs is {0}'.format(class_count))
# The complete list is shown on purpose so the breed names can be checked by eye.
display(label_classes)
In [6]:
# Root folder for the training images, created on first run only.
data_train_path = os.path.join(data_path, 'data_train')
if not os.path.isdir(data_train_path):
    os.mkdir(data_train_path)
    print('{0} created!'.format(data_train_path))
else:
    print('{0} is existed!'.format(data_train_path))

# One subfolder per class; folders that are already present are left alone.
for breed_name in label_classes:
    breed_dir = os.path.join(data_train_path, breed_name)
    if os.path.isdir(breed_dir):
        continue
    os.mkdir(breed_dir)

# Show the resulting folder names as a quick check.
print(os.listdir(data_train_path))
In [7]:
# Root folder for the validation images, created on first run only.
data_val_path = os.path.join(data_path, 'data_val')
if not os.path.isdir(data_val_path):
    os.mkdir(data_val_path)
    print('{0} created!'.format(data_val_path))
else:
    print('{0} is existed!'.format(data_val_path))

# One subfolder per class; folders that are already present are left alone.
for breed_name in label_classes:
    breed_dir = os.path.join(data_val_path, breed_name)
    if os.path.isdir(breed_dir):
        continue
    os.mkdir(breed_dir)

# Show the resulting folder names as a quick check.
print(os.listdir(data_val_path))
In [8]:
# The test images go into a single "test" subfolder of data_test.
data_test_path = os.path.join(data_path, 'data_test')
data_test_sub_path = os.path.join(data_test_path, 'test')

# Parent first, then the subfolder: each one is created if it is missing and
# reported either way.
for folder in (data_test_path, data_test_sub_path):
    if os.path.isdir(folder):
        print('{0} is existed!'.format(folder))
    else:
        os.mkdir(folder)
        print('{0} created!'.format(folder))
In [9]:
# Positional split of the labels table: the first 95% of the rows become the
# training set and the remaining rows the validation set (rows are not shuffled).
label_amount = len(labels)
train_amount = int(0.95 * label_amount)
val_amount = label_amount - train_amount
print(label_amount, train_amount, val_amount)

label_train = labels.iloc[:train_amount]
label_val = labels.iloc[train_amount:]
for subset in (label_train, label_val):
    print(subset.shape)
In [10]:
# Copy every training image listed in label_train into data_train/<class>/.
# The whole step is skipped when the first class folder already has content;
# only that one folder is inspected, so an interrupted earlier run is not detected.
target_dir = os.path.join(data_path, 'data_train', label_classes[0])
if os.listdir(target_dir):
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to copy images into data_train.')
    # Column 0 supplies the image file stem and column 1 the class folder.
    # Both are read by position through .iloc: integer keys on a row Series
    # with string labels (row[0], row[1]) are a deprecated positional lookup
    # in pandas 2.x.
    for image_id, class_name in zip(label_train.iloc[:, 0], label_train.iloc[:, 1]):
        image_path = os.path.join(data_path, 'train', '{0}.jpg'.format(image_id))
        target_dir = os.path.join(data_path, 'data_train', class_name)
        # The source file stays in place: images are copied, not moved.
        copy2(image_path, target_dir)
In [11]:
# Copy every validation image listed in label_val into data_val/<class>/.
# The whole step is skipped when the first class folder already has content.
# NOTE(review): only that one folder is inspected; if no validation row
# belongs to the first class it stays empty and the copy is repeated on every
# run -- confirm whether that matters.
target_dir = os.path.join(data_path, 'data_val', label_classes[0])
if os.listdir(target_dir):
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to copy images into data_val.')
    # Column 0 supplies the image file stem and column 1 the class folder.
    # Both are read by position through .iloc: integer keys on a row Series
    # with string labels (row[0], row[1]) are a deprecated positional lookup
    # in pandas 2.x.
    for image_id, class_name in zip(label_val.iloc[:, 0], label_val.iloc[:, 1]):
        # Validation images are taken from the same extracted 'train' folder.
        image_path = os.path.join(data_path, 'train', '{0}.jpg'.format(image_id))
        target_dir = os.path.join(data_path, 'data_val', class_name)
        # The source file stays in place: images are copied, not moved.
        copy2(image_path, target_dir)
In [12]:
# Fill data_test/test with the extracted test images, but only while that
# folder is still empty, so the copy is not repeated on later runs.
target_dir = os.path.join(data_path, 'data_test', 'test')
if not os.listdir(target_dir):
    print('start to move images into data_test.')
    # Every entry of the extracted 'test' folder is copied (the originals stay).
    test_source_dir = os.path.join(data_path, 'test')
    test_image_pathes = os.listdir(test_source_dir)
    for file_name in test_image_pathes:
        source_file = os.path.join(test_source_dir, file_name)
        copy2(source_file, data_test_sub_path)
else:
    print(target_dir + ' is not empty, do not need move images again.')
In [13]:
# Marks the end of the data-preparation steps.
print('Done!')
In [ ]: