In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator, which np.random.shuffle draws from
# later in the notebook when the car ids are shuffled.
np.random.seed(1)
In [2]:
# Data of interest is in the train/ folder (relative to the working directory)
import os
from glob import glob
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the listing (and the example printed from index 0) is the same on every run.
train_files = sorted(glob(os.path.join('train/', "*.jpg")))
# Independent copy of the listing, kept under the name the later cells use.
train_ids_all = list(train_files)
In [3]:
# Number of training image paths found by the glob above.
len(train_ids_all)
Out[3]:
In [4]:
# Show one example path and the car id parsed from it.
print(train_ids_all[0])
# The id is taken as the part of the file name before the first underscore.
# Stripping the directory with os.path.basename (instead of splitting on '/')
# keeps the parse independent of the path separator and of how deep the
# directory prefix is.
car_id = os.path.basename(train_ids_all[0]).split('_')[0]
print(car_id)
In [5]:
# Unique car ids over all training images: the part of each file name before
# the first underscore.  os.path.basename drops the directory prefix without
# assuming '/' is the path separator.
car_ids = {os.path.basename(t).split('_')[0] for t in train_ids_all}
In [6]:
# Number of distinct car ids among the training images.
len(car_ids)
Out[6]:
In [7]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the listing (and the example printed from index 0) is the same on every run.
train_masks = sorted(glob(os.path.join('train_masks/', "*.gif")))
# Independent copy of the listing, kept under the name the later cells use.
train_masks_all = list(train_masks)
In [8]:
# Show one example mask path and the car id parsed from it.
print(train_masks_all[0])
# Strip the directory first, then take the text before the first underscore.
# The directory name 'train_masks' itself contains an underscore, so the
# directory must be removed before splitting on '_'; os.path.basename does
# that without assuming '/' is the path separator.
mask_id = os.path.basename(train_masks_all[0]).split('_')[0]
print(mask_id)
In [9]:
# Unique car ids over all mask files: the part of each file name before the
# first underscore.  os.path.basename drops the 'train_masks' directory prefix
# (which itself contains an underscore) without assuming '/' as the separator.
mask_ids = {os.path.basename(m).split('_')[0] for m in train_masks_all}
In [10]:
# Number of distinct car ids among the mask files.
len(mask_ids)
Out[10]:
In [11]:
# Symmetric difference: ids that appear in only one of the two folders.  An
# empty set means train/ and train_masks/ cover exactly the same cars.  The
# one-sided `car_ids - mask_ids` would not reveal masks that have no matching
# training image.
car_ids ^ mask_ids
Out[11]:
The same cars are in the train/ and train_masks/ folders. Good.
In [12]:
# Iteration order of a set of strings can change from one interpreter session
# to the next (string hashes are randomized per process), so seeding NumPy
# alone does not make the later shuffle reproducible.  Sorting gives the
# shuffle a fixed starting order.
car_array = np.array(sorted(car_ids))
In [13]:
# Preview the first few ids (before shuffling) instead of dumping the whole array.
car_array[:10]
Out[13]:
In [14]:
# Shuffle the car ids in place so that contiguous slices of the array can be
# used as the train, test, and holdout id sets.
# NOTE: this mutates car_array and advances NumPy's global random state, so
# re-running this cell on its own (without re-running the seeding and
# array-building cells first) produces a different order.
np.random.shuffle(car_array)
In [15]:
# Preview the first few ids (after shuffling) instead of dumping the whole array.
car_array[:10]
Out[15]:
In [16]:
# Target share of cars assigned to each split.
train_fract = 0.8
test_fract = 0.1
holdout_fract = 0.1
# Sanity check: the three shares must describe the whole data set.
assert abs(train_fract + test_fract + holdout_fract - 1.0) < 1e-9, \
    "split fractions must sum to 1"

num_cars = car_array.shape[0]
num_train = int(round(num_cars * train_fract))
num_test = int(round(num_cars * test_fract))
# The holdout set gets whatever is left after train and test.  Rounding all
# three shares independently can make the counts add up to more or less than
# num_cars, and the holdout slice taken later is "everything after train and
# test" anyway, so computing the remainder keeps the reported numbers exact.
num_holdout = num_cars - num_train - num_test
print("{0} train, {1} test, and {2} holdout cars.".format(num_train, num_test, num_holdout))
print("That's a total of {0} cars.".format(num_train + num_test + num_holdout))
In [17]:
# Cut the shuffled id array into three consecutive pieces:
# [0, num_train) -> train, [num_train, test_end) -> test, the rest -> holdout.
test_end = num_train + num_test
ids_train, ids_test, ids_holdout = (
    car_array[:num_train],
    car_array[num_train:test_end],
    car_array[test_end:],
)
# Report how many ids ended up in each piece.
out = "There are {0} in train, {1} in test, and {2} in the holdout"
split_sizes = [len(ids) for ids in (ids_train, ids_test, ids_holdout)]
print(out.format(*split_sizes))
In [18]:
# Write each split's car ids to its own text file in the working directory,
# one id per line.
for split_file, split_ids in (
    ('ids_train.txt', ids_train),
    ('ids_test.txt', ids_test),
    ('ids_holdout.txt', ids_holdout),
):
    np.savetxt(split_file, split_ids, fmt='%s', delimiter='\n')
In [ ]: