In [1]:
%matplotlib inline
import os, sys
from glob import glob
from shutil import copyfile
import numpy as np
import pandas as pd
#from PIL import Image
from IPython.display import Image, HTML, display
#sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
In [2]:
# Resolve the working locations once so later cells can build paths from them.
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = os.path.join(current_dir, 'data')

# Each sub-directory of data/train is one class label. Plain files that sit
# next to the class folders are skipped so they cannot become bogus categories
# (and bogus valid/ and sample/ folders) in the cells below.
categories = sorted(
    os.path.basename(path)
    for path in glob(os.path.join(DATA_HOME_DIR, 'train', '*'))
    if os.path.isdir(path)
)
print("categories: %s"%(categories))
In [3]:
#Create directories
# Switch to the data root first: the mkdir cells that follow use relative paths.
%cd $DATA_HOME_DIR
In [ ]:
# did this once
# Lay out the folders for the full and the sample datasets. Absolute paths and
# an existence check make the cell independent of the current directory and
# safe to re-run (a plain `mkdir valid` complains when the folder is present).
for rel_dir in ('valid', 'results',
                'sample/train', 'sample/test', 'sample/valid', 'sample/results',
                'test/unknown'):
    target_dir = os.path.join(DATA_HOME_DIR, rel_dir)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
In [ ]:
# Create subdirectories
# One folder per class label under every split that is filled in later.
# Built from DATA_HOME_DIR so the cell works whatever the current directory is.
for c in categories:
    for split in ('valid', 'sample/train', 'sample/valid'):
        class_dir = os.path.join(DATA_HOME_DIR, split, c)
        if not os.path.isdir(class_dir):
            os.makedirs(class_dir)
In [23]:
# The following cells glob class folders relative to the train directory.
%cd $DATA_HOME_DIR/train
In [24]:
# how many images we talking about?
# Paths are relative, so the current directory must be the split being counted.
# %-formatting keeps the printed line the same on Python 2 and Python 3.
for c in categories:
    g = glob(c+"/*.jpg")
    print("%s %d" % (c, len(g)))
original output:
c0 2489
c1 2267
c2 2317
c3 2346
c4 2326
c5 2312
c6 2325
c7 2002
c8 1911
c9 2129
In [10]:
# going to take 20% or ~460 in each category for validation
# Fraction of every category's images that the random split below holds out.
validation_ratio = 0.2
In [12]:
# Random per-category hold-out: move `validation_ratio` of each class from the
# train folder (which must be the current directory) into valid/. Globbed paths
# keep their class prefix, so joining them onto the valid root preserves the
# class sub-folder.
valid_root = os.path.join(DATA_HOME_DIR, 'valid')
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    num_valid = int(validation_ratio*len(g))
    print(num_valid)
    for rel_path in shuf[:num_valid]:
        os.rename(rel_path, os.path.join(valid_root, rel_path))
In [129]:
# now create the sample train subset of 20 per category
# Run from the train folder: globbed paths keep their class prefix, which is
# reused to place each copy under sample/train/<class>/.
n_per_class = 20
sample_train_root = os.path.join(DATA_HOME_DIR, 'sample', 'train')
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    # Slice instead of indexing 0..19: a class with fewer images than requested
    # is copied whole rather than raising IndexError part-way through.
    for rel_path in shuf[:n_per_class]:
        copyfile(rel_path, os.path.join(sample_train_root, rel_path))
In [130]:
# The sample-valid cell below globs class folders relative to valid/.
%cd $DATA_HOME_DIR/valid
In [131]:
# now create the sample valid subset of 5 per category
# Run from the valid folder: globbed paths keep their class prefix, which is
# reused to place each copy under sample/valid/<class>/.
n_per_class = 5
sample_valid_root = os.path.join(DATA_HOME_DIR, 'sample', 'valid')
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    # Slice instead of indexing 0..4: a class with fewer images than requested
    # is copied whole rather than raising IndexError part-way through.
    for rel_path in shuf[:n_per_class]:
        copyfile(rel_path, os.path.join(sample_valid_root, rel_path))
In [132]:
!ls {DATA_HOME_DIR}/train/* |wc -l
!ls {DATA_HOME_DIR}/valid/* |wc -l
!ls {DATA_HOME_DIR}/test/* |wc -l
!ls {DATA_HOME_DIR}/sample/train/* |wc -l
!ls {DATA_HOME_DIR}/sample/valid/* |wc -l
In [17]:
# Create single 'unknown' class for test set
# Moves every .jpg that sits directly in test/ into test/unknown (the folder
# itself is made by the directory-setup cell).
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown
In [5]:
# and sample test, too
# Gives the sample test set the same single 'unknown' class folder.
# NOTE(review): plain mkdir (no -p) reports an error if the folder already exists.
%cd $DATA_HOME_DIR/sample/test
%mkdir unknown
In [133]:
# sample the test set
%cd $DATA_HOME_DIR/test/unknown
g = glob("*.jpg")
shuf = np.random.permutation(g)
for i in range(100):
#print shuf[i], DATA_HOME_DIR+'/sample/test/unknown/'+shuf[i]
copyfile(shuf[i], DATA_HOME_DIR+'/sample/test/unknown/'+shuf[i])
In [3]:
# So, move validation data back.
%cd $DATA_HOME_DIR
for c in categories:
g = glob("valid/"+c+"/*.jpg")
for i in range(len(g)):
#print g[i], g[i].replace('valid','train')
os.rename(g[i], g[i].replace('valid','train'))
In [4]:
# Look at the sample tree from the data root before clearing it out below;
# the entries seen at the time are noted after the command.
%cd $DATA_HOME_DIR
!ls sample/
#results test train valid
In [5]:
# Empty sample/results. The path is relative, so this relies on the current
# directory being the data root.
!rm -rf sample/results/*
In [6]:
# Delete the previously sampled test files (relative to the current directory).
!rm sample/test/unknown/*
In [7]:
# Delete the previously sampled train/valid images; the c* class folders
# themselves are left in place.
!rm sample/train/c*/*jpg
!rm sample/valid/c*/*jpg
In [8]:
# Load the image listing from the data root. Later cells read its `subject`,
# `classname` and `img` columns.
%cd $DATA_HOME_DIR
driver_imgs = pd.read_csv('driver_imgs_list.csv')
In [9]:
# Peek at the first rows of the listing.
driver_imgs.head()
Out[9]:
In [10]:
# Distinct subject ids, in order of first appearance in the listing.
subjects = driver_imgs['subject'].unique()
In [23]:
# Number of distinct subjects, and 20% of that count truncated to an integer.
len(subjects), int(0.2*len(subjects))
Out[23]:
In [24]:
# Show the unique subject ids.
subjects
Out[24]:
In [11]:
# let's look at some subject examples
#images = []
def img_div(img, txt, idx, width=220, height=200, per_row=4):
    """Return the HTML for one captioned thumbnail in a left-to-right grid.

    Args:
        img: value placed in the <img> src attribute.
        txt: caption emitted immediately after the image.
        idx: zero-based position of this thumbnail in the sequence.
        width: rendered image width in pixels.
        height: rendered image height in pixels.
        per_row: number of thumbnails per row.

    Returns:
        A "<div ...>...</div>" string. `img` and `txt` are inserted verbatim
        (no HTML escaping).
    """
    # Every thumbnail except the last one of a row floats left; the un-floated
    # one ends the row. (The flag was previously stored in a local called
    # `float`, which shadowed the builtin.)
    ends_row = idx % per_row >= per_row - 1
    div_attrs = "" if ends_row else 'style="float: left;"'
    return "<div %s><img width=%dpx height=%dpx src='%s'/>%s</div>" % (
        div_attrs, width, height, img, txt)
def show_subjects(subj, base_url="/files/kaggle/statefarm/data/train/"):
    """Display one example thumbnail per subject id, captioned with the id.

    Args:
        subj: iterable of subject ids to look up in `driver_imgs.subject`.
        base_url: prefix put in front of "<classname>/<img>" to form each
            image's src; the default is the location this notebook was
            written against.

    Raises:
        IndexError: if a subject id has no row in `driver_imgs`.
    """
    cells = []
    for j, s in enumerate(subj):
        # Take the subject's first row by position within the filtered frame.
        # Looking up an index *label* and passing it to .iloc is only correct
        # while driver_imgs keeps its default 0..n-1 index.
        first = driver_imgs[driver_imgs.subject == s].iloc[0]
        cells.append(img_div(base_url + first.classname + "/" + first.img, s, j))
    display(HTML("".join(cells)))

show_subjects(subjects)
In [12]:
# Subject ids grouped by sex, so that the held-out subjects can include both
# male and female drivers.
males = [
    'p002', 'p012', 'p014', 'p015', 'p016', 'p021', 'p024',
    'p026', 'p035', 'p039', 'p047', 'p051', 'p056', 'p075',
]
females = [
    'p022', 'p041', 'p042', 'p045', 'p049', 'p050',
    'p052', 'p061', 'p064', 'p066', 'p072', 'p081',
]
In [13]:
# One example image for each subject in `males`.
show_subjects(males)
In [14]:
# One example image for each subject in `females`.
show_subjects(females)
In [15]:
# Group sizes next to the total number of subjects in the listing.
len(males), len(females), len(subjects)
Out[15]:
In [72]:
# 20% of each group, to decide how many subjects of each to hold out.
0.2*len(males), 0.2*len(females)
# okay 3 males & 2 females in our validation set
# choosing p045, p049 females
Out[72]:
In [78]:
# Ids that appear in both lists.
set(males) & set(females)
Out[78]:
In [83]:
# Draw three male subjects at random.
# NOTE(review): no seed is set in the visible cells, so a re-run gives a
# different draw; the recorded result is kept below.
np.random.permutation(males)[:3]
# gave ['p035', 'p056', 'p075']
Out[83]:
In [16]:
# First pick was five held-out subjects:
#     ['p021', 'p056', 'p075'] + ['p045', 'p049']
# BUT, Jeremy says he only used 3, so try three instead: two from the males
# list and one from the females list.
validation_subjects = ['p021', 'p056'] + ['p045']
In [17]:
# One example image for each held-out subject.
show_subjects(validation_subjects)
In [18]:
# Rows of the listing that belong to the held-out subjects.
is_validation_row = driver_imgs['subject'].isin(validation_subjects)
validation_df = driver_imgs[is_validation_row]
In [20]:
# move our validation images from train to valid
# Every image of a held-out subject goes from train/<class>/ to valid/<class>/.
train_root = os.path.join(DATA_HOME_DIR, 'train')
valid_root = os.path.join(DATA_HOME_DIR, 'valid')
for classname, img in zip(validation_df.classname, validation_df.img):
    fr = os.path.join(train_root, classname, img)
    to = os.path.join(valid_root, classname, img)
    # Skip files that an earlier run already moved, so the cell can be re-run.
    # A file missing from both places still raises from os.rename.
    if os.path.exists(to) and not os.path.exists(fr):
        continue
    os.rename(fr, to)
In [21]:
%cd $DATA_HOME_DIR/valid
# how many images we talking about?
for c in categories:
g = glob(c+"/*.jpg")
print c, len(g)
In [ ]: