In [1]:
%matplotlib inline
import os, sys
from glob import glob
from shutil import copyfile
import numpy as np
import pandas as pd
#from PIL import Image
from IPython.display import Image, HTML, display

#sys.path.insert(1, os.path.join(sys.path[0], '../utils'))

In [2]:
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data'
categories = sorted([os.path.basename(x) for x in glob(DATA_HOME_DIR+'/train/*')])
print("categories: %s"%(categories))


categories: ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']

Did not do all of this in order, so don't just blindly follow this.

Passed through this once & then had to come back & fix things up again.

Create validation set and sample


In [3]:
#Create directories
%cd $DATA_HOME_DIR


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data

In [ ]:
# did this once
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown

In [ ]:
# Create subdirectories
for c in categories:
    %mkdir -p valid/{c}
    %mkdir -p sample/train/{c}
    %mkdir -p sample/valid/{c}

In [23]:
%cd $DATA_HOME_DIR/train


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data/train

In [24]:
# how many images we talking about?
for c in categories:
    g = glob(c+"/*.jpg")
    print c, len(g)


c0 2198
c1 1981
c2 2034
c3 2065
c4 2037
c5 2025
c6 2048
c7 1763
c8 1663
c9 1855

original output:

c0 2489
c1 2267
c2 2317
c3 2346
c4 2326
c5 2312
c6 2325
c7 2002
c8 1911
c9 2129

In [10]:
# going to take 20% or ~460 in each category for validation
validation_ratio = 0.2

In [12]:
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    num_valid = int(validation_ratio*len(g))
    print num_valid
    for i in range(num_valid): 
        #print shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i]
        os.rename(shuf[i], DATA_HOME_DIR+'/valid/' + shuf[i])


497
453
463
469
465
462
465
400
382
425

In [129]:
# now create the sample train subset of 20 per category
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    for i in range(20): 
        #print shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i]
        copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])

In [130]:
%cd $DATA_HOME_DIR/valid


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data/valid

In [131]:
# now create the sample valid subset of 5 per category
for c in categories:
    g = glob(c+"/*.jpg")
    shuf = np.random.permutation(g)
    for i in range(5): 
        #print shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i]
        copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])

In [132]:
!ls {DATA_HOME_DIR}/train/* |wc -l
!ls {DATA_HOME_DIR}/valid/* |wc -l
!ls {DATA_HOME_DIR}/test/* |wc -l
!ls {DATA_HOME_DIR}/sample/train/* |wc -l
!ls {DATA_HOME_DIR}/sample/valid/* |wc -l


17863
4599
79726
219
69

In [17]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data/test

In [5]:
# and sample test, too
%cd $DATA_HOME_DIR/sample/test
%mkdir unknown


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data/sample/test

In [133]:
# sample the test set
%cd $DATA_HOME_DIR/test/unknown
g = glob("*.jpg")
shuf = np.random.permutation(g)
for i in range(100):
    #print shuf[i], DATA_HOME_DIR+'/sample/test/unknown/'+shuf[i]
    copyfile(shuf[i], DATA_HOME_DIR+'/sample/test/unknown/'+shuf[i])


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data/test/unknown

Recreate validation set and sample

I messed up when creating my validation set. Per the forum discussions, you want to split the drivers from training to validation (just like the real test set). Do this below


In [3]:
# So, move validation data back.
%cd $DATA_HOME_DIR
for c in categories:
    g = glob("valid/"+c+"/*.jpg")
    for i in range(len(g)): 
        #print g[i], g[i].replace('valid','train')
        os.rename(g[i], g[i].replace('valid','train'))


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data

In [4]:
%cd $DATA_HOME_DIR
!ls sample/
#results  test  train  valid


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data
results  test  train  valid

In [5]:
!rm -rf sample/results/*

In [6]:
!rm sample/test/unknown/*

In [7]:
!rm sample/train/c*/*jpg
!rm sample/valid/c*/*jpg

Examine the driver_imgs_list.csv


In [8]:
%cd $DATA_HOME_DIR
driver_imgs = pd.read_csv('driver_imgs_list.csv')


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data

In [9]:
driver_imgs.head()


Out[9]:
subject classname img
0 p002 c0 img_44733.jpg
1 p002 c0 img_72999.jpg
2 p002 c0 img_25094.jpg
3 p002 c0 img_69092.jpg
4 p002 c0 img_92629.jpg

In [10]:
subjects = driver_imgs.subject.unique()

In [23]:
len(subjects), int(0.2*len(subjects))


Out[23]:
(26, 5)

In [24]:
subjects


Out[24]:
array(['p002', 'p012', 'p014', 'p015', 'p016', 'p021', 'p022', 'p024',
       'p026', 'p035', 'p039', 'p041', 'p042', 'p045', 'p047', 'p049',
       'p050', 'p051', 'p052', 'p056', 'p061', 'p064', 'p066', 'p072',
       'p075', 'p081'], dtype=object)

In [11]:
# let's look at some subject examples
#images = []
def img_div(img,txt,idx):
    W=220
    H=200
    N=4
    float = idx%N < (N-1) # turn off on last one
    fs = ""
    if float:
        fs = 'style="float: left;"'
    s = "<div %s>"%(fs)
    s += "<img width=%dpx height=%dpx src='%s'/>%s"%(W,H,img,txt)
    s += "</div>"
    return s

def show_subjects(subj):
    html = ""
    for j,s in enumerate(subj):
        i = driver_imgs[driver_imgs.subject == s].index[0]
        classname = driver_imgs.iloc[i].classname
        img = driver_imgs.iloc[i].img
        html += img_div("/files/kaggle/statefarm/data/train/"+classname+"/"+img, s, j)
    display(HTML(html))
    
show_subjects(subjects)


p002
p012
p014
p015
p016
p021
p022
p024
p026
p035
p039
p041
p042
p045
p047
p049
p050
p051
p052
p056
p061
p064
p066
p072
p075
p081

In [12]:
# we should probably make sure validation & testing both have male/female representation
# males 
males = ['p002', 'p012', 'p014', 'p015', 'p016', 'p021', 'p024', 'p026', 'p035', 'p039', 'p047', 'p051', 'p056', 'p075']
# females 
females = ['p022', 'p041', 'p042', 'p045', 'p049', 'p050', 'p052', 'p061', 'p064', 'p066', 'p072', 'p081']

In [13]:
show_subjects(males)


p002
p012
p014
p015
p016
p021
p024
p026
p035
p039
p047
p051
p056
p075

In [14]:
show_subjects(females)


p022
p041
p042
p045
p049
p050
p052
p061
p064
p066
p072
p081

In [15]:
len(males), len(females), len(subjects)


Out[15]:
(14, 12, 26)

In [72]:
0.2*len(males), 0.2*len(females)
# okay 3 males & 2 females in our validation set
# choosing p045, p049 females


Out[72]:
(2.8000000000000003, 2.4000000000000004)

In [78]:
set(males).intersection(set(females))


Out[78]:
set()

In [83]:
np.random.permutation(males)[:3]
# gave ['p035', 'p056', 'p075']


Out[83]:
array(['p035', 'p056', 'p075'], 
      dtype='|S4')

In [16]:
# okay this is the set I came up with.  BUT, Jeremy says he only used 3
validation_subjects = ['p021', 'p056', 'p075'] + ['p045', 'p049']
# let's try 3, then.
validation_subjects = ['p021', 'p056'] + ['p045']

In [17]:
show_subjects(validation_subjects)


p021
p056
p045

In [18]:
validation_df = driver_imgs[driver_imgs.subject.isin(validation_subjects)]

In [20]:
# move our validation images from train to valid
for i,x in validation_df.iterrows():
    #if i < 10:
    #    print x.classname, x.img
    fr = DATA_HOME_DIR+'/train/' + x.classname + '/' + x.img
    to = DATA_HOME_DIR+'/valid/' + x.classname + '/' + x.img
    #print fr, to
    os.rename(fr,to)

In [21]:
%cd $DATA_HOME_DIR/valid
# how many images we talking about?
for c in categories:
    g = glob(c+"/*.jpg")
    print c, len(g)


/home/rallen/Documents/Devel/PracticalDL4C/kaggle/statefarm/data/valid
c0 291
c1 286
c2 283
c3 281
c4 289
c5 287
c6 277
c7 239
c8 248
c9 274

In [ ]: