Eitz et al. asked non-expert humans to sketch objects of a given category and gathered 20,000 unique sketches evenly distributed over 250 object categories.

@article{eitz2012hdhso,
    author={Eitz, Mathias and Hays, James and Alexa, Marc},
    title={How Do Humans Sketch Objects?},
    journal={ACM Trans. Graph. (Proc. SIGGRAPH)},
    year={2012},
    volume={31},
    number={4},
    pages = {44:1--44:10}
}

Download

Download png images (525MB):



In [117]:

    
#%%bash
#cd ~/Downloads
#wget http://cybertron.cg.tu-berlin.de/eitz/projects/classifysketch/sketches_png.zip
#unzip sketches_png.zip
# The lines above download and unpack the sketch PNG archive; they are left
# commented out, so nothing is fetched when the notebook is run.



In [118]:

    
# Collect the path of every .jpg under the results directory; IPython captures
# the shell command's output lines into `files`.
# NOTE(review): later cells derive labels from these paths and split train/test
# by position, which presumably relies on the paths arriving grouped by label.
# `find` does not guarantee any ordering — confirm.
files = !find ~/Desktop/res_results_problem_4 -name "*.jpg"
len(files)
#a = process(files[0])
#a.shape









    Out[118]:





100000



In [119]:

    
import os

# Root directory under which the generated dataset is written. Set the
# DRAW_DATASETS_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's location is used.
outpath = os.environ.get('DRAW_DATASETS_DIR',
                         '/Users/drewlinsley/Documents/draw/draw/datasets')
# Dataset name; later cells use it for both the sub-directory and the
# .hdf5 file name.
datasource = 'sketch_uint8_shuffle'

Show a sample sketch



In [120]:

    
#Import libraries for doing image analysis
from skimage.io import imread
from skimage.transform import resize
from sklearn.ensemble import RandomForestClassifier as RF
import glob
import os
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from matplotlib import colors
from pylab import cm
from skimage import segmentation
from skimage.morphology import watershed
from skimage import measure
from skimage import morphology
import numpy as np
import pandas as pd
from scipy import ndimage
from skimage.feature import peak_local_max
# make graphics inline
%matplotlib inline



In [121]:

    
# Preview the preprocessing on the first image: binarize, dilate, downsample.
image = imread(files[0], as_grey=True)
# Pixels strictly brighter than the image mean become 0.0, all others 1.0.
imagethr = np.where(image > np.mean(image),0.,1.0)
# Dilate the binary image with a 16x16 block of ones.
imdilated = morphology.dilation(imagethr, np.ones((16,16)))
# Downsample the dilated image to 56x56.
im1 = resize(imdilated,[56,56])

# One row of three panels: thresholded, dilated, resized.
fig, axes = plt.subplots(1, 3, figsize=(12,3))
for ax, panel in zip(axes, (imagethr, imdilated, im1)):
    ax.imshow(panel, cmap=cm.gray)
plt.show()

Read each image and binarize it into a flat 0/1 vector



In [122]:

    
def process(fname):
    """Load one image and return it as a flat binary vector.

    The image is read in greyscale. Pixels strictly brighter than the image
    mean map to 0 and every other pixel maps to 1.

    Parameters
    ----------
    fname : path of the image file, handed to ``imread``.

    Returns
    -------
    1-D ``np.int8`` array of 0/1 values, one entry per pixel.
    """
    grey = imread(fname, as_grey=True)
    above_mean = grey > np.mean(grey)
    binary = np.where(above_mean, 0., 1.0)
    return binary.ravel().astype(np.int8)

forever divided by eight is also forever :(



In [123]:

    
import multiprocessing as mp
NUM_PROCESSES = 8
# Preprocess all images in parallel; `results` keeps the order of `files`.
pool = mp.Pool(NUM_PROCESSES)
try:
    results = pool.map(process, files, chunksize=100)
except BaseException:
    # On any failure (including an interrupt) stop the workers immediately so
    # they are not left running in the background.
    pool.terminate()
    raise
else:
    # Normal completion: no more work will be submitted.
    pool.close()
finally:
    # Always wait for the worker processes to exit.
    pool.join()



In [124]:

    
# The label is the second-to-last '_'-separated token of each path.
# A list comprehension is used instead of map() so that np.array receives a
# real sequence on Python 3 as well (map() returns an iterator there).
y = np.array([f.split('_')[-2] for f in files])
# Column vector of uint8 labels, one row per image.
y = y.reshape(-1,1).astype(np.uint8)
y









    Out[124]:





array([[0],
       [0],
       [0],
       ..., 
       [1],
       [1],
       [1]], dtype=uint8)



In [125]:

    
# Stack the per-image vectors into one 2-D array with one row per image.
X = np.array(results)
N, image_size = X.shape
# Side length of one image, assuming the flattened images are square.
D = int(np.sqrt(image_size))
N, image_size, D









    Out[125]:





(100000, 1024, 32)

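Hold out 5% of the data from each end as the test set (one slice per +/- type), then shuffle the order of the images within the train and test sets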



In [126]:

    
num_els = y.shape[0]
test_size = int(num_els * (.1/2)) #/2 because +/- types
# Test rows are taken from both ends of the data and everything in between is
# used for training. np.arange always yields an integer array, even when a
# range is empty (np.asarray(range(0, 0)) would yield float64, which cannot be
# used as an index).
# NOTE(review): the pos/neg names assume the rows are grouped by label; the
# label output above shows 0s first and 1s last — confirm the naming.
pos_test_id = np.arange(0, test_size)
neg_test_id = np.arange(num_els - test_size, num_els)
train_id = np.arange(test_size, num_els - test_size)



In [127]:

    
# Gather the held-out rows from both ends of the data with one index vector.
test_id = np.hstack((pos_test_id,neg_test_id))
test_y, test_X = y[test_id], X[test_id]
N_test = len(test_y)
# Sum of the held-out labels.
np.sum(test_y)









    Out[127]:





5000



In [128]:

    
# Every row between the two held-out end slices goes to the training set.
train_y = y[train_id]
train_X = X[train_id]
N_train = train_y.shape[0]
# Sum of the training labels.
np.sum(train_y)









    Out[128]:





45000



In [129]:

    
import random
# Fixed seed so the row order written to the dataset is reproducible.
SHUFFLE_SEED = 0
rng = random.Random(SHUFFLE_SEED)
# Draw one permutation per subset; range() works on both Python 2 and 3.
test_s = rng.sample(range(test_y.shape[0]),test_y.shape[0])
train_s = rng.sample(range(train_y.shape[0]),train_y.shape[0])
# Apply the same permutation to features and labels so the pairs stay aligned.
# Note: this overwrites the arrays, so re-running the cell permutes them again.
test_X=test_X[test_s]
train_X=train_X[train_s]
test_y=test_y[test_s]
train_y=train_y[train_s]



In [130]:

    
# Check the dtype of the training labels.
train_y.dtype









    Out[130]:





dtype('uint8')

HDF5

Save all images in a format that can be used by fuel



In [131]:

    
import fuel
# The dataset lives in <outpath>/<datasource>/<datasource>.hdf5.
datasource_dir = os.path.join(outpath, datasource)
# Create the directory in pure Python rather than through the shell, so paths
# containing spaces or shell metacharacters are handled correctly.
if not os.path.isdir(datasource_dir):
    os.makedirs(datasource_dir)
datasource_fname = os.path.join(datasource_dir , datasource+'.hdf5')
datasource_fname









    Out[131]:





'/Users/drewlinsley/Documents/draw/draw/datasets/sketch_uint8_shuffle/sketch_uint8_shuffle.hdf5'



In [132]:

    
import h5py
# Open the output file for writing (mode='w') and create a uint8 dataset with
# one row per image and one column per pixel.
fp = h5py.File(datasource_fname, mode='w')
image_features = fp.create_dataset('features', (N, image_size), dtype='uint8')



In [133]:

    
# Store the training rows first, followed by the test rows.
image_features[...] = np.vstack((train_X,test_X))



In [134]:

    
# One uint8 label per image, stored as a single column.
targets = fp.create_dataset('targets', (N, 1), dtype='uint8')



In [135]:

    
# Labels are stacked training-first, then test, and written as one column.
targets[...] = np.vstack((train_y,test_y)).reshape(-1,1)



In [136]:

    
from fuel.datasets.hdf5 import H5PYDataset
# Rows [0, N_train) hold the training examples and rows [N_train, N) the test
# examples; the 'features' and 'targets' sources share the same row ranges.
row_ranges = {'train': (0, N_train), 'test': (N_train, N)}
split_dict = dict(
    (subset, {'features': span, 'targets': span})
    for subset, span in row_ranges.items()
)
# Record the split layout in the file's 'split' attribute.
fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)



In [137]:

    
# Flush pending writes, then close the HDF5 file.
fp.flush()
fp.close()



In [138]:

    
# List the written file to check that it exists and see its size.
!ls -l {datasource_fname}









    



-rw-r--r--  1 drewlinsley  staff  102502800 Mar 18 18:37 /Users/drewlinsley/Documents/draw/draw/datasets/sketch_uint8_shuffle/sketch_uint8_shuffle.hdf5



In [139]:

    
# Disabled: copies the dataset file to the S3 bucket and grants read access to all users.
#!aws s3 cp {datasource_fname} s3://udidraw/ --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers

Look at the training set



In [140]:

    
# Re-open the saved file through fuel, selecting only the 'train' split.
train_set = H5PYDataset(datasource_fname, which_sets=('train',))



In [141]:

    
# Number of examples in the selected split.
train_set.num_examples









    Out[141]:





90000



In [142]:

    
# Names of the data sources the dataset provides.
train_set.provides_sources









    Out[142]:





(u'features', u'targets')



In [143]:

    
# Open the dataset and fetch the first 16 training examples.
handle = train_set.open()
data = train_set.get_data(handle, slice(0, 16))
# data[0] holds the features and data[1] the targets.
data[0].shape,data[1].shape









    Out[143]:





((16, 1024), (16, 1))



In [144]:

    
# Targets of the 16 fetched examples.
data[1]









    Out[144]:





array([[0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1]], dtype=uint8)



In [145]:

    
# Show the 16 fetched examples in a 4x4 grid, each titled with its target.
fig, axes = plt.subplots(4, 4, figsize=(12,12))
for i, ax in enumerate(axes.ravel()):
    # Each feature row is a flattened D x D image.
    ax.imshow(data[0][i].reshape(D,D), cmap=cm.gray)
    ax.set_title(data[1][i][0])



In [146]:

    
# Release the handle obtained from train_set.open().
train_set.close(handle)