In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import zipfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

from skimage import color, io
from scipy.misc import imresize

np.random.seed(133)

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

IMAGE_SIZE = 224

First, load the data from the Kaggle


In [2]:
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, url, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url , filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(filename)
  if statinfo.st_size:# == expected_bytes:
    print('Found and verified', filename)
  else:
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

test_filename = maybe_download('test.zip', 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/5441/test.zip?sv=2015-12-11&sr=b&sig=YtsCaH8gL7dObP11aL7iD9VVaJ%2BGtnls3%2FzBiE8vfjE%3D&se=2017-02-26T15%3A55%3A36Z&sp=r', 71303168)
train_filename = maybe_download('train.zip', 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/5441/train.zip?sv=2015-12-11&sr=b&sig=7UzYtGnmwvxodWZtaMVFjzmfSXLUM%2FMVjOpmtBZId28%3D&se=2017-02-26T16%3A02%3A58Z&sp=r', 566181888)


Found and verified test.zip
Found and verified train.zip

Extract the images


In [3]:
def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data from %s. This may take a while. Please wait.' % filename)

    zip = zipfile.ZipFile(filename)
    sys.stdout.flush()
    zip.extractall()
    zip.close()
  return root+'/'
  
train_folder = maybe_extract(train_filename)
test_folder = maybe_extract(test_filename)


train already present - Skipping extraction of train.zip.
test already present - Skipping extraction of test.zip.

Build lists of files and display some random image to verify if it works


In [4]:
train_images = [train_folder+i for i in os.listdir(train_folder)]
#train_labels = ['dog' in i for i in train_images]
#train_dogs =   [train_folder+i for i in os.listdir(train_folder) if 'dog' in i]
#train_cats =   [train_folder+i for i in os.listdir(train_folder) if 'cat' in i]
test_images =  [test_folder+i for i in os.listdir(test_folder)]

random_image=np.random.choice(train_images)
print (random_image)
image=Image(random_image)
display(image)

random_image=np.random.choice(test_images)
print (random_image)
display(Image(random_image))


train/cat.7590.jpg
test/5053.jpg

Now let's see what the images look like. Let's examin their shapes


In [5]:
from PIL import Image as image

dimensions_train = np.matrix([image.open(i).size for i in train_images],dtype=np.float32)
dimensions_test = np.matrix([image.open(i).size for i in test_images])

print(dimensions_train.shape)
print(dimensions_test.shape)


(25000, 2)
(12500, 2)

In [6]:
aspect_train = dimensions_train[:,0]/dimensions_train[:,1]
#print(aspect_train)

print ("Training set:")
print ("min: %s" % np.min(dimensions_train, axis=0))
print ("max: %s" % np.max(dimensions_train, axis=0))
print ("mean: %s" % np.mean(dimensions_train, axis=0))
print ("median: %s" % np.median(dimensions_train, axis=0))
print ("stdev: %s" % np.std(dimensions_train, axis=0))
print ("aspect min: %s" % np.min(aspect_train))
print ("aspect max: %s" % np.max(aspect_train))
print ("aspect mean: %s" % np.mean(aspect_train))
print ("aspect stdev: %s" % np.std(aspect_train))

print ("Test set:")
print ("min: %s" % np.min(dimensions_test, axis=0))
print ("max: %s" % np.max(dimensions_test, axis=0))
print ("mean: %s" % np.mean(dimensions_test, axis=0))
print ("median: %s" % np.median(dimensions_test, axis=0))
print ("stdev: %s" % np.std(dimensions_test, axis=0))


Training set:
min: [[ 42.  32.]]
max: [[ 1050.   768.]]
mean: [[ 404.09902954  360.47808838]]
median: [[ 447.  374.]]
stdev: [[ 109.03631592   97.01548767]]
aspect min: 0.306613
aspect max: 5.90909
aspect mean: 1.1572
aspect stdev: 0.291908
Test set:
min: [[37 44]]
max: [[500 500]]
mean: [[ 404.22448  359.93072]]
median: [[ 447.  374.]]
stdev: [[ 109.32650113   96.75354092]]

In [7]:
plt.hist(aspect_train, bins='auto', log=True) 
plt.title("Aspect ratio Histogram (log scale)")
plt.show()


Let's find images with extreme aspects.

When scaled such images can produce wiered output that can be misleading. Empirically set aspect cutoff to 1:2


In [8]:
low_pct_aspect=np.percentile(aspect_train,0.2)
high_pct_aspect=np.percentile(aspect_train,99.8)
# empirically set aspect cutoff to 1:2
low_pct_aspect=0.5
high_pct_aspect=2.0
low_pct_aspect_indices=[i for i in xrange(len(aspect_train)) if aspect_train[i]<low_pct_aspect]
high_pct_aspect_indices=[i for i in xrange(len(aspect_train)) if aspect_train[i]>high_pct_aspect]

In [9]:
print(low_pct_aspect_indices)

def display_train_image_by_idx(idx):
    display(Image(train_images[idx]))

for i in low_pct_aspect_indices:
    display_train_image_by_idx(i)
    print(aspect_train[i])
    print(train_images[i])


[270, 396, 1059, 2054, 2176, 2370, 2796, 2813, 2814, 3105, 3806, 4022, 4414, 4875, 5141, 5437, 5544, 5629, 6071, 6075, 7243, 7430, 7555, 7959, 8296, 8878, 9060, 9279, 9608, 9668, 10129, 10356, 11126, 11879, 11964, 12079, 12520, 12606, 12950, 13002, 13114, 13575, 13678, 14270, 14988, 15225, 15896, 15927, 15966, 15969, 16428, 17214, 17275, 17420, 17708, 18013, 18304, 18342, 18836, 18929, 18958, 19699, 19725, 20094, 21447, 21646, 21905, 22395, 22505, 23234, 23455, 23583, 24212, 24223, 24352, 24354, 24585]
[[ 0.42399999]]
train/dog.1741.jpg
[[ 0.49098197]]
train/dog.806.jpg
[[ 0.49899799]]
train/cat.7098.jpg
[[ 0.49000001]]
train/cat.3030.jpg
[[ 0.44800001]]
train/cat.9954.jpg
[[ 0.47389558]]
train/dog.5277.jpg
[[ 0.44419643]]
train/dog.3380.jpg
[[ 0.49000001]]
train/dog.1294.jpg
[[ 0.47477746]]
train/dog.7400.jpg
[[ 0.48605579]]
train/dog.7778.jpg
[[ 0.44360903]]
train/dog.8142.jpg
[[ 0.47695389]]
train/cat.9926.jpg
[[ 0.4729459]]
train/dog.1047.jpg
[[ 0.45691383]]
train/cat.5964.jpg
[[ 0.43486974]]
train/dog.5880.jpg
[[ 0.49735451]]
train/dog.12143.jpg
[[ 0.42399999]]
train/dog.1483.jpg
[[ 0.472]]
train/cat.8693.jpg
[[ 0.38660908]]
train/cat.10119.jpg
[[ 0.45691383]]
train/dog.3135.jpg
[[ 0.49498999]]
train/cat.1680.jpg
[[ 0.47695389]]
train/cat.10958.jpg
[[ 0.37593985]]
train/dog.7857.jpg
[[ 0.37089202]]
train/cat.8755.jpg
[[ 0.46245059]]
train/dog.10199.jpg
[[ 0.49399999]]
train/cat.5851.jpg
[[ 0.49653581]]
train/dog.12476.jpg
[[ 0.44999999]]
train/dog.8739.jpg
[[ 0.40681362]]
train/cat.5929.jpg
[[ 0.45535713]]
train/cat.188.jpg
[[ 0.35545024]]
train/cat.5981.jpg
[[ 0.41999999]]
train/cat.8902.jpg
[[ 0.47400001]]
train/dog.10249.jpg
[[ 0.44642857]]
train/dog.3330.jpg
[[ 0.43000001]]
train/cat.1723.jpg
[[ 0.37688443]]
train/cat.664.jpg
[[ 0.368]]
train/cat.11512.jpg
[[ 0.45222929]]
train/dog.7019.jpg
[[ 0.4474273]]
train/cat.12420.jpg
[[ 0.49200001]]
train/cat.1214.jpg
[[ 0.30661324]]
train/dog.10749.jpg
[[ 0.47987616]]
train/cat.3799.jpg
[[ 0.48096192]]
train/dog.418.jpg
[[ 0.37675351]]
train/cat.10988.jpg