In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import zipfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from skimage import color, io
from scipy.misc import imresize
np.random.seed(133)
# Configure matplotlib to plot inline in IPython
%matplotlib inline
IMAGE_SIZE = 224
First, download the data from Kaggle.
In [2]:
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 1% change in download progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)
    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()
        last_percent_reported = percent

def maybe_download(filename, url, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(filename)
    if statinfo.st_size:  # == expected_bytes: (strict size check disabled)
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename
test_filename = maybe_download('test.zip', 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/5441/test.zip?sv=2015-12-11&sr=b&sig=YtsCaH8gL7dObP11aL7iD9VVaJ%2BGtnls3%2FzBiE8vfjE%3D&se=2017-02-26T15%3A55%3A36Z&sp=r', 71303168)
train_filename = maybe_download('train.zip', 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/5441/train.zip?sv=2015-12-11&sr=b&sig=7UzYtGnmwvxodWZtaMVFjzmfSXLUM%2FMVjOpmtBZId28%3D&se=2017-02-26T16%3A02%3A58Z&sp=r', 566181888)
Extract the images
In [3]:
def maybe_extract(filename, force=False):
    root = os.path.splitext(filename)[0]  # remove .zip
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data from %s. This may take a while. Please wait.' % filename)
        zip_file = zipfile.ZipFile(filename)
        sys.stdout.flush()
        zip_file.extractall()
        zip_file.close()
    return root + '/'
train_folder = maybe_extract(train_filename)
test_folder = maybe_extract(test_filename)
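As a quick sanity check (an addition, not part of the original notebook), we can confirm that both archives actually produced files before building the file lists:
In [ ]:
# Sanity-check sketch: count the files extracted into each folder.
print('train files:', len(os.listdir(train_folder)))
print('test files:', len(os.listdir(test_folder)))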
Build lists of file paths and display a random image from each set to verify that the data loaded correctly.
In [4]:
train_images = [train_folder + i for i in os.listdir(train_folder)]
#train_labels = ['dog' in i for i in train_images]
#train_dogs = [train_folder+i for i in os.listdir(train_folder) if 'dog' in i]
#train_cats = [train_folder+i for i in os.listdir(train_folder) if 'cat' in i]
test_images = [test_folder + i for i in os.listdir(test_folder)]

random_image = np.random.choice(train_images)
print(random_image)
display(Image(random_image))

random_image = np.random.choice(test_images)
print(random_image)
display(Image(random_image))
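The commented-out lines above already hint at how labels can be derived from the file names ('dog' vs. 'cat' prefixes). A quick class-balance check along those lines (an addition, not in the original notebook) might look like:
In [ ]:
# Sketch: derive boolean labels from the file names and check the class balance.
train_labels = ['dog' in os.path.basename(i) for i in train_images]
print('dogs:', sum(train_labels))
print('cats:', len(train_labels) - sum(train_labels))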
Now let's see what the images look like by examining their shapes.
In [5]:
from PIL import Image as image
dimensions_train = np.matrix([image.open(i).size for i in train_images],dtype=np.float32)
dimensions_test = np.matrix([image.open(i).size for i in test_images])
print(dimensions_train.shape)
print(dimensions_test.shape)
In [6]:
aspect_train = dimensions_train[:,0]/dimensions_train[:,1]
#print(aspect_train)
print ("Training set:")
print ("min: %s" % np.min(dimensions_train, axis=0))
print ("max: %s" % np.max(dimensions_train, axis=0))
print ("mean: %s" % np.mean(dimensions_train, axis=0))
print ("median: %s" % np.median(dimensions_train, axis=0))
print ("stdev: %s" % np.std(dimensions_train, axis=0))
print ("aspect min: %s" % np.min(aspect_train))
print ("aspect max: %s" % np.max(aspect_train))
print ("aspect mean: %s" % np.mean(aspect_train))
print ("aspect stdev: %s" % np.std(aspect_train))
print ("Test set:")
print ("min: %s" % np.min(dimensions_test, axis=0))
print ("max: %s" % np.max(dimensions_test, axis=0))
print ("mean: %s" % np.mean(dimensions_test, axis=0))
print ("median: %s" % np.median(dimensions_test, axis=0))
print ("stdev: %s" % np.std(dimensions_test, axis=0))
In [7]:
plt.hist(aspect_train, bins='auto', log=True)
plt.title("Aspect ratio Histogram (log scale)")
plt.show()
In [8]:
low_pct_aspect = np.percentile(aspect_train, 0.2)
high_pct_aspect = np.percentile(aspect_train, 99.8)
# empirically set aspect cutoff to 1:2
low_pct_aspect = 0.5
high_pct_aspect = 2.0
low_pct_aspect_indices = [i for i in range(len(aspect_train)) if aspect_train[i] < low_pct_aspect]
high_pct_aspect_indices = [i for i in range(len(aspect_train)) if aspect_train[i] > high_pct_aspect]
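It is also useful to know how many images actually fall outside the 1:2 band; a quick count (an addition, not original output) would be:
In [ ]:
# Sketch: count the training images outside the chosen aspect-ratio band.
print('too narrow (< %.1f):' % low_pct_aspect, len(low_pct_aspect_indices))
print('too wide   (> %.1f):' % high_pct_aspect, len(high_pct_aspect_indices))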
In [9]:
print(low_pct_aspect_indices)

def display_train_image_by_idx(idx):
    display(Image(train_images[idx]))

for i in low_pct_aspect_indices:
    display_train_image_by_idx(i)
    print(aspect_train[i])
    print(train_images[i])
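The IMAGE_SIZE constant and the imresize/color imports from the first cell are not used in this section, which suggests the next step is resizing every image to a fixed IMAGE_SIZE x IMAGE_SIZE array. A minimal sketch of that step (an assumption about the intended preprocessing, using a simple squash that ignores aspect ratio) might look like:
In [ ]:
# Sketch (assumption): load one image and squash it to IMAGE_SIZE x IMAGE_SIZE.
def load_and_resize(path, size=IMAGE_SIZE):
    img = io.imread(path)               # H x W x 3 uint8 array
    return imresize(img, (size, size))  # resize, ignoring the original aspect ratio

# Example: resize the first training image and display it.
# plt.imshow(load_and_resize(train_images[0])); plt.show()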