Author(s): bfoo@google.com, kozyr@google.com
In this notebook, we gather exploratory data from our training set to do feature engineering and model tuning. Before running this notebook, make sure that you have copied a small sample of the images to your local machine by running:
mkdir -p ~/data/training_small
gsutil -m cp gs://$BUCKET/catimages/training_images/000*.png ~/data/training_small/
gsutil -m cp gs://$BUCKET/catimages/training_images/001*.png ~/data/training_small/
mkdir -p ~/data/debugging_small
gsutil -m cp gs://$BUCKET/catimages/training_images/002*.png ~/data/debugging_small/
echo "done!"
Note that we only take the images starting with those IDs to limit the number we'll copy over to only a few thousand images.
In [0]:
# Enter your username (used below to build the data paths under your home directory):
YOUR_GMAIL_ACCOUNT = '******' # Whatever is before @gmail.com in your email address
In [0]:
# Libraries for this section:
import os
import cv2
import pickle
import numpy as np
from sklearn import preprocessing
In [0]:
# Directories (all under /home/<YOUR_GMAIL_ACCOUNT>/, matching the gsutil copy above):
PREPROC_DIR = os.path.join('/home', YOUR_GMAIL_ACCOUNT, 'data/')  # Where the preprocessed feature pickles are written.
TRAIN_DIR = os.path.join('/home', YOUR_GMAIL_ACCOUNT, 'data/training_small/') # Where the training dataset lives.
DEBUG_DIR = os.path.join('/home', YOUR_GMAIL_ACCOUNT, 'data/debugging_small/') # Where the debugging dataset lives.
In [0]:
def general_img_features(band):
    """Compute summary statistics for one color channel of an image.

    Args:
        band: 2-d array holding a single channel (blue, green, or red).

    Returns:
        List of six values: number of distinct pixel values, count of
        nonzero pixels, mean, standard deviation, minimum, and maximum.
    """
    flat = band.ravel()
    distinct_count = len(set(flat))
    nonzero_count = np.count_nonzero(band)
    return [distinct_count, nonzero_count,
            np.mean(band), np.std(band),
            band.min(), band.max()]
def concat_all_band_features(file, dir):
    """Extract per-channel summary features from a single image.

    Note: cv2.imread returns pixels in BGR order, so channel 0 is blue,
    1 is green, and 2 is red.

    Args:
        file: single image filename.
        dir: directory where the image files are stored.

    Returns:
        features: list of 18 descriptive pixel statistics -- six per
            channel (see general_img_features): blue at indices 0-5,
            green at 6-11, red at 12-17.  (The original comments claimed
            0-4/5-9/10-14, which undercounted the six features per band.)
    """
    img = cv2.imread(os.path.join(dir, file))
    features = []
    # Channels in BGR order: 0 = blue, 1 = green, 2 = red.
    for channel in range(3):
        band = np.float32(img[:, :, channel])
        features.extend(general_img_features(band))
    return features
In [0]:
def harris_density(harris_img, square_size, threshold):
"""Apply Harris Corner Detection to image and get count of corners.
Args:
harris_img: image already processed by Harris Corner Detector (in cv2 package).
square_size: number of pixels per side of the window in which we detect corners.
threshold: indicates how "sharp" that corner must be to be detected.
Returns:
bins - counts in each bin of histogram.
"""
max_val = harris_img.max()
shape = harris_img.shape
bins = [0] * (square_size * square_size + 1)
for row in xrange(0, shape[0], square_size):
for col in xrange(0, shape[1], square_size):
bin_val = sum(sum(harris_img[row: row + square_size,
col: col + square_size] > threshold * max_val))
bins[int(bin_val)] += 1
return bins
In [0]:
def get_features(img_path):
    """Engineer the full feature vector for one image.

    Combines per-channel summary statistics, the same statistics on the
    Harris corner-detector responses, and a corner-density histogram.

    Args:
        img_path: filepath to the image file.

    Returns:
        features: list of engineered feature values.
    """
    img = cv2.imread(img_path)
    # cv2.imread returns channels in BGR order (this function itself reads
    # channel 0 as blue below), so the correct conversion is BGR2GRAY.
    # The original used COLOR_RGB2GRAY, which swaps the red/blue
    # luminance weights.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blue = np.float32(img[:, :, 0])
    green = np.float32(img[:, :, 1])
    red = np.float32(img[:, :, 2])
    # Summary statistics on the raw channels.
    features = general_img_features(gray)
    features.extend(general_img_features(blue))
    features.extend(general_img_features(green))
    features.extend(general_img_features(red))
    # Harris corner-detector response for each channel.
    gray = cv2.cornerHarris(gray, 2, 3, 0.04)
    blue = cv2.cornerHarris(blue, 2, 3, 0.04)
    green = cv2.cornerHarris(green, 2, 3, 0.04)
    red = cv2.cornerHarris(red, 2, 3, 0.04)
    # Summary statistics on the Harris responses.
    features.extend(general_img_features(gray))
    features.extend(general_img_features(blue))
    features.extend(general_img_features(green))
    features.extend(general_img_features(red))
    # Corner-density histogram on the grayscale Harris response.
    features.extend(harris_density(gray, 4, 0.05))
    return features
In [0]:
def get_features_and_labels(dir):
    """Compute feature vectors and labels for every image in a directory.

    The label is parsed from the filename, which is assumed to end in
    '_<label>.<ext>' -- TODO confirm this naming scheme against the
    image-generation pipeline.

    Args:
        dir: directory containing the image files.

    Returns:
        features: 2-d np array with one row of features per image.
        labels: 1-d np array of labels (float64, matching the original
            np.append behavior).
    """
    filenames = os.listdir(dir)
    print('\nImages processed (out of {:d})...'.format(len(filenames)))
    # Accumulate in Python lists and convert once at the end: calling
    # np.append inside the loop copies the whole array each iteration
    # (O(n^2) overall).
    feature_rows = []
    labels = []
    for filename in filenames:
        feature_rows.append(get_features(os.path.join(dir, filename)))
        # Label is the last underscore-separated token, minus the extension.
        labels.append(int(filename.split('_')[-1].split('.')[0]))
        if len(feature_rows) % 100 == 0:
            print(len(feature_rows))
    print(len(feature_rows))
    return np.array(feature_rows), np.array(labels, dtype=np.float64)
In [0]:
# Feature extraction is computationally expensive, so we run it only on the
# small training/debugging subsets copied down from GCS above:
training_features, training_labels = get_features_and_labels(TRAIN_DIR)
debugging_features, debugging_labels = get_features_and_labels(DEBUG_DIR)
print('\nDone!')
In [0]:
# Standardize features. The scaler is fit on the training set only, so the
# debugging set is transformed with the training mean/variance (no leakage).
standardizer = preprocessing.StandardScaler().fit(training_features)
training_std = standardizer.transform(training_features)
debugging_std = standardizer.transform(debugging_features)

def _save_pickle(obj, filename):
    """Pickle obj to PREPROC_DIR/filename, closing the file handle."""
    # 'wb', not 'w': pickle writes bytes, and text mode fails on Python 3.
    # The with-statement guarantees the handle is closed (the original
    # leaked open file objects).
    with open(os.path.join(PREPROC_DIR, filename), 'wb') as f:
        pickle.dump(obj, f)

# Save features and labels as pkl:
_save_pickle(training_std, 'training_std.pkl')
_save_pickle(debugging_std, 'debugging_std.pkl')
_save_pickle(training_labels, 'training_labels.pkl')
_save_pickle(debugging_labels, 'debugging_labels.pkl')
print('\nFeature engineering is complete!')