Logistic regression

Building a logistic regression classifier to distinguish 'plane' and 'car' in CIFAR-10 dataset

  • implement a fully-vectorized loss function for the Logistic Regression
  • implement the fully-vectorized expression for its analytic gradient
  • check implementation using numerical gradient
  • use a validation set to tune the learning rate and regularization strength
  • optimize the loss function with Batch Gradient Descent and Stochastic Gradient Descent

In [1]:
# Setup code for this notebook
import random 
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic gto make matplotlib figures appear inline
# in the notebook rather than in a new window
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

Download CIFAR-10 Data

I use the loading function from course code from Stanford University

Run get_datasets.sh in terminal to download the datasets, or download from Alex Krizhevsky.

get_datasets.sh

# Get CIFAR10
wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar -xzvf cifar-10-python.tar.gz
rm cifar-10-python.tar.gz

The results of the downloading is showed in following figure.

Write function to load data in data_utils.py


In [2]:
# Write function to load the cifar-10 data
# The original code is from http://cs231n.github.io/assignment1/
# The function is in data_utils.py file for reusing.
import cPickle as pickle
import numpy as np
import os

def load_CIFAR_batch(filename):
  """ load single batch of cifar """
  with open(filename, 'r') as f:
    datadict = pickle.load(f)
    X = datadict['data']
    Y = datadict['labels']
    X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
    Y = np.array(Y)
    return X, Y

def load_CIFAR10(ROOT):
  """ load all of cifar """
  xs = []
  ys = []
  for b in range(1,6):
    f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
    X, Y = load_CIFAR_batch(f)
    xs.append(X)
    ys.append(Y)    
  Xtr = np.concatenate(xs)
  Ytr = np.concatenate(ys)
  del X, Y
  Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
  return Xtr, Ytr, Xte, Yte

Load data and visualize some samples


In [3]:
from algorithms.data_utils import load_CIFAR10

def visualize_sample(X_train, y_train, samples_per_class=7):
    """visualize some samples in the training datasets """
    classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    num_classes = len(classes)
    for y, cls in enumerate(classes):
        idxs = np.flatnonzero(y_train == y) # get all the indexes of cls
        idxs = np.random.choice(idxs, samples_per_class, replace=False)
        for i, idx in enumerate(idxs): # plot the image one by one
            plt_idx = i * num_classes + y + 1 # i*num_classes and y+1 determine the row and column respectively
            plt.subplot(samples_per_class, num_classes, plt_idx)
            plt.imshow(X_train[idx].astype('uint8'))
            plt.axis('off')
            if i == 0:
                plt.title(cls)
    plt.show()

def get_CIFAR10_data(num_training=49000, num_val=1000, num_test=10000, show_sample=True):
    """
    Load the CIFAR-10 dataset, show samples and perform perprocessing to prepare it for 
    the classifier.
    - show_sample: determine whether plot samples
    """

    cifar10_dir = 'datasets/datasets-cifar-10/cifar-10-batches-py/'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
    
    if show_sample:
        visualize_sample(X_train, y_train)
        
    # subsample the data for validation set
    mask = xrange(num_training, num_training + num_val)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = xrange(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = xrange(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]
    
    # Preprocessing: reshape the image data into rows
    X_train = np.reshape(X_train, (X_train.shape[0], -1)) # [49000, 3072]
    X_val = np.reshape(X_val, (X_val.shape[0], -1)) # [1000, 3072]
    X_test = np.reshape(X_test, (X_test.shape[0], -1)) # [10000, 3072]
    
    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis = 0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    
    # Add bias dimension and transform into columns
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))]).T
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))]).T
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))]).T
    
    return X_train, y_train, X_val, y_val, X_test, y_test

# Invoke the above function to get our data
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()


# Subset 'plane' and 'car' classes to perform logistic regression
idxs = np.logical_or(y_train == 0, y_train == 1)
X_train = X_train[:, idxs]
y_train = y_train[idxs]

# validation set
idxs = np.logical_or(y_val == 0, y_val == 1)
X_val = X_val[:, idxs]
y_val = y_val[idxs]

# test set
idxs = np.logical_or(y_test == 0, y_test == 1)
X_test = X_test[:, idxs]
y_test = y_test[idxs]

# As a sanity check, we print out th size of the training and test data dimenstion
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape


Train data shape:  (3073, 9794)
Train labels shape:  (9794,)
Validation data shape:  (3073, 206)
Validation labels shape:  (206,)
Test data shape:  (3073, 2000)
Test labels shape:  (2000,)

Logisic Regression Classifier

The code is running in the backend, you can find it here.


In [8]:
# Test the loss and gradient and compare between two implementations
from algorithms.classifiers import loss_grad_logistic_naive, loss_grad_logistic_vectorized
import time

In [6]:



LinearRegression.html     LogisticRegression.ipynb  datasets/
LinearRegression.ipynb    algorithms/               images/

In [ ]: