In [ ]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

Support Vector Machines

Classification Using SVM

This tutorial is based on an EPFL machine learning course. Load dataset. We will use the CERN dataset, available from a previous EPFL machine learning challenge. You can download the data here: https://inclass.kaggle.com/c/epfml-project-1/data


In [ ]:
from proj1_helpers import load_csv_data
# Location of the training CSV, given relative to this notebook.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here 
# Unpacks three values from the helper; presumably labels, feature matrix and
# sample ids, in that order -- TODO confirm against proj1_helpers.load_csv_data.
y, x, ids = load_csv_data(DATA_TRAIN_PATH)
# TODO: convert labels to -1,1 ?
# NOTE(review): the SVM functions below document y as +1 or -1 labels, so check
# the encoding returned by load_csv_data before training.

## Note: This is the raw dataset, you can also work with your modified features if you prefer

In [ ]:
def calculate_cost(y, x, w, lambda_):
    """Return the full primal SVM objective, i.e. the loss plus the regularizer.

    Exercise stub: the body still has to be written.

    Args:
        y: the +1 or -1 labels matching the rows of ``x``.
        x: the full dataset matrix.
        w: the parameter vector at which the objective is evaluated.
        lambda_: the regularization parameter.

    Raises:
        NotImplementedError: always, until the exercise is completed.
    """
    # ***************************************************
    # INSERT YOUR CODE HERE
    # TODO: evaluate the loss over the whole dataset and add the regularizer
    # ***************************************************
    raise NotImplementedError()

Stochastic Gradient Descent for SVM

Compute the (stochastic) subgradient for the n-th summand of the SVM optimization objective


In [ ]:
def calculate_gradient(y, x, w, lambda_, n):
    """Return the stochastic (sub)gradient of the loss plus the regularizer.

    Exercise stub: the body still has to be written.

    Args:
        y: the +1 or -1 label of the selected datapoint.
        x: a single datapoint.
        w: the current parameter vector.
        lambda_: the regularization parameter.
        n: presumably the index of the sampled summand -- TODO confirm
            once the caller is implemented.

    Raises:
        NotImplementedError: always, until the exercise is completed.
    """
    # ***************************************************
    # INSERT YOUR CODE HERE
    # TODO
    # ***************************************************
    # Mind the constant N (dataset size) factor: the complete SVM objective is
    # a sum over datapoints, not an average as in the earlier SGD examples!
    raise NotImplementedError()

Implement stochastic gradient descent: Pick a data point uniformly at random and update w based on the gradient for the n-th summand of the objective


In [ ]:
def sgd_for_svm_demo(y, x):
    """Train a linear SVM on ``(x, y)`` with stochastic gradient descent.

    Exercise template: the hyper-parameters and the reporting are provided,
    while the sampling and update steps inside the loop are left as TODOs.
    Until they are filled in, the function raises ``NotImplementedError``.

    Args:
        y: labels; ``y.shape[0]`` is used as the number of examples.
        x: data matrix; ``x.shape[1]`` is used as the number of features.

    Raises:
        NotImplementedError: while the TODO steps are not implemented.
    """
    # ***************************************************
    # INSERT YOUR CODE HERE
    # classify the data by SGD for SVM: TODO
    # ***************************************************
    max_iter = 10000
    gamma = 0.001               # Step-size
    lambda_ = 1.0 / y.shape[0]  # or set to a different value, try cross-validation!

    # Start from the all-zero weight vector, one entry per feature column.
    w = np.zeros((x.shape[1], 1))

    # `iteration` rather than `iter`, so the builtin iter() is not shadowed.
    for iteration in range(max_iter):
        # n = sample one data point uniformly at random data from x
        # NotImplementedError is the exception type; the NotImplemented constant
        # is not raisable and would surface as a confusing TypeError instead.
        raise NotImplementedError
        # loss = TODO 
        # grad = TODO don't forget about the regularizer term
        # w = update w
        raise NotImplementedError

        # Report progress every 1000 iterations (needs `loss` from the TODO above).
        if iteration % 1000 == 0:
            print("Current iteration={i}, the loss={l}".format(i=iteration, l=loss))

    print("Objective = {l}".format(l=calculate_cost(y, x, w, lambda_)))

# Run the SGD demo on the loaded data; this raises until the TODO steps inside
# sgd_for_svm_demo have been implemented.
sgd_for_svm_demo(y, x)