In [ ]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
This tutorial is based on an EPFL machine learning course. Load the dataset: we will use the CERN dataset from a previous EPFL machine learning challenge. You can download the data here: https://inclass.kaggle.com/c/epfml-project-1/data
In [ ]:
from proj1_helpers import load_csv_data
# Path to the training CSV; download it first from the Kaggle link above.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here
# y: labels, x: feature matrix (num_examples x num_features), ids: event ids.
y, x, ids = load_csv_data(DATA_TRAIN_PATH)
# TODO: convert labels to -1,1 ?
## Note: This is the raw dataset, you can also work with your modified features if you prefer
In [ ]:
def calculate_cost(y, x, w, lambda_):
    """Compute the full cost (the primal objective), that is loss plus regularizer.

    The SVM primal objective is a SUM (not an average) of hinge losses:
        sum_n max(0, 1 - y_n * x_n @ w) + (lambda_ / 2) * ||w||^2

    Args:
        y: labels in {-1, +1}, shape (num_examples,).
        x: full dataset matrix, shape (num_examples, num_features).
        w: weight vector, shape (num_features,) or (num_features, 1).
        lambda_: regularization strength.

    Returns:
        Scalar value of the primal objective.
    """
    w_flat = np.ravel(w)  # accept (D,) or (D, 1) weights uniformly
    margins = np.ravel(y) * (x @ w_flat)
    hinge = np.clip(1.0 - margins, 0.0, None)  # elementwise max(0, 1 - margin)
    return np.sum(hinge) + (lambda_ / 2.0) * np.sum(w_flat ** 2)
Compute the (stochastic) subgradient for the n-th summand of the SVM optimization objective.
In [ ]:
def calculate_gradient(y, x, w, lambda_, n):
    """Compute the stochastic (sub)gradient of loss plus regularizer.

    Uses the n-th summand of the SVM objective. Because the complete
    objective is a SUM over all N examples (not an average), the hinge
    subgradient of one sample is scaled by N so that the stochastic
    gradient is an unbiased estimate of the full gradient.

    Args:
        y: labels in {-1, +1}, shape (num_examples,).
        x: full dataset matrix, shape (num_examples, num_features).
        w: weight vector, shape (num_features,) or (num_features, 1).
        lambda_: regularization strength.
        n: index of the sampled data point.

    Returns:
        Subgradient with the same shape as ``w``.
    """
    w_flat = np.ravel(w)
    x_n, y_n = x[n], y[n]
    num_examples = y.shape[0]
    margin = y_n * (x_n @ w_flat)
    if margin < 1:
        # Inside the margin: hinge term is active, subgradient is -y_n * x_n.
        subgrad = -y_n * x_n
    else:
        # Outside the margin: hinge term is flat, subgradient is zero.
        subgrad = np.zeros_like(w_flat)
    grad = num_examples * subgrad + lambda_ * w_flat
    return grad.reshape(np.shape(w))
Implement stochastic gradient descent: pick a data point uniformly at random and update w based on the gradient for the n-th summand of the objective.
In [ ]:
def sgd_for_svm_demo(y, x):
    """Classify the data by SGD for SVM, printing the objective periodically.

    Args:
        y: labels in {-1, +1}, shape (num_examples,).
        x: full dataset matrix, shape (num_examples, num_features).
    """
    max_iter = 10000
    gamma = 0.001  # Step-size
    lambda_ = 1.0 / y.shape[0]  # or set to a different value, try cross-validation!
    num_examples = y.shape[0]
    w = np.zeros((x.shape[1], 1))
    # 'it' instead of 'iter' to avoid shadowing the builtin iter().
    for it in range(max_iter):
        # Sample one data point uniformly at random from x.
        n = np.random.randint(0, num_examples)
        # Stochastic subgradient of the n-th summand, regularizer included.
        grad = calculate_gradient(y, x, w, lambda_, n)
        w = w - gamma * grad
        if it % 1000 == 0:
            # Evaluate the full objective only when printing: it costs O(N*D).
            loss = calculate_cost(y, x, w, lambda_)
            print("Current iteration={i}, the loss={l}".format(i=it, l=loss))
    print("Objective = {l}".format(l=calculate_cost(y, x, w, lambda_)))
sgd_for_svm_demo(y, x)