In [56]:
    
import cvxpy as cp
import numpy as np
    
In [58]:
    
# naive version; np.exp can overflow for large positive arguments
def l_w(w, y, X):
    return np.sum(np.log(1 + np.exp(-y * (X @ w))))
# an alternative (from Grant Baker's code): a numerically stable log(1 + exp(a))
def logOnePlusExp(a):
    if a > 30:
        # for large a, np.exp(a) overflows, but log(1 + exp(a)) = a + log(1 + exp(-a))
        return a + np.log1p(np.exp(-a))
    elif a < -3:
        return np.log1p(np.exp(a))
    else:
        return np.log(1 + np.exp(a))
# this stable version shadows the naive l_w defined above
def l_w(w, y, X):
    return np.sum(np.vectorize(logOnePlusExp)(-y * (X @ w)))
    
def sigma(a):
    return 1/(1 + np.exp(-a))
def make_mu(y, w, X):
    return sigma(y * (w @ X.T))
def grad_l(X, y, w):
    mu = make_mu(y, w, X)
    return -X.T @ (y * (1 - mu)).flatten()
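For reference, l_w above implements the logistic-regression negative log-likelihood for labels $y_i \in \{-1, +1\}$, $\ell(w) = \sum_i \log\big(1 + \exp(-y_i x_i^\top w)\big)$, and grad_l implements its gradient $\nabla \ell(w) = -\sum_i y_i (1 - \mu_i)\, x_i = -X^\top \big(y \odot (1 - \mu)\big)$, where $\mu_i = \sigma(y_i x_i^\top w)$.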
    
In [59]:
    
# There's not much point in running this, since it's much slower than the vectorized grad_l above,
# but it was easier to reason about and helped confirm that grad_l is implemented correctly
def slow_grad_l(X, y, w):
    i = 0
    running = sigma(-y[i] * (w @ X[i, ])) * y[i] * X[i, ]
    for i in range(1, y.shape[0]):
        running += sigma(-y[i] * (w @ X[i, ])) * y[i] * X[i, ]
    
    return -running
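A quick way to confirm the two implementations agree (not part of the original run, just a sketch on arbitrary random data):

rng = np.random.default_rng(0)
Xr = rng.normal(size=(10, 3))
yr = rng.choice([-1, 1], size=10)
wr = rng.normal(size=3)
# the vectorized and loop-based gradients should match to floating-point precision
print(np.allclose(grad_l(Xr, yr, wr), slow_grad_l(Xr, yr, wr)))  # expect True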
    
In [60]:
    
def finite_diff(f, grad_f, x_0, h=1e-6):
    # returns the mean (signed) gap between the forward-difference gradient and grad_f at x_0
    g = []
    
    for i in range(len(x_0)):
        h_i = np.zeros(len(x_0))
        h_i[i] = h
        
        g.append((f(x_0 + h_i) - f(x_0)) / h)
    
    g = np.array(g)
    
    return np.mean(g - grad_f(x_0))
def finite_diff_centered(f, grad_f, x_0, h=1e-6):
    # same check with centered differences, which are accurate to O(h^2) rather than O(h)
    g = []
    
    for i in range(len(x_0)):
        h_i = np.zeros(len(x_0))
        h_i[i] = h
        
        g.append((f(x_0 + h_i) - f(x_0 - h_i)) / (2*h))
    
    g = np.array(g)
    
    return np.mean(g - grad_f(x_0))
def taylor_remainder_test(f, grad_f, x_0, delta=1, h=1e-6):
    # if grad_f is correct, first_order shrinks like O(h) and second_order like O(h^2)
    x_1 = x_0 + delta
    first_order = np.abs(f(x_0) - f(x_0 + h*x_1)).mean()
    second_order = np.abs(f(x_0) + np.dot(grad_f(x_0), h*x_1) - f(x_0 + h*x_1)).mean()
    
    return first_order, second_order
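All three checks compare the analytic gradient against function values: the forward difference is accurate to $O(h)$, the centered difference to $O(h^2)$, and in the Taylor remainder test the first-order remainder should shrink like $O(h)$ while the second-order remainder (which uses grad_f) should shrink like $O(h^2)$ whenever the gradient is correct.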
    
Just running a few toy examples to make sure it works.
In [61]:
    
finite_diff(lambda x: 0.5*np.linalg.norm(x)**2, lambda x: x, x_0 = np.array([1, 2, 3, 4, 5, 700]))
    
    Out[61]:
In [62]:
    
finite_diff_centered(lambda x: 0.5*np.linalg.norm(x)**2, lambda x: x, x_0 = np.array([1, 2, 3, 4, 5, 700]))
    
    Out[62]:
In [63]:
    
taylor_remainder_test(lambda x: 0.5*np.linalg.norm(x)**2, lambda x: x, x_0 = np.array([1, 2, 3, 4, 5, 700]))
    
    Out[63]:
In [64]:
    
def gradient_check(f, grad_f, x_0):
    for i in range(12):
        h = 10**(-i)
        
        result = finite_diff(f, grad_f, x_0, h = h)
        result2 = finite_diff_centered(f, grad_f, x_0, h=h)
        
        print(h, result, result2)
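When this runs below, the forward-difference error should shrink roughly in proportion to $h$ until floating-point cancellation takes over (around $h \approx 10^{-8}$ in double precision), while the centered-difference error should be smaller and shrink roughly like $h^2$.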
    
In [65]:
    
print('h, fd, fdc')
gradient_check(lambda x: 0.5*np.linalg.norm(x)**2, lambda x: x, x_0 = np.array([2, 3, 4, 5, 6, 7, 8]))
    
    
In [66]:
    
def armijo_linesearch(f, grad_f, x_k, t=0.1):
    p_k = -grad_f(x_k)  # for gradient descent, the search direction is the negative gradient
    rho = 0.9           # backtracking factor
    c = 1e-4            # sufficient-decrease constant
    
    # shrink t until the Armijo (sufficient decrease) condition holds
    while (f(x_k + t*p_k) > f(x_k) + c*t*np.dot(grad_f(x_k), p_k)).all():
        t = rho*t
    
    return t
def gradient_descent(f, grad_f, x_0=0, threshold=1e-4, max_iter=1e10, calc_progress=False):
    x = x_0
    grad = grad_f(x)
    t = 0.1
    i = 0
    
    progress = []
    progress_test = []
    
    while np.linalg.norm(grad) > threshold:
        t = armijo_linesearch(f, grad_f, x, t*2)
        
        x = x - t*grad
        grad = grad_f(x)
        i += 1
        
        # This relies on the globals Xtrain, Ytrain, Xtest, Ytest defined further down in
        # the notebook to record train/test accuracy during the spam run... Super hacky!
        if i % 20 == 0 and calc_progress:
            # predict +1 when sigma(Xw) >= 0.5 and -1 otherwise, then compute accuracy
            score = (np.where(sigma(a=Xtrain @ x) >= 0.5, 1, -1) == Ytrain).sum() / Xtrain.shape[0]
            testscore = (np.where(sigma(a=Xtest @ x) >= 0.5, 1, -1) == Ytest).sum() / Xtest.shape[0]
            progress.append(score)
            progress_test.append(testscore)
        
        if i > max_iter:
            print("Max iterations reached.")
            return x, progress, progress_test
    
    print("Total iterations:", i)
    return x, progress, progress_test
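The backtracking line search above shrinks the step by $\rho = 0.9$ until the Armijo sufficient-decrease condition $f(x_k + t\,p_k) \le f(x_k) + c\,t\,\nabla f(x_k)^\top p_k$ holds, with $c = 10^{-4}$ and descent direction $p_k = -\nabla f(x_k)$; gradient_descent warm-starts each search at twice the previously accepted step size and stops once $\|\nabla f(x)\|$ falls below the threshold or max_iter is exceeded.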
    
In [67]:
    
# toy example of this running
gradient_descent(lambda x: (x + 2)**2, lambda x: 2*x + 4, x_0 = np.array([5, 6]))
    
    
    Out[67]:
In [68]:
    
y = np.random.choice([-1, 1], size=100)
    
In [69]:
    
A = np.array([[1, 0, 0, 0],
              [0, 1, 0, 0],
              [0, 0, 1, 0],
              [0, 0, 0, 1],
              [1, 1, 1, 1]])
b = np.array([5, 6, 7, 8, 9])
ground_truth = np.linalg.inv(A.T @ A) @ A.T @ b
ground_truth
    
    Out[69]:
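The same least-squares solution can be cross-checked without explicitly forming the normal equations; this is just an optional sketch:

# optional cross-check of the normal-equations solution above
ground_truth_lstsq, *_ = np.linalg.lstsq(A, b, rcond=None)
print(np.allclose(ground_truth, ground_truth_lstsq))  # expect True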
In [70]:
    
f = lambda x: np.linalg.norm(A@x - b)**2
grad_f = lambda x: 2*A.T@A@x - 2*A.T@b  # gradient of ||Ax - b||^2
x_0 = np.array([1, 2, 3, 4])
               
gradient_descent(f, grad_f, x_0 = x_0, max_iter=1000, threshold=1e-100)
    
    
    Out[70]:
In [71]:
    
errs = []
for i in np.arange(0, 200, 20):
    x_i = gradient_descent(f, grad_f, x_0=x_0, max_iter=i, threshold=1e-100)[0]
    errs.append(np.linalg.norm(x_i - ground_truth))
    
    
In [72]:
    
import matplotlib.pyplot as plt
%matplotlib inline
plt.xlabel('Iterations')
plt.ylabel('Log error')
plt.suptitle('Log error of gradient descent given maximum iterations')
plt.plot(20 * np.arange(10), np.log(errs), marker='o')
    
    
    Out[72]:
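Since this objective is a strongly convex quadratic, gradient descent converges linearly, so the log-error curve should fall roughly along a straight line as the iteration budget grows.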
    
In [73]:
    
# logistic regression
import scipy.io
import pandas as pd
spam_data = scipy.io.loadmat('../data/spamData.mat')
    
In [74]:
    
Xtrain = np.log(spam_data['Xtrain'] + 0.1)
Ytrain = spam_data['ytrain'].flatten()
Xtest = np.log(spam_data['Xtest'] + 0.1)
Ytest = spam_data['ytest'].flatten()
Ytrain = Ytrain.astype('int8')
Ytest = Ytest.astype('int8')
Ytrain = Ytrain*2 - 1   # map labels {0, 1} -> {-1, +1}
Ytest = Ytest*2 - 1
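The features are log-transformed with a 0.1 offset and the 0/1 labels are mapped to ±1; a quick optional check that the label mapping worked:

# optional sanity check: labels should now only take the values -1 and +1
print(np.unique(Ytrain), np.unique(Ytest))  # expect [-1  1] for both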
    
In [75]:
    
Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape
    
    Out[75]:
In [76]:
    
# Bake ("curry") the other variables in, since the solver as I wrote it expects a function
# of a single variable
l_w_spam_train = lambda w: l_w(X=Xtrain, y=Ytrain, w=w)
grad_l_spam_train = lambda w: grad_l(X=Xtrain, y=Ytrain, w=w)
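Before the Taylor remainder check below, the finite-difference sweep could also be run on the spam objective (optional, and a bit slow since every function evaluation touches the whole training set):

# optional: finite-difference sweep on the spam training loss at w = 0
gradient_check(l_w_spam_train, grad_l_spam_train, x_0=np.zeros(57))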
    
In [77]:
    
taylor_remainder_test(l_w_spam_train, grad_l_spam_train, x_0 = np.zeros(57))
    
    Out[77]:
In [78]:
    
found_w, progress, progress_test = gradient_descent(l_w_spam_train, grad_l_spam_train, x_0=np.zeros(57), max_iter=1000, calc_progress=True)
    
    
    
In [79]:
    
plt.plot(np.arange(len(progress))*20, 1 - np.array(progress), label='Train')
plt.plot(np.arange(len(progress_test))*20, 1 - np.array(progress_test), label='Test')
plt.xlabel('Iterations')
plt.ylabel('Misclassification rate')
plt.suptitle('Spam classification: misclassification rates by iteration')
plt.legend()
    
    Out[79]: