In [2]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import pyplot as plt
import csv
import importlib
from tqdm import tqdm
from scripts import proj1_helpers, helpers
from scripts import implementation, feature_processing, k_fold, model_linear, model_logistic
import numpy as np

In [3]:
train_path = '../data/train.csv'
test_path  = '../data/test.csv'
output_path = '../data/linear_processed_submission.csv'

In [24]:
# loading data
y, X, idx = proj1_helpers.load_csv_data(train_path)
y_t, X_t, ids_t = proj1_helpers.load_csv_data(test_path)

In [76]:
X_p, (x_mean, x_std) = feature_processing.process_X(X)
X_t_p, _ = feature_processing.process_X(X_t, (x_mean, x_std))


100%|██████████| 5/5 [00:06<00:00,  1.29s/it]
100%|██████████| 5/5 [00:17<00:00,  3.05s/it]
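Note that the test matrix is transformed with the training statistics (x_mean, x_std) returned by the first call, so no test-set information leaks into the normalization. A minimal sketch of this pattern follows (the real process_X in scripts/feature_processing.py evidently also expands features, given the 131 columns reported below; standardize here is a hypothetical simplification):

import numpy as np

def standardize(X, stats=None):
    """Standardize columns; reuse the training (mean, std) for the test set."""
    if stats is None:                        # training pass: compute statistics
        mean, std = X.mean(axis=0), X.std(axis=0)
    else:                                    # test pass: reuse training statistics
        mean, std = stats
    return (X - mean) / np.where(std == 0, 1.0, std), (mean, std)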

In [77]:
# Linear model: select lambda for ridge regression by cross-validation
model = implementation.ridge_regression
loss = model_linear.compute_accuracy_loss

lambdas = np.logspace(-6, -3, 20)  # candidate regularization strengths

idx_min_l, rmse_all_l, lambdas_l = k_fold.cross_validation_select(X_p, y, model, loss, seed = 1,
                                                                  k_fold = 5, lambdas = lambdas, do_plot = True, do_tqdm = True)


100%|██████████| 20/20 [00:37<00:00,  1.90s/it]
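cross_validation_select scans the candidate lambdas and, for each one, averages the validation loss over the k folds (the real loop in scripts/k_fold.py is partially visible in the traceback further down). A stripped-down sketch with hypothetical names:

import numpy as np

def cv_select(X, y, model, loss, lambdas, k_fold=5, seed=1):
    """Return the index of the lambda with the lowest mean validation loss."""
    rng = np.random.RandomState(seed)
    folds = np.array_split(rng.permutation(len(y)), k_fold)
    mean_losses = []
    for lambda_ in lambdas:
        fold_losses = []
        for k in range(k_fold):
            val = folds[k]
            tr = np.concatenate([folds[j] for j in range(k_fold) if j != k])
            w, _ = model(y[tr], X[tr], lambda_)          # fit on the other k-1 folds
            fold_losses.append(loss(y[val], X[val], w))  # score on the held-out fold
        mean_losses.append(np.mean(fold_losses))
    return int(np.argmin(mean_losses)), mean_losses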

In [78]:
lambda_ = lambdas[idx_min_l]
te_err = np.mean(rmse_all_l[1][idx_min_l])  # validation loss at the selected lambda
tr_err = np.mean(rmse_all_l[0][idx_min_l])  # training loss at the selected lambda

print('Best lambda for linear model:',lambda_,'test error: ',te_err,'train error: ',tr_err,X_p.shape)
w, l = implementation.ridge_regression(y, X_p, lambda_ = lambda_)  # refit on the full training set with the selected lambda


Best lambda for linear model: 1.43844988829e-06 test error:  -0.793756 train error:  -0.794 (250000, 131)
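The "errors" are negative because compute_accuracy_loss evidently returns minus the classification accuracy, so that lower is better: a test error of -0.794 means roughly 79.4% of validation samples are classified correctly. For reference, ridge regression admits the closed-form solution sketched below; the 2N scaling of lambda_ is an assumption (a common convention), and implementation.ridge_regression may use a different constant:

import numpy as np

def ridge_regression_sketch(y, tx, lambda_):
    """Solve the regularized normal equations (X^T X + 2 N lambda I) w = X^T y."""
    N, D = tx.shape
    A = tx.T @ tx + 2 * N * lambda_ * np.eye(D)
    w = np.linalg.solve(A, tx.T @ y)
    mse = np.mean((y - tx @ w) ** 2) / 2   # half mean squared error
    return w, mse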

In [7]:
# Logistic model: regularized logistic regression with batched Newton steps
y_01 = helpers.y_to_01(np.array([y]).T)  # map labels from {-1, 1} to {0, 1}
np.random.seed(1)
w0 = np.zeros((X_p.shape[1], 1))  # alternative init: np.random.randn(X_p.shape[1], 1)
model = implementation.reg_logistic_regression_newton_batch
model_args = {'initial_w': w0, 'max_iters': 100, 'gamma': .5, 'debug': False, 'batch_size': 50000}
# alternative selection criterion: loss = model_logistic.reg_loss
loss = model_linear.compute_accuracy_loss  # select lambda by accuracy, as for the linear model

lambdas = np.logspace(-6, -3, 20)

idx_min, rmse_all, lambdas = k_fold.cross_validation_select(X_p, y_01, model, loss, kw_model = model_args, seed = 1,
                                                            k_fold = 3, lambdas = lambdas, do_plot = True, do_tqdm = False)


100%|██████████| 100/100 [00:06<00:00, 20.34epoch/s, acc=-0.75, diff=2.57e-06, grad=107, loss=8.24e+04, w=82.3]
100%|██████████| 100/100 [00:05<00:00, 19.98epoch/s, acc=-0.75, diff=1.04e-05, grad=69.4, loss=8.28e+04, w=72.1]
 15%|█▌        | 15/100 [00:01<00:07, 12.13epoch/s, acc=-0.75, diff=0.0245, grad=98.2, loss=8.24e+04, w=120]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-7-ffd68fd65e6a> in <module>()
     11 
     12 idx_min, rmse_all, lambdas = k_fold.cross_validation_select(X_p, y_01, model, loss, kw_model = model_args, seed = 1,
---> 13                                                             k_fold = 3, lambdas = lambdas, do_plot = True, do_tqdm = False)

~/Project1/src/scripts/k_fold.py in cross_validation_select(x, y, model, loss, kw_model, kw_loss, seed, k_fold, do_plot, do_tqdm, lambdas)
     55         for k in range(k_fold):
     56             [rmse_[i].append(x) for i, x in
---> 57              enumerate(cross_validation(y, x, k_indices, k, model, kw_model, loss, kw_loss, lambda_))]
     58         [rmse[i].append(np.mean(x)) for (i, x) in enumerate(rmse_)]
     59         [rmse_all[i].append(x) for (i, x) in enumerate(rmse_)]

~/Project1/src/scripts/k_fold.py in cross_validation(y, tx, k_indices, k, model, kw_model, loss, kw_loss, lambda_)
     31 
     32     # training ridge regression
---> 33     weights, _ = model(y_tr, x_tr, lambda_, **kw_model)
     34 
     35     # computing losses

~/Project1/src/scripts/implementation.py in reg_logistic_regression_newton_batch(y, tx, lambda_, initial_w, batch_size, max_iters, gamma, debug)
     53 def reg_logistic_regression_newton_batch(y, tx, lambda_, initial_w, batch_size, max_iters, gamma, debug = False):
     54     """ implement regularized logistic regression via gradient descent """
---> 55     losses, ws = stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma, loss_f = model_logistic.reg_loss, grad_f = model_logistic.newton_reg_grad, kwargs = {'lambda_': lambda_}, debug = debug)
     56     return get_last_ans(ws, losses)
     57 

~/Project1/src/scripts/helpers.py in stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma, loss_f, grad_f, kwargs, debug)
     76         # calculating loss and gradient
     77         loss = loss_f(y, tx, w, **kwargs)
---> 78         stoch_gradient = grad_f(y_, tx_, w, **kwargs)
     79 
     80         # updating w

~/Project1/src/scripts/model_logistic.py in newton_reg_grad(y, x, w, lambda_)
     32 def newton_reg_grad(y, x, w, lambda_):
     33     """ returns regularized newton gradient """
---> 34     return newton_grad(y, x, w, lambda_)
     35 
     36 ### SECONDARY IMPLEMENTATION

~/Project1/src/scripts/model_logistic.py in newton_grad(y, x, w, lambda_)
     24     """ returns newton gradient """
     25     N, D = x.shape
---> 26     sigma = expit(x @ w).flatten()
     27     S = diags(np.multiply(sigma, 1 - sigma))
     28     H = x.T @ S @ x

KeyboardInterrupt: 
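The interrupted model takes Newton steps on mini-batches. The traceback already shows the core of newton_grad (sigma = expit(x @ w), S = diag(sigma (1 - sigma)), H = x.T @ S @ x); below is a minimal sketch of the Newton direction it presumably returns, where the exact regularization terms are an assumption:

import numpy as np
from scipy.special import expit

def newton_direction(y, X, w, lambda_):
    """Solve H d = g for the regularized logistic loss; the update is w <- w - gamma * d."""
    sigma = expit(X @ w).flatten()
    g = X.T @ (sigma[:, None] - y) + lambda_ * w        # gradient (penalty term assumed)
    S = sigma * (1.0 - sigma)                           # diagonal of S
    H = (X.T * S) @ X + lambda_ * np.eye(X.shape[1])    # Hessian X^T S X (penalty assumed)
    return np.linalg.solve(H, g)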

In [316]:
lambda_ = lambdas[idx_min]
print(lambda_)


0.000233572146909

In [ ]:
# Full-batch variant, kept for reference (not used):
# w1, l = implementation.reg_logistic_regression(y_01, X_p, lambda_ = 0.1,
#                                                initial_w = w0, max_iters = 100, gamma = 1e-6, debug = False)

In [198]:
w1, l = implementation.reg_logistic_regression_batch(y_01, X_p, lambda_ = 0.1,
                                                     initial_w = w0, batch_size = 50000,
                                                     max_iters = 300, gamma = 1e-5, debug = False)


 83%|████████▎ | 250/300 [00:39<00:07,  6.67epoch/s, acc=-0.78, grad=869, loss=1.15e+05, w=2.54]     
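reg_logistic_regression_batch is plain mini-batch gradient descent on the same objective, used here to get a reasonable starting point for the Newton run below. A sketch of the per-batch update, again assuming the standard regularized logistic gradient:

import numpy as np
from scipy.special import expit

def logistic_gd_step(y, X, w, gamma, lambda_):
    """One gradient step on the regularized logistic loss."""
    grad = X.T @ (expit(X @ w) - y) + lambda_ * w   # penalty scaling is an assumption
    return w - gamma * grad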

In [336]:
np.random.seed(42)
w2, l = implementation.reg_logistic_regression_newton_batch(y_01, X_p, lambda_ = 0.000233572146909,  # lambda selected above
                                                     initial_w = w1, batch_size = 50000,  # warm start from the gradient-descent solution
                                                     max_iters = 100, gamma = .1, debug = False)


  0%|          | 0/100 [00:00<?, ?epoch/s]/home/sergei/Documents/repos/git/EPFL/ML/Project1/src/scripts/model_logistic.py:12: RuntimeWarning: overflow encountered in exp
  return -np.sum(np.multiply((tx @ w).flatten(), y.flatten())) + np.sum(np.log1p(np.exp(tx @ w)))
 84%|████████▍ | 84/100 [00:33<00:05,  3.00epoch/s, acc=-0.81, diff=3.42e-05, grad=7.35, loss=inf, w=122]
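The overflow warning comes from evaluating log(1 + exp(Xw)) literally, exactly as shown in the warning line; for large margins exp overflows and the reported loss becomes inf even though the gradient stays finite. np.logaddexp(0, z) computes log(1 + exp(z)) stably and is a drop-in replacement:

import numpy as np

def logistic_loss_stable(y, tx, w):
    """-y^T (Xw) + sum log(1 + exp(Xw)), computed without overflow."""
    z = (tx @ w).flatten()
    return -np.sum(z * y.flatten()) + np.sum(np.logaddexp(0.0, z))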

In [337]:
y_pred = proj1_helpers.predict_labels(w, X_t_p)  # uses the linear (ridge) weights w, matching output_path; the logistic weights are in w2

In [338]:
proj1_helpers.create_csv_submission(ids_t, y_pred, output_path)
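create_csv_submission writes one (id, prediction) row per test sample in the competition's CSV format. A hypothetical sketch using the csv module imported at the top; the exact column names are an assumption:

import csv

def create_csv_submission_sketch(ids, y_pred, path):
    """Write an Id/Prediction CSV submission file."""
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['Id', 'Prediction'])
        writer.writeheader()
        for i, p in zip(ids, y_pred):
            writer.writerow({'Id': int(i), 'Prediction': int(p)})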

In [ ]:
tqdm.get_lock().locks = []  # workaround: clear stale tqdm locks left by the interrupted run

In [253]:
np.mean(y_pred > 0)  # fraction of test samples predicted positive


Out[253]:
0.30083521341409764
