In [1]:
# Imports
%load_ext autoreload
%autoreload 2
import csv
import importlib
from scripts import proj1_helpers, helpers
from scripts import implementation, feature_processing
import numpy as np
from tqdm import tqdm

Configuration


In [2]:
train_path = '../data/train.csv'
test_path  = '../data/test.csv'
output_path = '../data/fp_submission.csv'

Loading data


In [3]:
# loading data
y, X, idx = proj1_helpers.load_csv_data(train_path)
y_t, X_t, ids_t = proj1_helpers.load_csv_data(test_path)
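
load_csv_data is one of the course-provided helpers. A minimal sketch of an
equivalent loader, assuming the standard Higgs CSV layout (an Id column, a
Prediction column with 's'/'b' labels, then the numeric features):

import numpy as np

def load_csv_data_sketch(path):
    """Hypothetical stand-in for proj1_helpers.load_csv_data."""
    # Numeric part: skip the header, drop the Id and label columns
    raw = np.genfromtxt(path, delimiter=",", skip_header=1)
    ids = raw[:, 0].astype(int)
    x = raw[:, 2:]
    # Labels: 's' (signal) -> 1, 'b' (background) -> -1
    labels = np.genfromtxt(path, delimiter=",", skip_header=1,
                           usecols=[1], dtype="U1")
    y = np.where(labels == "s", 1, -1)
    return y, x, ids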

Histograms of features

Non-mean-imputed: [feature histograms omitted from this export]

Mean-imputed: [feature histograms omitted from this export]

Imputing missing values with per-feature means


In [17]:
# Impute missing values and standardize (see the mean sanity check below)
X_p = feature_processing.process_X(X)
X_t_p = feature_processing.process_X(X_t)


100%|██████████| 5/5 [00:13<00:00,  2.58s/it]
100%|██████████| 5/5 [00:30<00:00,  5.76s/it]
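
process_X is project-specific (the progress bars above suggest it makes five
passes over the data), so the sketch below covers only the imputation step this
section describes, assuming missing entries are marked with the sentinel value
-999 as in the raw Higgs data:

import numpy as np

def impute_mean(x, sentinel=-999):
    """Replace sentinel-marked entries of each column with the mean
    of that column's observed values (a sketch, not process_X itself)."""
    x = x.copy()
    for j in range(x.shape[1]):
        col = x[:, j]
        missing = col == sentinel
        if missing.any():
            col[missing] = col[~missing].mean()
    return x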

In [23]:
# Sanity check: column means of the processed features
np.mean(X_p, axis=0)


Out[23]:
array([  1.00000000e+00,  -9.68879478e-13,   4.50019089e-15,
        -3.48448848e-15,   7.19675786e-15,  -2.72244716e-14,
        -6.28513611e-12,   6.81155987e-13,   2.16429719e-14,
         6.39742126e-15,   2.86409207e-15,  -7.00447966e-15,
         4.45924897e-15,   5.42473841e-13,  -5.96492045e-15,
         1.35646161e-16,   7.13136217e-17,   2.58030370e-14,
        -1.06327391e-16,  -1.87188487e-16,   8.24369382e-15,
         1.41040513e-16,  -9.00283004e-15,   2.88956741e-12,
        -2.76637636e-15,   2.53944285e-14,  -8.41148019e-12,
         2.10209063e-14,  -5.90545074e-15,  -8.76751116e-16,
        -1.67285563e-15,   1.45078349e-15,   1.45078349e-15,
         1.45078349e-15,   1.14287069e-15,   1.14287069e-15,
         1.14287069e-15,   1.45078349e-15,   1.45078349e-15,
         1.45078349e-15,  -3.08753160e-13,   9.20292553e-15,
        -9.52453727e-15,  -5.32868771e-15,   1.16755041e-14,
         5.76625874e-13,   3.59292840e-13,  -2.18489240e-14,
        -3.35068834e-16,   1.17201717e-14,   4.52299300e-14,
        -4.05015701e-14,  -3.80683400e-12,  -2.40004644e-15,
        -3.64775690e-14,   8.78775683e-14,   2.52964005e-15,
        -5.67012133e-14,   7.75157258e-14,   2.73154166e-16,
         8.33625171e-14,   1.74138481e-16,  -3.71875863e-16,
         1.06692571e-12,  -1.18177169e-12,  -1.76712156e-12,
        -4.10114726e-13,   1.13214393e-12,  -5.18587084e-15,
         1.14287069e-15,   1.63829839e-16,  -7.44904582e-16,
        -1.60472480e-15])
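
The column means above are at most ~1e-11 in magnitude except the first
(exactly 1), consistent with standardized features plus an all-ones bias
column (assuming process_X prepends the bias as column 0). A stricter check:

# Bias column should be all ones; remaining columns ~zero-mean
assert np.allclose(X_p[:, 0], 1.0)
assert np.allclose(np.mean(X_p[:, 1:], axis=0), 0.0, atol=1e-10)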

Fitting regularized logistic regression


In [35]:
y_01 = helpers.y_to_01(np.array([y]).T)
w0 = np.random.randn(X_p.shape[1], 1)
w, l = implementation.reg_logistic_regression(y_01, X_p, 1e-8, w0, 100, 1e-5, debug = False)


 22%|██▏       | 22/100 [00:08<00:30,  2.52epoch/s, grad=2.59e+05, loss=4.22e+05, w=7.59]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-35-308289320690> in <module>()
      1 y_01 = helpers.y_to_01(np.array([y]).T)
      2 w0 = np.random.randn(X_p.shape[1], 1)
----> 3 w, l = implementation.reg_logistic_regression(y_01, X_p, 1e-8, w0, 100, 1e-5, debug = False)

~/Documents/repos/git/EPFL/ML/Project1/src/scripts/implementation.py in reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma, debug)
     38 def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma, debug = False):
     39     """ implement regularized logistic regression via gradient descent """
---> 40     losses, ws = gradient_descent(y, tx, initial_w, max_iters, gamma, loss_f = model_logistic.reg_loss, grad_f = model_logistic.reg_grad, kwargs = {'lambda_': lambda_}, debug = debug)
     41     return get_last_ans(ws, losses)
     42 

~/Documents/repos/git/EPFL/ML/Project1/src/scripts/helpers.py in gradient_descent(y, tx, initial_w, max_iters, gamma, loss_f, grad_f, kwargs, debug)
     34       for n_iter in range(max_iters):
     35         # calculating loss and gradient
---> 36         loss = loss_f(y, tx, w, **kwargs)
     37         gradient = grad_f(y, tx, w, **kwargs)
     38 

~/Documents/repos/git/EPFL/ML/Project1/src/scripts/model_logistic.py in reg_loss(y, tx, w, lambda_)
     16 def reg_loss(y, tx, w, lambda_):
     17     """ returns regularized logistic regression loss """
---> 18     return loss(y, tx, w) + lambda_ * (w.T @ w)

~/Documents/repos/git/EPFL/ML/Project1/src/scripts/model_logistic.py in loss(y, tx, w)
      8 def loss(y, tx, w):
      9     """ returns logistic regression loss """
---> 10     return np.sum(- np.multiply(tx @ w, y) + np.log1p(np.exp(tx @ w)))
     11 
     12 def reg_grad(y, tx, w, lambda_):

KeyboardInterrupt: 
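
The interrupt landed inside model_logistic.loss. One aside on the line shown
in the traceback: np.log1p(np.exp(tx @ w)) can overflow when scores get large.
If that ever becomes a problem, np.logaddexp computes log(1 + exp(z)) without
forming exp(z); a drop-in sketch of the same loss:

import numpy as np

def loss_stable(y, tx, w):
    """Logistic loss sum(-y*z + log(1 + exp(z))) with z = tx @ w,
    using logaddexp to avoid overflow in exp(z)."""
    z = tx @ w
    return np.sum(-y * z + np.logaddexp(0, z))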

In [30]:
# Alternative baseline: closed-form ridge regression (not used for this run)
#w, l = implementation.ridge_regression(y, X_p, 1e-2)

Predicting values and saving to file


In [31]:
y_pred = proj1_helpers.predict_labels(w, X_t_p)
# Compare the positive rate on test predictions to the train positive rate
y_1_t = np.mean(X_t_p @ w > 0)
y_1 = np.mean(y > 0)
print("Positive rate on test is %.2f%% of the train positive rate" % (100. * y_1_t / y_1))


Positive rate on test is 78.18% of the train positive rate
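
predict_labels is another course helper; the check above (X_t_p @ w > 0)
mirrors what it is assumed to do, namely threshold the linear score at zero
and map to ±1 labels:

import numpy as np

def predict_labels_sketch(weights, data):
    """Hypothetical equivalent of proj1_helpers.predict_labels."""
    scores = data @ weights
    return np.where(scores > 0, 1, -1)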

In [29]:
proj1_helpers.create_csv_submission(ids_t, y_pred, output_path)
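
create_csv_submission is likewise provided by the helpers; a minimal
equivalent using the csv module imported at the top, assuming the leaderboard
expects Id,Prediction rows:

import csv

def create_csv_submission_sketch(ids, y_pred, path):
    """Write (Id, Prediction) rows; a sketch of what the helper is
    assumed to do, not the graded implementation."""
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["Id", "Prediction"])
        writer.writeheader()
        for i, p in zip(ids, y_pred):
            writer.writerow({"Id": int(i), "Prediction": int(p)})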