In [1]:
# Imports
%load_ext autoreload
%autoreload 2
import csv
import importlib
from scripts import proj1_helpers, helpers
from scripts import implementation, feature_processing
import numpy as np
import importlib
from tqdm import tqdm
Configuration
In [2]:
# Input datasets, given relative to the notebook's working directory.
train_path, test_path = '../data/train.csv', '../data/test.csv'

# Destination of the submission file produced by the last cell.
output_path = '../data/fp_submission.csv'
Loading data
In [3]:
# Load the training and test sets from the CSV paths set in the configuration cell.
# load_csv_data returns three values per file, unpacked here as
# (labels, feature matrix, ids) -- TODO confirm the order against proj1_helpers.
y, X, idx = proj1_helpers.load_csv_data(train_path)
# NOTE(review): y_t is not read by any later cell visible in this notebook;
# presumably the test file carries placeholder labels -- verify.
y_t, X_t, ids_t = proj1_helpers.load_csv_data(test_path)
Histograms of features
Non-mean-imputed:
Mean-imputed:
Imputing missing values with the per-feature mean
In [17]:
# Run the same feature-processing routine on the training and the test matrix.
# NOTE(review): the two sets are processed independently. If process_X derives
# statistics (e.g. imputation means) from its argument, the test set is filled
# with its own statistics rather than the training ones -- confirm this is intended.
X_p = feature_processing.process_X(X)
X_t_p = feature_processing.process_X(X_t)
In [23]:
# Sanity check: column-wise mean of the processed training matrix, shown as the cell output.
X_p.mean(axis=0)
Out[23]:
Fitting regularized logistic regression
In [35]:
# Fixed seed so the random initial weights -- and therefore the fitted model and
# the submission file -- are identical on every Restart & Run All.
SEED = 0

# Labels reshaped to a column (for 1-D y: (N,) -> (N, 1)) and passed through
# helpers.y_to_01 (presumably a {-1, 1} -> {0, 1} remap for the logistic loss -- TODO confirm).
y_01 = helpers.y_to_01(np.array([y]).T)

# One initial weight per processed feature column, drawn from a standard normal
# with a private generator so NumPy's global random state is left untouched.
w0 = np.random.RandomState(SEED).randn(X_p.shape[1], 1)

# NOTE(review): the positional values 1e-8, 100 and 1e-5 are presumably the
# regularisation strength, iteration count and step size -- verify against
# implementation.reg_logistic_regression. The call returns two values, kept as w and l.
w, l = implementation.reg_logistic_regression(y_01, X_p, 1e-8, w0, 100, 1e-5, debug = False)
In [30]:
# Disabled alternative: fit with ridge regression on the raw labels y instead of
# the logistic model. The last argument (1e-2) is presumably the regularisation
# strength -- TODO confirm. Uncommenting this line overwrites w and l.
#w, l = implementation.ridge_regression(y, X_p, 1e-2)
Predicting values and saving to file
In [31]:
# Labels for the processed test matrix, as produced by the project's predict_labels helper.
y_pred = proj1_helpers.predict_labels(w, X_t_p)

# Sanity check on class balance: share of test rows whose linear score X_t_p @ w
# is positive, compared with the share of training labels that are positive.
y_1_t = (X_t_p @ w > 0).mean()
y_1 = (y > 0).mean()
print("Ones in test are %.2f%% of train" % (100. * y_1_t / y_1))
In [29]:
# Save the test ids together with the predicted labels to output_path
# (the submission CSV configured at the top of the notebook).
proj1_helpers.create_csv_submission(ids_t, y_pred, output_path)