In [4]:
# Imports
import csv
import importlib
from scripts import proj1_helpers, helpers
from scripts import implementation, feature_processing
import numpy as np

## Configuration


In [5]:
# Input CSVs (train/test) and the path where the submission file is written.
# Paths are relative to the notebook's directory.
train_path = '../data/train.csv'
test_path  = '../data/test.csv'
output_path = '../data/impute_submission.csv'

## Loading data


In [3]:
# Load labels (y), feature matrix (X), and event ids for both splits.
# NOTE(review): `y_t` and `idx` are not used later in this notebook — presumably
# the test labels are placeholders; confirm against proj1_helpers.load_csv_data.
y, X, idx = proj1_helpers.load_csv_data(train_path)
y_t, X_t, ids_t = proj1_helpers.load_csv_data(test_path)

## Percent of missing values per feature


In [6]:
# Fraction of -999. sentinel (missing) entries in each feature column.
(X == -999.).mean(axis=0)


Out[6]:
array([ 0.152456,  0.      ,  0.      ,  0.      ,  0.709828,  0.709828,
        0.709828,  0.      ,  0.      ,  0.      ,  0.      ,  0.      ,
        0.709828,  0.      ,  0.      ,  0.      ,  0.      ,  0.      ,
        0.      ,  0.      ,  0.      ,  0.      ,  0.      ,  0.399652,
        0.399652,  0.399652,  0.709828,  0.709828,  0.709828,  0.      ])

## Imputing missing values with the mean of each feature


In [7]:
# Replace the -999. sentinels in every column with that column's mean.
n_features = X.shape[1]
X = feature_processing.impute_with_mean(X, range(n_features))

## Standardizing data and adding a constant feature


In [9]:
# Standardize features, then build the design matrix `tx`; the empty degree
# list means add_polynomial only prepends the constant (bias) feature.
X_std, _, _ = feature_processing.standardize(X)
tx = feature_processing.add_polynomial(X_std, [])

## Fitting least squares


In [10]:
# Fit ordinary least squares; returns the weight vector and the training loss.
# Renamed `l` -> `loss`: `l` is an ambiguous single-letter name (PEP 8 / E741)
# and is not referenced anywhere else in the notebook.
w, loss = implementation.least_squares(y, tx)

## Test data: imputation with mean, standardizing, adding constant


In [12]:
# Apply the same preprocessing pipeline to the test set.
# Bug fix: the column range was taken from the *train* matrix (`X.shape[1]`);
# it must come from the test matrix itself. The two happen to share a column
# count here, but the original would break if the splits ever differed.
X_t = feature_processing.impute_with_mean(X_t, range(X_t.shape[1]))
# NOTE(review): the test set is imputed and standardized with its *own*
# statistics rather than the train-set mean/std — confirm whether
# feature_processing supports reusing train statistics to avoid a
# train/test preprocessing mismatch.
X_ts, _, _ = feature_processing.standardize(X_t)
tx_t = feature_processing.add_polynomial(X_ts, [])

Predicting values and saving to file


In [13]:
# Predict labels for the processed test features with the fitted weights and
# write the (id, prediction) submission CSV to output_path.
y_pred = proj1_helpers.predict_labels(w, tx_t)
proj1_helpers.create_csv_submission(ids_t, y_pred, output_path)

In [ ]: