In [4]:
# Imports
import csv
import importlib
from scripts import proj1_helpers, helpers
from scripts import implementation, feature_processing
import numpy as np

## Configuration


In [5]:
# Input CSVs (train/test) and the path where the submission file is written.
# Paths are relative to the notebook's directory.
train_path = '../data/train.csv'
test_path  = '../data/test.csv'
output_path = '../data/impute_submission.csv'

## Loading data


In [3]:
# Load labels (y), feature matrix (X), and event ids for both splits.
# NOTE(review): `y_t` and `idx` are not used later in this notebook — presumably
# the test labels are placeholders; confirm against proj1_helpers.load_csv_data.
y, X, idx = proj1_helpers.load_csv_data(train_path)
y_t, X_t, ids_t = proj1_helpers.load_csv_data(test_path)

## Percent of missing values per feature


In [6]:
# Fraction of -999. sentinel (missing) entries in each feature column.
(X == -999.).mean(axis=0)


Out[6]:
array([ 0.152456,  0.      ,  0.      ,  0.      ,  0.709828,  0.709828,
        0.709828,  0.      ,  0.      ,  0.      ,  0.      ,  0.      ,
        0.709828,  0.      ,  0.      ,  0.      ,  0.      ,  0.      ,
        0.      ,  0.      ,  0.      ,  0.      ,  0.      ,  0.399652,
        0.399652,  0.399652,  0.709828,  0.709828,  0.709828,  0.      ])

## Imputing missing values with the mean of each feature


In [7]:
# Replace the -999. sentinels in every column with that column's mean.
n_features = X.shape[1]
X = feature_processing.impute_with_mean(X, range(n_features))

## Standardizing data and adding a constant feature


In [9]:
# Standardize features, then build the design matrix `tx`; the empty degree
# list means add_polynomial only prepends the constant (bias) feature.
X_std, _, _ = feature_processing.standardize(X)
tx = feature_processing.add_polynomial(X_std, [])

## Fitting least squares


In [10]:
# Fit ordinary least squares; returns the weight vector and the training loss.
# Renamed `l` -> `loss`: `l` is an ambiguous single-letter name (PEP 8 / E741)
# and is not referenced anywhere else in the notebook.
w, loss = implementation.least_squares(y, tx)

## Test data: imputation with mean, standardizing, adding constant


In [12]:
# Apply the same preprocessing pipeline to the test set.
# Bug fix: the column range was taken from the *train* matrix (`X.shape[1]`);
# it must come from the test matrix itself. The two happen to share a column
# count here, but the original would break if the splits ever differed.
X_t = feature_processing.impute_with_mean(X_t, range(X_t.shape[1]))
# NOTE(review): the test set is imputed and standardized with its *own*
# statistics rather than the train-set mean/std — confirm whether
# feature_processing supports reusing train statistics to avoid a
# train/test preprocessing mismatch.
X_ts, _, _ = feature_processing.standardize(X_t)
tx_t = feature_processing.add_polynomial(X_ts, [])

Predicting values and saving to file


In [13]:
# Predict labels for the processed test features with the fitted weights and
# write the (id, prediction) submission CSV to output_path.
y_pred = proj1_helpers.predict_labels(w, tx_t)
proj1_helpers.create_csv_submission(ids_t, y_pred, output_path)

In [ ]: