In [2]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
from scripts.plot_histograms import *
import csv
import importlib
from scripts import proj1_helpers, implementation, helpers
import numpy as np
from matplotlib import pyplot as plt
from scripts.helpers import impute_with_mean, get_header


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Configuration


In [3]:
# Input CSV locations; relative paths, so they resolve against the kernel's
# current working directory.
train_path, test_path = (
    '../data/train.csv',
    '../data/test.csv',
)

Data import


In [4]:
# Header is read from the training file only; the same `header` is later
# passed to the plots of both the train and the test features.
header = get_header(train_path)

# Load both splits first. The first value returned for the test file is
# discarded (bound to `_`).
y, X_tr, ids_tr = proj1_helpers.load_csv_data(train_path)
_, X_te, ids_te = proj1_helpers.load_csv_data(test_path)

# Impute each split separately, train first.
# NOTE(review): impute_with_mean only receives X_te for the test split, so
# unless the helper keeps internal state the fill values cannot come from the
# training data — confirm this is intended.
X_tr_imp = impute_with_mean(X_tr)
X_te_imp = impute_with_mean(X_te)

Histograms of features


In [5]:
# Histograms of the training features as loaded, i.e. before imputation.
# shape=(6, 5) is presumably the subplot grid: 6 * 5 = 30, which matches the
# 30 iterations shown in the recorded progress bar.
do_hist(X_tr, header, shape=(6, 5))
# NOTE(review): savefig is not imported by name in this notebook; presumably it
# comes from the `scripts.plot_histograms` star import and saves the figure
# drawn by the call above — output directory/format are not visible here.
savefig('hist_tr')


100%|██████████| 30/30 [00:03<00:00,  7.31it/s]

In [6]:
# Same histogram call as for the raw training features, but on X_tr_imp
# (the result of impute_with_mean(X_tr)), so the two outputs can be compared.
do_hist(X_tr_imp, header, shape=(6, 5))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('hist_tr_imp')


100%|██████████| 30/30 [00:03<00:00,  8.37it/s]

In [7]:
# Histograms of the test features as loaded (before imputation).
# `header` was read from the training CSV; this assumes the test CSV has the
# same feature columns — TODO confirm.
do_hist(X_te, header, shape=(6, 5))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('hist_te')


100%|██████████| 30/30 [00:07<00:00,  4.95it/s]

In [8]:
# Histograms of the test features after impute_with_mean(X_te).
do_hist(X_te_imp, header, shape=(6, 5))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('hist_te_imp')


100%|██████████| 30/30 [00:06<00:00,  4.87it/s]

Boxplots of features


In [9]:
# Boxplots of the training features as loaded (before imputation).
# shape is presumably the subplot grid: 15 * 2 = 30, matching the 30
# iterations in the recorded progress bar.
do_boxplot(X_tr, header, shape=(15, 2))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('box_tr')


100%|██████████| 30/30 [00:04<00:00,  5.87it/s]

In [10]:
# Boxplots of the training features after impute_with_mean(X_tr), using the
# same grid as the raw-feature boxplots for side-by-side comparison.
do_boxplot(X_tr_imp, header, shape=(15, 2))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('box_tr_imp')


100%|██████████| 30/30 [00:05<00:00,  6.21it/s]

Target value vs features


In [11]:
# Target `y` (from the training CSV) against each training feature as loaded,
# before imputation. shape=(6, 5) is presumably the subplot grid (30 panels).
do_Xy(X_tr, y, header, shape=(6, 5))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('xy_tr')


100%|██████████| 30/30 [00:08<00:00,  4.09it/s]

In [ ]:
# Target `y` against each training feature after impute_with_mean(X_tr).
# NOTE(review): this cell has captured output but no execution count, so the
# saved output may be stale — re-run with Restart & Run All before sharing.
do_Xy(X_tr_imp, y, header, shape=(6, 5))
# Must stay after the plotting call: presumably saves the figure just drawn.
savefig('xy_tr_imp')


100%|██████████| 30/30 [00:07<00:00,  2.42it/s]

Feature vs feature plots


In [ ]:
# One histogram/scatter figure per pair yielded by get_ranges, saved under a
# name numbered in iteration order.
# NOTE(review): get_ranges receives the second-axis length of X_tr_imp and 6;
# the 6 is presumably a group size — confirm against the helper.
for i, (idx_x, idx_y) in enumerate(get_ranges(X_tr_imp.shape[1], 6)):
    do_hist_scatter(
        X_tr_imp, y, header,
        idx_x=idx_x, idx_y=idx_y, bins=15,
    )
    savefig('hist-scatter-tr-imp-%d' % i)

In [ ]: