Convert Input Files

It's much faster to load the training and test data from NumPy native files than from CSV. Load the CSV files and write them back out as a single compressed .npz archive.


In [1]:
from __future__ import print_function
import numpy as np

Load raw CSVs


In [2]:
# Parse every comma-separated input file, in a fixed order, and bind each
# resulting array to its dataset name. The private ("comp") split has
# features only, so there is no matching Y file for it.
X_train, Y_train, X_test, Y_test, X_comp = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (
        "X_train_public.csv",
        "Y_train_public.csv",
        "X_test_public.csv",
        "Y_test_public.csv",
        "X_test_private.csv",
    )
)

Check Input Ranges


In [3]:
# Print the smallest and largest value of each loaded array (one line per
# array) so the value ranges can be inspected before choosing a storage dtype.
loaded_arrays = (X_train, Y_train, X_test, Y_test, X_comp)
for data in loaded_arrays:
    lowest, highest = data.min(), data.max()
    print(lowest, highest)


-32768.0 32767.0
0.0 1.0
-32768.0 32767.0
0.0 1.0
-32768.0 32767.0

Convert to int for Efficiency


In [4]:
def _as_int16_lossless(values, label):
    """Cast ``values`` to int16, refusing to continue if any value changes.

    A plain ``np.int16(...)`` cast of floating-point data silently truncates
    fractional values and gives meaningless results for NaN or out-of-range
    input. Comparing the converted array with the input catches all of these
    cases in a single check.

    Parameters
    ----------
    values : array-like
        Numeric data to convert.
    label : str
        Name used in the error message to identify the offending array.

    Returns
    -------
    The int16 version of ``values``.

    Raises
    ------
    ValueError
        If the int16 result is not element-wise equal to ``values``.
    """
    converted = np.int16(values)
    # NaN never compares equal, truncated fractions differ from the input, and
    # wrapped out-of-range values differ too, so one equality test suffices.
    if not np.array_equal(converted, values):
        raise ValueError(
            "{} cannot be stored as int16 without loss "
            "(NaN, fractional, or out-of-range values present)".format(label))
    return converted


# Re-running this cell is safe: int16 input round-trips unchanged.
X_train = _as_int16_lossless(X_train, "X_train")
Y_train = _as_int16_lossless(Y_train, "Y_train")
X_test = _as_int16_lossless(X_test, "X_test")
Y_test = _as_int16_lossless(Y_test, "Y_test")
X_comp = _as_int16_lossless(X_comp, "X_comp")

In [5]:
# Print min and max of every array again after the int16 conversion; the
# values should match the pre-conversion ranges, now shown as integers.
converted_arrays = (X_train, Y_train, X_test, Y_test, X_comp)
for data in converted_arrays:
    lowest, highest = data.min(), data.max()
    print(lowest, highest)


-32768 32767
0 1
-32768 32767
0 1
-32768 32767

Save to Binary Compressed File


In [6]:
# Gather the arrays under the keys they will have inside the archive, then
# write them all to one compressed .npz file.
archive_members = {
    "X_train": X_train,
    "Y_train": Y_train,
    "X_test": X_test,
    "Y_test": Y_test,
    "X_comp": X_comp,
}
np.savez_compressed("data_files.npz", **archive_members)