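Loading the training and test data from NumPy's native binary format is much faster than parsing CSV. This notebook reads the CSV files, casts the arrays to `int16`, and writes them back out as a single compressed `.npz` archive (`data_files.npz`).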
In [1]:
from __future__ import print_function
import numpy as np
In [2]:
def _load_csv(csv_path):
    """Parse one comma-separated text file into a NumPy array."""
    return np.genfromtxt(csv_path, delimiter=',')


# Public training / test splits (features X, targets Y).
X_train = _load_csv("X_train_public.csv")
Y_train = _load_csv("Y_train_public.csv")
X_test = _load_csv("X_test_public.csv")
Y_test = _load_csv("Y_test_public.csv")

# Private test features; no matching Y file is loaded for this split.
X_comp = _load_csv("X_test_private.csv")
In [3]:
# Show the smallest and largest value of each loaded array, one line per
# array, in the order: X_train, Y_train, X_test, Y_test, X_comp.
for loaded in (X_train, Y_train, X_test, Y_test, X_comp):
    lowest, highest = loaded.min(), loaded.max()
    print(lowest, highest)
In [4]:
def _to_int16(values, label):
    """Cast ``values`` to ``np.int16`` after checking the cast is well defined.

    A float -> int16 cast silently yields meaningless numbers when the input
    contains NaN/inf (``np.genfromtxt`` fills unparseable or missing fields
    with NaN) or values outside the int16 range, so both cases are rejected
    up front instead of being written into the output archive.

    Parameters
    ----------
    values : array_like
        Numeric data to convert.
    label : str
        Name of the array, used only in error messages.

    Returns
    -------
    The result of ``np.int16(values)``.

    Raises
    ------
    ValueError
        If ``values`` contains a non-finite entry or an entry outside the
        representable int16 range.
    """
    values = np.asarray(values)
    if values.size == 0:
        # Nothing to validate; min()/max() would raise on an empty array.
        return np.int16(values)

    if not np.isfinite(values).all():
        raise ValueError(
            "%s contains NaN or infinite values and cannot be cast to int16"
            % label)

    limits = np.iinfo(np.int16)
    lowest, highest = values.min(), values.max()
    if lowest < limits.min or highest > limits.max:
        raise ValueError(
            "%s has values in [%s, %s], outside the int16 range [%d, %d]"
            % (label, lowest, highest, limits.min, limits.max))

    # Fractional parts (if any) are truncated, exactly as in a plain cast.
    return np.int16(values)


X_train = _to_int16(X_train, "X_train")
Y_train = _to_int16(Y_train, "Y_train")
X_test = _to_int16(X_test, "X_test")
Y_test = _to_int16(Y_test, "Y_test")
X_comp = _to_int16(X_comp, "X_comp")
In [5]:
# Print min and max of every array again so the values can be compared with
# the printout produced before the int16 conversion.
for converted in (X_train, Y_train, X_test, Y_test, X_comp):
    print(np.min(converted), np.max(converted))
In [6]:
# Bundle all five arrays into one compressed archive; each keyword becomes
# the key under which the array is stored in the .npz file.
np.savez_compressed("data_files.npz", **{
    "X_train": X_train,
    "Y_train": Y_train,
    "X_test": X_test,
    "Y_test": Y_test,
    "X_comp": X_comp,
})