In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
plt.style.use('fivethirtyeight')
# plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Helvetica'
plt.rcParams['font.monospace'] = 'Consolas'
plt.rcParams['font.size'] = 16
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14
plt.rcParams['legend.fontsize'] = 16
plt.rcParams['figure.titlesize'] = 20
plt.rcParams['lines.linewidth'] = 2
%matplotlib inline
# for auto-reloading external modules
%load_ext autoreload
%autoreload 2
In [2]:
# Set up the file directory and names
DIR = '../input/'
X_TRAIN = DIR + 'train-images-idx3-ubyte.pkl'
Y_TRAIN = DIR + 'train-labels-idx1-ubyte.pkl'
X_TEST = DIR + 't10k-images-idx3-ubyte.pkl'
Y_TEST = DIR + 't10k-labels-idx1-ubyte.pkl'
print('Loading pickle files')
X_train = pickle.load( open( X_TRAIN, "rb" ) )
y_train = pickle.load( open( Y_TRAIN, "rb" ) )
X_test = pickle.load( open( X_TEST, "rb" ) )
y_test = pickle.load( open( Y_TEST, "rb" ) )
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print('Train images shape {}, labels shape {}'.format(X_train.shape, y_train.shape))
print('Test images shape {}, labels shape {}'.format(X_test.shape, y_test.shape))
In [3]:
# Check a few training values at random as a sanity check
def show_label_images(X, y):
'''Shows random images in a grid'''
num = 9
images = np.random.randint(0, X.shape[0], num)
print('Showing training image indexes {}'.format(images))
fig, axes = plt.subplots(3,3, figsize=(6,6))
for idx, val in enumerate(images):
r, c = divmod(idx, 3)
axes[r][c].imshow(X[images[idx]])
axes[r][c].annotate('Label: {}'.format(y[val]), xy=(1, 1))
axes[r][c].xaxis.set_visible(False)
axes[r][c].yaxis.set_visible(False)
show_label_images(X_train, y_train)
In [4]:
# Now do the same for the training dataset
show_label_images(X_test, y_test)
In [5]:
# # Training label distribution
y_train_df = pd.DataFrame(y_train, columns=['class'])
y_train_df.plot.hist(legend=False)
hist_df = pd.DataFrame(y_train_df['class'].value_counts(normalize=True))
hist_df.index.name = 'class'
hist_df.columns = ['train']
The class distribution is pretty evenly split between the classes. 1 is the most popular class with 11.24% of instances, and at the other end 5 is the least frequent class, with 9.04% of instances
In [6]:
# Test label distribution
y_test_df = pd.DataFrame(y_test, columns=['class'])
y_test_df.plot.hist(legend=False, bins=10)
test_counts = y_test_df['class'].value_counts(normalize=True)
hist_df['test'] = test_counts
The distribution looks very similar between training and test datasets.
In [7]:
hist_df['diff'] = np.abs(hist_df['train'] - hist_df['test'])
hist_df.sort_values('diff', ascending=False)['diff'].plot.bar()
Out[7]:
The largest difference is 0.0040% in the number 2 class.
In [8]:
# Final quick check of datatypes
assert X_train.dtype == np.uint8
assert y_train.dtype == np.uint8
assert X_test.dtype == np.uint8
assert y_test.dtype == np.uint8
In [ ]: