Data exploration


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Choose a short run of consecutive pixel columns to preview.
# The flat index 13 * 28 + 7 corresponds to row 13, column 7 if the pixels
# are stored row-major in a 28-wide image -- TODO confirm against the dataset.
strip_row, strip_first_col, strip_length = 13, 7, 14
pixel_start = strip_row * 28 + strip_first_col
pixel_end = pixel_start + strip_length
# Column names follow the 'pixel<N>' pattern, N being the flat pixel index.
pixel_columns = ['pixel{}'.format(index) for index in range(pixel_start, pixel_end)]
pixel_columns


Out[2]:
['pixel371',
 'pixel372',
 'pixel373',
 'pixel374',
 'pixel375',
 'pixel376',
 'pixel377',
 'pixel378',
 'pixel379',
 'pixel380',
 'pixel381',
 'pixel382',
 'pixel383',
 'pixel384']

In [3]:
# Read train.csv, forcing every column to uint8.
train = pd.read_csv('../data/train.csv', dtype=np.uint8)
# Preview the first rows, restricted to the label and the selected pixel strip.
train.head()[['label'] + pixel_columns]


Out[3]:
label pixel371 pixel372 pixel373 pixel374 pixel375 pixel376 pixel377 pixel378 pixel379 pixel380 pixel381 pixel382 pixel383 pixel384
0 1 0 0 0 0 0 27 253 253 254 13 0 0 0 0
1 0 254 186 7 0 0 0 0 0 0 0 0 0 166 254
2 1 0 0 0 0 0 4 146 254 184 0 0 0 0 0
3 4 0 222 153 5 0 0 0 0 0 0 0 120 240 13
4 0 253 253 99 0 0 0 0 0 0 0 0 0 25 231

In [4]:
# Read test.csv, forcing every column to uint8.
test = pd.read_csv('../data/test.csv', dtype=np.uint8)
# Preview the same pixel strip for the first rows (no 'label' column is selected here).
test.head()[pixel_columns]


Out[4]:
pixel371 pixel372 pixel373 pixel374 pixel375 pixel376 pixel377 pixel378 pixel379 pixel380 pixel381 pixel382 pixel383 pixel384
0 0 0 0 0 0 0 0 0 0 0 0 205 253 253
1 250 137 83 70 0 0 0 0 0 0 0 0 0 0
2 0 0 133 217 0 15 58 140 189 181 227 24 0 0
3 0 14 147 72 0 0 0 0 0 0 40 243 253 253
4 0 0 0 0 0 45 204 253 253 254 178 0 0 0

In [5]:
# Group the training rows by the 'label' column; the groupby object is kept
# in a variable because the plotting cell pulls per-label samples from it.
train_grouped = train.groupby(by='label')
# Number of training rows per label.
train_grouped.size()


Out[5]:
label
0    4132
1    4684
2    4177
3    4351
4    4072
5    3795
6    4137
7    4401
8    4063
9    4188
dtype: int64

In [6]:
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

% matplotlib inline

# Grid layout: one row per label value, one column per example image.
NUM_LABELS = 10
NUM_IMAGES_PER_GROUP = 7
# Each sample's pixel columns are reshaped into a square image of this shape.
IMAGE_SHAPE = (28, 28)

fig = plt.figure(figsize=(10, 10))
gs = gridspec.GridSpec(NUM_LABELS, NUM_IMAGES_PER_GROUP)

for label in range(NUM_LABELS):
    # Take the first few rows of this label's group and drop the label column
    # by name, so only pixel values remain regardless of column order.
    examples = (
        train_grouped.get_group(label)
        .head(NUM_IMAGES_PER_GROUP)
        .drop('label', axis=1)
    )
    for i, row in enumerate(examples.values):
        img_data = row.reshape(IMAGE_SHAPE)
        # Add the axes to `fig` explicitly instead of relying on pyplot's
        # implicit "current figure".
        ax = fig.add_subplot(gs[label, i])
        ax.imshow(img_data, cmap='gray')
        ax.axis('off')