In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("./otto/train.csv")

In [3]:
df.head(5)


Out[3]:
id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 feat_92 feat_93 target
0 1 1 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 Class_1
1 2 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 Class_1
2 3 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 Class_1
3 4 1 0 0 1 6 1 5 0 0 ... 0 1 2 0 0 0 0 0 0 Class_1
4 5 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 1 0 0 0 Class_1

5 rows × 95 columns


In [4]:
df.values


Out[4]:
array([[1, 1, 0, ..., 0, 0, 'Class_1'],
       [2, 0, 0, ..., 0, 0, 'Class_1'],
       [3, 0, 0, ..., 0, 0, 'Class_1'],
       ..., 
       [61876, 0, 0, ..., 0, 0, 'Class_9'],
       [61877, 1, 0, ..., 10, 0, 'Class_9'],
       [61878, 0, 0, ..., 2, 0, 'Class_9']], dtype=object)

In [5]:
# copy from the pandas DataFrame into a NumPy array (dtype=object, since it mixes ints and the target strings)
X = df.values.copy()

In [6]:
X.shape


Out[6]:
(61878, 95)

In [7]:
np.random.shuffle(X)
X, labels, ids = X[:, 1:-1].astype(np.float32), X[:, -1], X[:, 0].astype(str)
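
The slicing above depends on column order: column 0 is the id, the last column is the target, and the 93 columns in between are the features. An equivalent, more explicit split straight from the DataFrame (a sketch using the column names shown in df.head() above, leaving aside the shuffle):

X = df.drop(['id', 'target'], axis=1).values.astype(np.float32)
labels = df['target'].values
ids = df['id'].astype(str).values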

In [10]:
print(X[0])
print(labels[0])
print(ids[0])


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  8.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  2.  0.]
Class_5
30399

In [11]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
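
StandardScaler standardizes each feature to zero mean and unit variance using statistics fitted on the training set. A quick sanity check, not part of the original run:

print(X.mean(axis=0)[:5])  # each entry close to 0
print(X.std(axis=0)[:5])   # each entry close to 1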

In [12]:
X.shape


Out[12]:
(61878, 93)

In [13]:
encoder = LabelEncoder()
encoder.fit(labels)
y = encoder.transform(labels).astype(np.int32)
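
LabelEncoder assigns integer indices to the class names in sorted order, so 'Class_1' maps to 0 and 'Class_9' to 8. To inspect the mapping (a quick check, not in the original run):

print(encoder.classes_)
# ['Class_1' 'Class_2' ... 'Class_9']; 'Class_5' becomes index 4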

In [15]:
y[0]


Out[15]:
4

In [16]:
y = np_utils.to_categorical(y)

In [17]:
y[0]


Out[17]:
array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.])
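
to_categorical turns each integer class index into a one-hot row, which is the target format categorical_crossentropy expects. A minimal illustration:

print(np_utils.to_categorical(np.array([0, 4, 8])))
# three 9-column rows, with the 1 at index 0, 4, and 8 respectively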

In [18]:
np.random.seed(1337) # for reproducibility

def load_data(path, train=True):
    df = pd.read_csv(path)
    X = df.values.copy()
    if train:
        np.random.shuffle(X) # https://youtu.be/uyUXoap67N8
        X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
        return X, labels
    else:
        X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
        return X, ids

def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder

def make_submission(y_prob, ids, encoder, fname):
    with open(fname, 'w') as f:
        f.write('id,')
        f.write(','.join([str(i) for i in encoder.classes_]))
        f.write('\n')
        for id_, probs in zip(ids, y_prob):
            row = ','.join([id_] + [str(p) for p in probs.tolist()])
            f.write(row)
            f.write('\n')
    print("Wrote submission to file {}.".format(fname))

In [20]:
print "Loading data..."
X, labels = load_data('./otto/train.csv', train=True)
X, scaler = preprocess_data(X)
y, encoder = preprocess_labels(labels)

X_test, ids = load_data('./otto/test.csv', train=False)
X_test, _ = preprocess_data(X_test, scaler)

nb_classes = y.shape[1]
print("{} classes".format(nb_classes))

dims = X.shape[1]
print("{} dims".format(dims))

print "Building model..."

model = Sequential()
model.add(Dense(dims, 512, init='glorot_uniform'))
model.add(PReLU((512,)))
model.add(BatchNormalization((512,)))
model.add(Dropout(0.5))

model.add(Dense(512, 512, init='glorot_uniform'))
model.add(PReLU((512,)))
model.add(BatchNormalization((512,)))
model.add(Dropout(0.5))

model.add(Dense(512, 512, init='glorot_uniform'))
model.add(PReLU((512,)))
model.add(BatchNormalization((512,)))
model.add(Dropout(0.5))

model.add(Dense(512, nb_classes, init='glorot_uniform'))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam")


Loading data...
9 classes
93 dims
Building model...
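
Note that Dense(dims, 512, ...), PReLU((512,)) and BatchNormalization((512,)) are the pre-1.0 Keras API, where every layer is given explicit shapes. In current Keras only the first layer needs an input shape and the rest is inferred; a sketch of the same architecture in the modern API (assuming Keras 2, where glorot_uniform is already the default initializer):

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, PReLU

model = Sequential()
model.add(Dense(512, input_dim=dims))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
for _ in range(2):  # two more identical 512-unit blocks
    model.add(Dense(512))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')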

In [21]:
print("Training model...")

model.fit(X, y, nb_epoch=20, batch_size=128, validation_split=0.15)


Training model...
Train on 52596 samples, validate on 9282 samples
Epoch 0
52596/52596 [==============================] - 34s - loss: 0.7832 - val. loss: 0.5979
Epoch 1
52596/52596 [==============================] - 35s - loss: 0.5975 - val. loss: 0.5683
Epoch 2
52596/52596 [==============================] - 34s - loss: 0.5755 - val. loss: 0.5551
Epoch 3
52596/52596 [==============================] - 34s - loss: 0.5567 - val. loss: 0.5456
Epoch 4
52596/52596 [==============================] - 32s - loss: 0.5439 - val. loss: 0.5386
Epoch 5
52596/52596 [==============================] - 33s - loss: 0.5341 - val. loss: 0.5334
Epoch 6
52596/52596 [==============================] - 35s - loss: 0.5270 - val. loss: 0.5268
Epoch 7
52596/52596 [==============================] - 35s - loss: 0.5188 - val. loss: 0.5238
Epoch 8
52596/52596 [==============================] - 34s - loss: 0.5134 - val. loss: 0.5189
Epoch 9
52596/52596 [==============================] - 34s - loss: 0.5057 - val. loss: 0.5142
Epoch 10
52596/52596 [==============================] - 36s - loss: 0.4980 - val. loss: 0.5133
Epoch 11
52596/52596 [==============================] - 32s - loss: 0.4941 - val. loss: 0.5093
Epoch 12
52596/52596 [==============================] - 31s - loss: 0.4896 - val. loss: 0.5100
Epoch 13
52596/52596 [==============================] - 36s - loss: 0.4841 - val. loss: 0.5090
Epoch 14
52596/52596 [==============================] - 34s - loss: 0.4805 - val. loss: 0.5043
Epoch 15
52596/52596 [==============================] - 33s - loss: 0.4755 - val. loss: 0.5048
Epoch 16
52596/52596 [==============================] - 32s - loss: 0.4731 - val. loss: 0.5015
Epoch 17
52596/52596 [==============================] - 38s - loss: 0.4698 - val. loss: 0.5002
Epoch 18
52596/52596 [==============================] - 46s - loss: 0.4656 - val. loss: 0.4978
Epoch 19
52596/52596 [==============================] - 35s - loss: 0.4627 - val. loss: 0.4976
Out[21]:
{'epoch': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19],
 'loss': [0.78324143186110273,
  0.5975361578061632,
  0.57547785434185539,
  0.55671858320393863,
  0.54390018091036152,
  0.53411056014521341,
  0.52701779974536755,
  0.51875027563143683,
  0.5133981341625069,
  0.50566420354929642,
  0.49798473720361885,
  0.49407484929212669,
  0.48963518579911541,
  0.48408449675306908,
  0.48052000884991886,
  0.47547464190846783,
  0.47305086903579713,
  0.46983600918015794,
  0.46564020178303855,
  0.46266455976967036],
 'val_loss': [0.5978717362965589,
  0.5683310884381665,
  0.5550503094872421,
  0.5456369744842431,
  0.5385627460676722,
  0.5333624760240565,
  0.5268130510326816,
  0.5237875709462401,
  0.5189273308336326,
  0.5142311434826521,
  0.5132902035145454,
  0.5093476792064725,
  0.5099738978079174,
  0.5089585815657851,
  0.5043363281220193,
  0.5048342262500515,
  0.5014564160347906,
  0.5001752814280698,
  0.4978143185932504,
  0.4976140490873994]}
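
In this Keras version fit() returns the per-epoch losses as a plain dict (newer versions return a History object instead). Capturing that dict makes it easy to plot training against validation loss and spot overfitting; a sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

hist = model.fit(X, y, nb_epoch=20, batch_size=128, validation_split=0.15)
plt.plot(hist['epoch'], hist['loss'], label='training loss')
plt.plot(hist['epoch'], hist['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.ylabel('categorical cross-entropy')
plt.legend()
plt.show()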

In [24]:
print("Generating submission...")

proba = model.predict_proba(X_test)
make_submission(proba, ids, encoder, fname='./otto/keras-otto.csv')


Generating submission...
144368/144368 [==============================] - 37s    
Wrote submission to file ./otto/keras-otto.csv.
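
Since the output layer is a softmax, every row of proba is a probability distribution over the nine classes. A quick sanity check before submitting, not part of the original run:

print(proba.shape)                                     # (144368, 9)
print(np.allclose(proba.sum(axis=1), 1.0, atol=1e-4))  # rows sum to ~1: True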

In [26]:
df = pd.read_csv("./otto/keras-otto.csv")

In [27]:
df.head(3)


Out[27]:
id Class_1 Class_2 Class_3 Class_4 Class_5 Class_6 Class_7 Class_8 Class_9
0 1 0.000309 0.127067 0.175108 0.692129 0.000169 0.000182 0.004699 0.000051 0.000287
1 2 0.002957 0.000693 0.000222 0.000061 0.000049 0.679892 0.000779 0.311487 0.003860
2 3 0.000009 0.000005 0.000003 0.000007 0.000002 0.999866 0.000017 0.000067 0.000021