In [1]:
import pandas as pd
import tensorflow as tf
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import train_test_split
import numpy as np


/opt/conda/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
df = pd.read_json('../input/train.json')
df.inc_angle = df.inc_angle.replace('na', 0)
df.inc_angle = df.inc_angle.astype(float).fillna(0.0)

In [3]:
x_band1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df["band_1"]])
x_band2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df["band_2"]])
X_train = np.concatenate([x_band1[:, :, :, np.newaxis]
                          , x_band2[:, :, :, np.newaxis]
                         , ((x_band1+x_band1)/2)[:, :, :, np.newaxis]], axis=-1)
X_angle_train = np.array(df.inc_angle)
y_train = np.array(df["is_iceberg"])

In [4]:
# just take 100 dataset to do optimization
# we assume this 100 able to generalize the whole dataset
# if not enough, increase the number
X_train = X_train[:100]
y_train = y_train[:100]
X_angle_train = X_angle_train[:100].reshape((-1, 1))
y_onehot = np.zeros((y_train.shape[0], np.unique(y_train).shape[0]))
for i in range(y_train.shape[0]):
    y_onehot[i, y_train[i]] = 1.0
    
x_train, x_test, y_train, y_test, x_train_angle, x_test_angle = train_test_split(X_train, y_onehot, X_angle_train, test_size = 0.20)

In [5]:
def neural_network(fully_conn_size, # wide size of fully connected layer
                   len_layer_conv, # each conv layer included max pooling
                   kernel_size, # kernel size for conv process
                   learning_rate, # learning rate
                   pooling_size, # kernel and stride size for pooling
                   multiply, # constant to multiply for conv output
                   dropout_rate, # dropout
                   beta, # l2 norm discount
                   activation,
                   batch_normalization, 
                   batch_size = 20):
    
    tf.reset_default_graph()
    if activation == 0:
        activation = tf.nn.sigmoid
    elif activation == 1:
        activation = tf.nn.tanh
    else:
        activation = tf.nn.relu
    
    def conv_layer(x, conv, out_shape):
        w = tf.Variable(tf.truncated_normal([conv, conv, int(x.shape[3]), out_shape]))
        b = tf.Variable(tf.truncated_normal([out_shape], stddev = 0.01))
        return tf.nn.conv2d(x, w, [1, 1, 1, 1], padding = 'SAME') + b
    
    def fully_connected(x, out_shape):
        w = tf.Variable(tf.truncated_normal([int(x.shape[1]), out_shape]))
        b = tf.Variable(tf.truncated_normal([out_shape], stddev = 0.01))
        return tf.matmul(x, w) + b
    
    def pooling(x, k = 2, stride = 2):
        return tf.nn.max_pool(x, ksize = [1, k, k, 1], 
                              strides = [1, stride, stride, 1], 
                              padding = 'SAME')
    
    X_img = tf.placeholder(tf.float32, (None, 75, 75, 3))
    X_angle = tf.placeholder(tf.float32, (None, 1))
    Y = tf.placeholder(tf.float32, (None, y_onehot.shape[1]))
    # for batch normalization
    train = tf.placeholder(tf.bool)
    
    for i in range(len_layer_conv):
        if i == 0:
            conv = activation(conv_layer(X_img, kernel_size, 
                                         int(np.around(int(X_img.shape[3]) * multiply))))
        else:
            conv = activation(conv_layer(conv, kernel_size, 
                                         int(np.around(int(conv.shape[3]) * multiply))))
        conv = pooling(conv, k = pooling_size, stride = pooling_size)
        if batch_normalization:
            conv = tf.layers.batch_normalization(conv, training = train)
        conv = tf.nn.dropout(conv, dropout_rate)
    print(conv.shape)
    output_shape = int(conv.shape[1]) * int(conv.shape[2]) * int(conv.shape[3])
    conv = tf.reshape(conv, [-1, output_shape])
    conv = tf.concat([conv, X_angle], axis = 1)
    
    # our fully connected got 1 layer
    # you can increase it if you want
    for i in range(1):
        if i == 0:
            fc = activation(fully_connected(conv, fully_conn_size))
        else:
            fc = activation(fully_connected(fc, fully_conn_size))
        fc = tf.nn.dropout(fc, dropout_rate)
        if batch_normalization:
            fc = tf.layers.batch_normalization(fc, training = train)
            
    logits = fully_connected(fc, y_onehot.shape[1])
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = Y, 
                                                                  logits = logits))
    cost += sum(beta * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    COST, TEST_COST, ACC, TEST_ACC = [], [], [], []
    
    for i in range(20):
        train_acc, train_loss = 0, 0
        for n in range(0, (X_train.shape[0] // batch_size) * batch_size, batch_size):
            _, loss = sess.run([optimizer, cost], 
                               feed_dict = {X_img: x_train[n: n + batch_size, :, :, :],
                                            X_angle: x_train_angle[n: n + batch_size],
                                            Y: y_train[n: n + batch_size, :], 
                                            train: True})
            train_acc += sess.run(accuracy, 
                                  feed_dict = {X_img: x_train[n: n + batch_size, :, :, :],
                                               X_angle: x_train_angle[n: n + batch_size],
                                               Y: y_train[n: n + batch_size, :], 
                                               train: False})
            train_loss += loss
        results = sess.run([cost, accuracy], 
                           feed_dict = {X_img: x_test,
                                        X_angle: x_test_angle,
                                        Y: y_test, 
                                        train: False})
        TEST_COST.append(results[0])
        TEST_ACC.append(results[1])
        train_loss /= (x_train.shape[0] // batch_size)
        train_acc /= (x_train.shape[0] // batch_size)
        ACC.append(train_acc)
        COST.append(train_loss)
    COST = np.array(COST).mean()
    TEST_COST = np.array(TEST_COST).mean()
    ACC = np.array(ACC).mean()
    TEST_ACC = np.array(TEST_ACC).mean()
    return COST, TEST_COST, ACC, TEST_ACC

In [6]:
def generate_nn(fully_conn_size, len_layer_conv, kernel_size, learning_rate, pooling_size, multiply,
                dropout_rate, beta, activation, batch_normalization):
    global accbest
    param = {
        'fully_conn_size' : int(np.around(fully_conn_size)),
        'len_layer_conv' : int(np.around(len_layer_conv)),
        'kernel_size': int(np.around(kernel_size)),
        'learning_rate' : max(min(learning_rate, 1), 0.0001),
        'pooling_size': int(np.around(pooling_size)),
        'multiply': multiply,
        'dropout_rate' : max(min(dropout_rate, 0.99), 0),
        'beta' : max(min(beta, 0.5), 0.000001),
        'activation': int(np.around(activation)),
        'batch_normalization' : int(np.around(batch_normalization))
    }
    learning_cost, valid_cost, learning_acc, valid_acc = neural_network(**param)
    print("stop after 20 iteration with train cost %f, valid cost %f, train acc %f, valid acc %f" % (learning_cost, valid_cost, learning_acc, valid_acc))
    # a very simple benchmark, just use correct accuracy
    # if you want to change to f1 score or anything else, can
    if (valid_acc > accbest):
        costbest = valid_acc
    return valid_acc

In [7]:
accbest = 0.0
NN_BAYESIAN = BayesianOptimization(generate_nn, 
                              {'fully_conn_size': (16, 128),
                               'len_layer_conv': (3, 5),
                               'kernel_size': (2, 7),
                               'learning_rate': (0.0001, 1),
                               'pooling_size': (2, 4),
                               'multiply': (1, 3),
                               'dropout_rate': (0.1, 0.99),
                               'beta': (0.000001, 0.49),
                               'activation': (0, 2),
                               'batch_normalization': (0, 1)
                              })
NN_BAYESIAN.maximize(init_points = 10, n_iter = 10, acq = 'ei', xi = 0.0)


Initialization
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Step |   Time |      Value |   activation |   batch_normalization |      beta |   dropout_rate |   fully_conn_size |   kernel_size |   learning_rate |   len_layer_conv |   multiply |   pooling_size | 
(?, 2, 2, 3)
stop after 20 iteration with train cost nan, valid cost 12.430384, train acc nan, valid acc 0.600000
    1 | 00m11s |    0.60000 |       0.7829 |                0.4417 |    0.4673 |         0.7876 |           45.7452 |        5.7427 |          0.5704 |           3.1465 |     1.0553 |         3.8126 | 
(?, 3, 3, 36)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
    2 | 00m18s |    0.40000 |       1.2282 |                0.8450 |    0.0673 |         0.2630 |           16.0480 |        6.4503 |          0.4779 |           3.2515 |     2.2198 |         2.7465 | 
(?, 1, 1, 221)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
    3 | 00m16s |    0.40000 |       0.1120 |                0.6980 |    0.3254 |         0.4960 |           34.5135 |        5.2485 |          0.7449 |           4.3817 |     2.9078 |         3.7427 | 
(?, 1, 1, 143)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
    4 | 00m28s |    0.40000 |       0.9064 |                0.5437 |    0.2239 |         0.1009 |          110.0785 |        6.6746 |          0.1047 |           4.1076 |     2.5975 |         2.8028 | 
(?, 1, 1, 48)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
    5 | 00m03s |    0.40000 |       1.7474 |                0.0067 |    0.0440 |         0.8174 |          116.6944 |        2.3211 |          0.9795 |           3.9498 |     2.0026 |         3.7250 | 
(?, 1, 1, 3)
stop after 20 iteration with train cost nan, valid cost 11.912154, train acc nan, valid acc 0.600000
    6 | 00m05s |    0.60000 |       0.7277 |                0.3909 |    0.2616 |         0.6071 |           91.6583 |        2.8332 |          0.2810 |           3.8482 |     1.1559 |         3.0747 | 
(?, 5, 5, 173)
stop after 20 iteration with train cost nan, valid cost 55609.699219, train acc nan, valid acc 0.592500
    7 | 00m31s |    0.59250 |       0.7879 |                0.1073 |    0.4592 |         0.2502 |           67.7937 |        4.7486 |          0.2690 |           4.3562 |     2.7959 |         2.1530 | 
(?, 1, 1, 3)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
    8 | 00m16s |    0.40000 |       1.9890 |                0.2757 |    0.0399 |         0.2489 |           80.9954 |        6.6404 |          0.4286 |           3.5234 |     1.0856 |         3.4482 | 
(?, 1, 1, 89)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
    9 | 00m06s |    0.40000 |       1.2205 |                0.9488 |    0.4696 |         0.4479 |           37.9253 |        2.3696 |          0.5134 |           3.5527 |     2.3449 |         3.9133 | 
(?, 1, 1, 89)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   10 | 00m16s |    0.40000 |       0.0551 |                0.5532 |    0.4881 |         0.4577 |           66.5911 |        4.6565 |          0.6994 |           3.8198 |     2.3507 |         2.6258 | 
Bayesian Optimization
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Step |   Time |      Value |   activation |   batch_normalization |      beta |   dropout_rate |   fully_conn_size |   kernel_size |   learning_rate |   len_layer_conv |   multiply |   pooling_size | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   11 | 02m09s |    0.40000 |       2.0000 |                0.0000 |    0.4900 |         0.1000 |           73.3309 |        2.0000 |          0.0001 |           5.0000 |     3.0000 |         2.0000 | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   12 | 03m35s |    0.40000 |       2.0000 |                0.0000 |    0.0000 |         0.1000 |           68.2941 |        7.0000 |          0.0001 |           5.0000 |     3.0000 |         2.0000 | 
(?, 2, 2, 81)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   13 | 00m58s |    0.40000 |       2.0000 |                0.0000 |    0.0000 |         0.9900 |           97.3657 |        7.0000 |          1.0000 |           3.0000 |     3.0000 |         4.0000 | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   14 | 03m40s |    0.40000 |       0.0000 |                1.0000 |    0.4900 |         0.1000 |          128.0000 |        7.0000 |          0.0001 |           5.0000 |     3.0000 |         2.0000 | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   15 | 01m06s |    0.40000 |       2.0000 |                0.0000 |    0.4900 |         0.1000 |           51.1080 |        2.0000 |          0.0001 |           5.0000 |     3.0000 |         2.0000 | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   16 | 01m05s |    0.40000 |       2.0000 |                0.0000 |    0.4900 |         0.1000 |           67.1000 |        2.0000 |          0.0001 |           5.0000 |     3.0000 |         2.0000 | 
(?, 10, 10, 81)
stop after 20 iteration with train cost nan, valid cost 120134.359375, train acc nan, valid acc 0.490000
   17 | 02m17s |    0.49000 |       0.0000 |                0.0000 |    0.4900 |         0.1000 |           70.7609 |        4.6815 |          0.0001 |           3.0000 |     3.0000 |         2.0000 | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost 120.540894, train acc nan, valid acc 0.590000
   18 | 05m01s |    0.59000 |       0.0000 |                0.0000 |    0.4900 |         0.9900 |           88.2054 |        7.0000 |          1.0000 |           5.0000 |     3.0000 |         2.0000 | 
(?, 3, 3, 729)
stop after 20 iteration with train cost nan, valid cost nan, train acc nan, valid acc 0.400000
   19 | 05m01s |    0.40000 |       2.0000 |                0.0000 |    0.4900 |         0.1000 |           43.3221 |        7.0000 |          0.0001 |           5.0000 |     3.0000 |         2.0000 | 
(?, 2, 2, 3)
stop after 20 iteration with train cost nan, valid cost 0.734473, train acc nan, valid acc 0.600000
   20 | 02m08s |    0.60000 |       0.0000 |                0.0000 |    0.4900 |         0.9900 |           24.3525 |        2.0000 |          1.0000 |           3.0000 |     1.0000 |         4.0000 | 

In [8]:
print('Maximum NN accuracy value: %f' % NN_BAYESIAN.res['max']['max_val'])
print('Best NN parameters: ', NN_BAYESIAN.res['max']['max_params'])


Maximum NN accuracy value: 0.600000
Best NN parameters:  {'fully_conn_size': 45.745233745490474, 'len_layer_conv': 3.1465432539809788, 'kernel_size': 5.7426796270733682, 'learning_rate': 0.57043391227406293, 'pooling_size': 3.8125736787655145, 'multiply': 1.0552589410978261, 'dropout_rate': 0.78762606131084778, 'beta': 0.46732759735648771, 'activation': 0.78293147230772098, 'batch_normalization': 0.44170041567714113}

The accuracy is low because our cnn model is very complex. the purpose is, bayesian still able to find the best maxima in non convex hyper-parameters function without do any derivation


In [9]: