In [1]:
import pandas as pd
import cv2
import os
import numpy as np
from tqdm import tqdm
import gc
from glob import glob
from sklearn.metrics import fbeta_score

# Keras libraries 
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization


Using TensorFlow backend.

In [2]:
df_train = pd.read_csv('data/train_v2.csv')

In [3]:
# referred to https://www.kaggle.com/anokas/simple-keras-starter for help reading data and setting up basic Keras model
x = []
x_test = []
y = []


flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

for f, tags in tqdm(df_train.values, miniters=1000):
    img = cv2.imread('data/train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x.append(cv2.resize(img, (32, 32)))
    y.append(targets)

print(labels)

100%|██████████| 40479/40479 [00:55<00:00, 733.80it/s]
['haze', 'cultivation', 'partly_cloudy', 'blow_down', 'primary', 'water', 'agriculture', 'clear', 'cloudy', 'bare_ground', 'conventional_mine', 'selective_logging', 'road', 'blooming', 'slash_burn', 'habitation', 'artisinal_mine']


In [4]:
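# Store labels as uint8 and images as float16 scaled to [0, 1] to keep memory usage down.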
y = np.array(y, np.uint8)
x = np.array(x, np.float16) / 255.

In [5]:
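# Simple holdout split: the first 35,000 images for training, the remaining 5,479 for validation (no shuffling).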
split = 35000
x_train, x_valid, y_train, y_valid = x[:split], x[split:], y[:split], y[split:]

In [7]:
# Random search over the number of convolutional filters and the dropout rate

best_f = -1
best_d = -1
best_thresh = 0
best_F1 = -1 

num_experiments = 3

for i in range(num_experiments):
    fSize = np.random.choice((32, 64))
    dProb = np.random.uniform(low = 0.1, high = 0.8)
    
    #Model set up and fitting
    model = Sequential()
    model.add(BatchNormalization(input_shape=(32, 32, 3)))
    model.add(Conv2D(fSize, kernel_size=(3, 3), activation='relu'))
    model.add(Conv2D(fSize, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(dProb))

    model.add(Conv2D(fSize, kernel_size=(3, 3), activation='relu'))
    model.add(Conv2D(fSize, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Dropout(dProb))

    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dProb))
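    # 17 independent sigmoid outputs, one per label; together with binary_crossentropy
    # this treats the task as 17 parallel binary (multi-label) classifications.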
    model.add(Dense(17, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=['accuracy'])
    
    model.fit(x_train, y_train,
          batch_size=128,
          epochs=4,
          verbose=1,
          validation_data=(x_valid, y_valid))
    # Grid search for the best prediction threshold on the training data
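    # Note: fbeta_score is called with beta=2 below, so despite the "F1" variable names
    # the metric computed here is the F2 score (the competition metric).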
    p_train = model.predict(x_train, batch_size=128)

    best_F1_train = -1
    for t in np.arange(.1, .3, .01):
        F1 = fbeta_score(y_train, np.array(p_train) > t, beta=2, average='samples')
        if F1 > best_F1_train:
            thresh = t
            best_F1_train = F1

    p_valid = model.predict(x_valid, batch_size=128)
    
    F1 = fbeta_score(y_valid, np.array(p_valid) > thresh, beta=2, average='samples')
    print("Current results", "F1: " , F1, "threshold: ", thresh , " Dropout: ", dProb, " Filter Size: ", fSize)
    if F1 > best_F1:
        best_f = fSize
        best_d = dProb
        best_thresh = thresh
        best_F1 = F1
        print("New best F1 found with", "F1: " , best_F1, "threshold: ", best_thresh , " Dropout: ", best_d, " Filter Size: ", best_f)

#('New best F1 found with', 'F1: ', 0.86129869286543659, 'threshold: ', 0.17999999999999997, ' Dropout: ', 0.3045845721625034, ' Filter Size: ', 64)


Train on 35000 samples, validate on 5479 samples
Epoch 1/4
35000/35000 [==============================] - 59s - loss: 0.3223 - acc: 0.8711 - val_loss: 0.2175 - val_acc: 0.9149
Epoch 2/4
35000/35000 [==============================] - 58s - loss: 0.2187 - acc: 0.9189 - val_loss: 0.1803 - val_acc: 0.9312
Epoch 3/4
35000/35000 [==============================] - 59s - loss: 0.1947 - acc: 0.9270 - val_loss: 0.1653 - val_acc: 0.9368
Epoch 4/4
35000/35000 [==============================] - 59s - loss: 0.1821 - acc: 0.9312 - val_loss: 0.1578 - val_acc: 0.9390
Current results F1:  0.855523095539 threshold:  0.21  Dropout:  0.4371361076045778  Filter Size:  32
New best F1 found with F1:  0.855523095539 threshold:  0.21  Dropout:  0.4371361076045778  Filter Size:  32
Train on 35000 samples, validate on 5479 samples
Epoch 1/4
35000/35000 [==============================] - 122s - loss: 0.2827 - acc: 0.8915 - val_loss: 0.2124 - val_acc: 0.9176
Epoch 2/4
35000/35000 [==============================] - 129s - loss: 0.1927 - acc: 0.9274 - val_loss: 0.1602 - val_acc: 0.9385
Epoch 3/4
35000/35000 [==============================] - 160s - loss: 0.1735 - acc: 0.9340 - val_loss: 0.1531 - val_acc: 0.9399
Epoch 4/4
35000/35000 [==============================] - 169s - loss: 0.1635 - acc: 0.9372 - val_loss: 0.1431 - val_acc: 0.9446
Current results F1:  0.870570300658 threshold:  0.2  Dropout:  0.21118580899363046  Filter Size:  64
New best F1 found with F1:  0.870570300658 threshold:  0.2  Dropout:  0.21118580899363046  Filter Size:  64
Train on 35000 samples, validate on 5479 samples
Epoch 1/4
35000/35000 [==============================] - 169s - loss: 0.2802 - acc: 0.8901 - val_loss: 0.2113 - val_acc: 0.9169
Epoch 2/4
35000/35000 [==============================] - 170s - loss: 0.1926 - acc: 0.9269 - val_loss: 0.1696 - val_acc: 0.9338
Epoch 3/4
35000/35000 [==============================] - 168s - loss: 0.1716 - acc: 0.9342 - val_loss: 0.1519 - val_acc: 0.9418
Epoch 4/4
35000/35000 [==============================] - 168s - loss: 0.1587 - acc: 0.9392 - val_loss: 0.1452 - val_acc: 0.9443
Current results F1:  0.861160390213 threshold:  0.17  Dropout:  0.13962758329904215  Filter Size:  64
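
The x_test list created in cell [3] is never filled; a natural next step (not part of the original run) is sketched below: load the test images, predict with the tuned model, and threshold at best_thresh to build a submission. The test CSV and image directory names are assumptions based on the competition's usual file layout, and `model` is assumed to have been retrained with the best filter count and dropout found above.

In [ ]:
# Sketch only. Assumptions: 'data/sample_submission_v2.csv' and 'data/test-jpg/' follow
# the standard competition layout, and `model` has been retrained with best_f filters
# and best_d dropout before predicting.
df_test = pd.read_csv('data/sample_submission_v2.csv')

x_test = []
for f, _ in tqdm(df_test.values, miniters=1000):
    img = cv2.imread('data/test-jpg/{}.jpg'.format(f))
    x_test.append(cv2.resize(img, (32, 32)))
x_test = np.array(x_test, np.float16) / 255.

# Predict label probabilities, keep every label above the tuned threshold,
# and map indices back to tag names for the submission file.
p_test = model.predict(x_test, batch_size=128)
preds = [' '.join(inv_label_map[i] for i, v in enumerate(row) if v > best_thresh)
         for row in p_test]

df_test['tags'] = preds
df_test.to_csv('submission.csv', index=False)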

In [ ]: