In [103]:
# Imports: stdlib, then third-party. Seeds are set immediately after each
# library import for reproducibility (numpy before Keras, TF graph-level seed).
import datetime

import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)  # TF1-style graph-level seed

import pandas as pd
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

In [3]:
# Load the competition data; the 'ID' column becomes the row index so it
# survives all later joins/concats and can be written back into submissions.
train = pd.read_csv("data/train.csv", index_col="ID")
test = pd.read_csv("data/test.csv", index_col="ID")

Combine train and test data before one-hot encoding, so both frames end up with an identical set of dummy columns.


In [4]:
# Tag every row with its origin, then one-hot encode the categorical columns
# on the *combined* frame so train and test receive identical dummy columns.
categorical = ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]

train["data"] = "train"
test["data"] = "test"

combined_data = pd.concat([train, test])
encoded = pd.get_dummies(combined_data[categorical])
drop_cat = combined_data.drop(categorical, axis=1)
combined_data_clean = drop_cat.join(encoded)

In [5]:
# Split the encoded frame back into train/test via the origin tag added above,
# dropping the tag column (and, for test, the target column 'y', which is
# all-NaN there since test.csv has no 'y'). drop() returns a new frame, so no
# .copy()/inplace dance is needed.
origin = combined_data_clean["data"]
train_data = combined_data_clean[origin == "train"].drop("data", axis=1)
test_data = combined_data_clean[origin == "test"].drop(["data", "y"], axis=1)

In [6]:
train_data.columns


Out[6]:
Index(['X10', 'X100', 'X101', 'X102', 'X103', 'X104', 'X105', 'X106', 'X107',
       'X108',
       ...
       'X8_p', 'X8_q', 'X8_r', 'X8_s', 'X8_t', 'X8_u', 'X8_v', 'X8_w', 'X8_x',
       'X8_y'],
      dtype='object', length=580)

In [7]:
test_data.columns


Out[7]:
Index(['X10', 'X100', 'X101', 'X102', 'X103', 'X104', 'X105', 'X106', 'X107',
       'X108',
       ...
       'X8_p', 'X8_q', 'X8_r', 'X8_s', 'X8_t', 'X8_u', 'X8_v', 'X8_w', 'X8_x',
       'X8_y'],
      dtype='object', length=579)

In [8]:
# Separate the target from the features and cast everything to float32,
# the dtype Keras/TensorFlow expect.
y_train = train_data["y"].astype(np.float32)
x_train =  train_data.drop("y", axis=1).astype(np.float32)
x_test = test_data.astype(np.float32)

In [9]:
x_train.shape


Out[9]:
(4209, 579)

In [61]:
# Turn each 579-feature row into a (timesteps, 1) sequence for the RNN:
# (4209, 579) -> (4209, 579, 1). Vectorized reshape replaces the original
# per-row Python loop (identical result, no intermediate object array).
train_reshaped = x_train.values.astype(np.float32).reshape(len(x_train), -1, 1)
train_reshaped.shape


Out[61]:
(4209, 579, 1)

Adjust for multicollinearity

  • Idea: treat the features as sequences; however, feeding the raw 0/1 data directly makes training produce NaNs easily — probably due to multicollinearity
  • Therefore either take VIF or PCA to adjust for it

The VIF alternative (not run here — O(p) regressions over 579 columns is slow):

    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])]
    vif["features"] = x_train.columns


In [88]:
# Reduce the 579 one-hot features to 128 orthogonal components to remove the
# multicollinearity suspected of destabilizing RNN training on the raw 0/1 data
# (see the note above). n_comp=128 is ad hoc — TODO: tune via
# pca.explained_variance_ratio_.
n_comp = 128
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)

In [89]:
# Each 128-component PCA row becomes a (128, 1) sequence: (4209, 128, 1).
# Vectorized reshape replaces the original per-row Python loop (same result).
# NOTE: this rebinds train_reshaped, superseding the raw-feature version above.
train_reshaped = pca2_results_train.astype(np.float32).reshape(len(pca2_results_train), -1, 1)
train_reshaped.shape


Out[89]:
(4209, 128, 1)

Train the model


In [122]:
# Idea: Simple model
# Three stacked bidirectional RNNs funnel 128 -> 64 -> 32 units; the last RNN
# collapses the sequence (return_sequences=False) and a linear Dense head
# regresses the scalar target. input_shape=(None, 1) accepts any sequence length.
model = Sequential([
    Bidirectional(SimpleRNN(128, return_sequences=True, activation="relu"),
                  input_shape=(None, 1)),
    Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")),
    Bidirectional(SimpleRNN(32, return_sequences=False, activation="relu")),
    Dropout(0.5),
    Dense(1, activation="linear"),
])
model.compile(optimizer="rmsprop", loss="mse")
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
bidirectional_40 (Bidirectio (None, None, 256)         33280     
_________________________________________________________________
bidirectional_41 (Bidirectio (None, None, 128)         41088     
_________________________________________________________________
bidirectional_42 (Bidirectio (None, 64)                10304     
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 65        
=================================================================
Total params: 84,737
Trainable params: 84,737
Non-trainable params: 0
_________________________________________________________________

In [105]:
# Idea: Funnel -> reduce information after each layer / deep model
# Deeper alternative: pairs of bidirectional RNNs with TimeDistributed Dense
# "bottlenecks" (32, then 16 units) squeezing the representation between them.
# NOTE(review): the execution counts show the simpler model above (In [122])
# was compiled *after* this one, so that is the model actually trained in the
# fit cell below — confirm which architecture the reported results belong to.
model = Sequential()
model.add(Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu"), input_shape=(None, 1)))
model.add(Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")))
model.add(TimeDistributed(Dense(32, activation="relu")))
model.add(Bidirectional(SimpleRNN(32, return_sequences=True, activation="relu")))
model.add(Bidirectional(SimpleRNN(32, return_sequences=True, activation="relu")))
model.add(TimeDistributed(Dense(16, activation="relu")))
model.add(Bidirectional(SimpleRNN(16, return_sequences=False, activation="relu")))
model.add(Dropout(0.5))
model.add(Dense(1, activation="linear"))
model.compile(optimizer="rmsprop", loss="mse")
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
bidirectional_33 (Bidirectio (None, None, 128)         8448      
_________________________________________________________________
bidirectional_34 (Bidirectio (None, None, 128)         24704     
_________________________________________________________________
time_distributed_7 (TimeDist (None, None, 32)          4128      
_________________________________________________________________
bidirectional_35 (Bidirectio (None, None, 64)          4160      
_________________________________________________________________
bidirectional_36 (Bidirectio (None, None, 64)          6208      
_________________________________________________________________
time_distributed_8 (TimeDist (None, None, 16)          1040      
_________________________________________________________________
bidirectional_37 (Bidirectio (None, 32)                1056      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 33        
=================================================================
Total params: 49,777
Trainable params: 49,777
Non-trainable params: 0
_________________________________________________________________

In [123]:
# Stop when *training* loss plateaus for 10 epochs.
# NOTE(review): monitor="loss" (training loss), not "val_loss" — confirm that
# is intended; with the noisy 85-sample validation set it may well be.
early_stop = EarlyStopping(monitor="loss", patience=10)
# Checkpoint file names embed the epoch and validation loss.
file_path = "weights.{epoch:02d}-{val_loss:.2f}.hdf5"
# NOTE(review): save_best_only is not set, so weights are written after every
# epoch — one .hdf5 file per epoch.
checkpoint = ModelCheckpoint(file_path)
# 2% validation split = 85 of 4209 samples, so val_loss is very noisy.
model_run = model.fit(train_reshaped, y_train, epochs=100 ,validation_split=0.02, callbacks=[early_stop, checkpoint])


Train on 4124 samples, validate on 85 samples
Epoch 1/100
4124/4124 [==============================] - 30s - loss: 7157527823.5379 - val_loss: 620.2780
Epoch 2/100
4124/4124 [==============================] - 29s - loss: 666.0185 - val_loss: 747.2826
Epoch 3/100
4124/4124 [==============================] - 32s - loss: 433.7659 - val_loss: 566.4491
Epoch 4/100
4124/4124 [==============================] - 34s - loss: 342.9813 - val_loss: 749.6959
Epoch 5/100
4124/4124 [==============================] - 38s - loss: 245.4716 - val_loss: 274.8306
Epoch 6/100
4124/4124 [==============================] - 38s - loss: 190.1679 - val_loss: 529.1939
Epoch 7/100
4124/4124 [==============================] - 33s - loss: 166.3326 - val_loss: 354.3311
Epoch 8/100
4124/4124 [==============================] - 32s - loss: 150.1117 - val_loss: 168.7268
Epoch 9/100
4124/4124 [==============================] - 32s - loss: 131.3647 - val_loss: 131.6620
Epoch 10/100
4124/4124 [==============================] - 32s - loss: 128.5587 - val_loss: 101.5296
Epoch 11/100
4124/4124 [==============================] - 33s - loss: 115.9399 - val_loss: 107.8835
Epoch 12/100
4124/4124 [==============================] - 33s - loss: 110.8212 - val_loss: 216.1792
Epoch 13/100
4124/4124 [==============================] - 32s - loss: 104.4411 - val_loss: 126.4074
Epoch 14/100
4124/4124 [==============================] - 33s - loss: 98.7915 - val_loss: 78.5074
Epoch 15/100
4124/4124 [==============================] - 33s - loss: 99.3747 - val_loss: 100.3919
Epoch 16/100
4124/4124 [==============================] - 33s - loss: 96.6542 - val_loss: 77.6001
Epoch 17/100
4124/4124 [==============================] - 33s - loss: 94.3281 - val_loss: 76.4338
Epoch 18/100
4124/4124 [==============================] - 33s - loss: 91.2105 - val_loss: 75.0284
Epoch 19/100
4124/4124 [==============================] - 33s - loss: 87.4559 - val_loss: 69.9566
Epoch 20/100
4124/4124 [==============================] - 33s - loss: 85.4909 - val_loss: 112.3715
Epoch 21/100
4124/4124 [==============================] - 33s - loss: 86.4884 - val_loss: 92.5914
Epoch 22/100
4124/4124 [==============================] - 33s - loss: 83.6708 - val_loss: 75.9793
Epoch 23/100
4124/4124 [==============================] - 33s - loss: 81.6153 - val_loss: 109.8285
Epoch 24/100
4124/4124 [==============================] - 33s - loss: 78.9488 - val_loss: 69.7006
Epoch 25/100
4124/4124 [==============================] - 33s - loss: 78.7784 - val_loss: 74.2681
Epoch 26/100
4124/4124 [==============================] - 33s - loss: 77.8193 - val_loss: 77.3091
Epoch 27/100
4124/4124 [==============================] - 33s - loss: 73.1039 - val_loss: 74.7508
Epoch 28/100
4124/4124 [==============================] - 33s - loss: 73.7891 - val_loss: 120.4115
Epoch 29/100
4124/4124 [==============================] - 33s - loss: 73.8976 - val_loss: 206.0441
Epoch 30/100
4124/4124 [==============================] - 35s - loss: 73.2177 - val_loss: 73.1416
Epoch 31/100
4124/4124 [==============================] - 33s - loss: 71.8650 - val_loss: 65.3407
Epoch 32/100
4124/4124 [==============================] - 33s - loss: 71.7615 - val_loss: 85.1769
Epoch 33/100
4124/4124 [==============================] - 33s - loss: 72.3781 - val_loss: 86.8130
Epoch 34/100
4124/4124 [==============================] - 33s - loss: 74.6560 - val_loss: 62.9605
Epoch 35/100
4124/4124 [==============================] - 34s - loss: 75.2371 - val_loss: 130.7201
Epoch 36/100
4124/4124 [==============================] - 37s - loss: 85.7366 - val_loss: 72.7771
Epoch 37/100
4124/4124 [==============================] - 34s - loss: 84.8696 - val_loss: 80.4697
Epoch 38/100
4124/4124 [==============================] - 34s - loss: 78.3030 - val_loss: 70.6869
Epoch 39/100
4124/4124 [==============================] - 35s - loss: 76.1913 - val_loss: 64.6130
Epoch 40/100
4124/4124 [==============================] - 35s - loss: 72.8339 - val_loss: 72.8532
Epoch 41/100
4124/4124 [==============================] - 36s - loss: 72.4188 - val_loss: 140.9957
Epoch 42/100
4124/4124 [==============================] - 36s - loss: 72.2809 - val_loss: 77.4582
Epoch 43/100
4124/4124 [==============================] - 35s - loss: 73.3152 - val_loss: 87.1545

In [124]:
y_pred_train = model.predict(train_reshaped)

In [125]:
print("the R2 score is : {}".format(r2_score(y_train, y_pred_train)))


the R2 score is : 0.6261648564459983

Prediction


In [126]:
# Reshape PCA-transformed test rows into (128, 1) sequences: (4209, 128, 1),
# mirroring the training reshape. Vectorized replacement for the per-row loop.
test_reshaped = pca2_results_test.astype(np.float32).reshape(len(pca2_results_test), -1, 1)
test_reshaped.shape


Out[126]:
(4209, 128, 1)

In [127]:
y_pred_test = model.predict(test_reshaped)

In [128]:
output = pd.DataFrame({"ID": test.index, "y": y_pred_test.reshape(-1)})

In [129]:
output.head()


Out[129]:
ID y
0 1 77.908844
1 2 90.723923
2 3 78.807854
3 4 79.746132
4 5 111.386482

In [130]:
output.to_csv("submissions_{}.csv".format(datetime.datetime.today()), index=False)

In [ ]:

Combine submissions (simple averaging ensemble)


In [131]:
sub_1 = pd.read_csv("submission_baseLine.csv")

In [132]:
sub_2 = pd.read_csv("submissions_2017-05-31 15:48:40.546392.csv")

In [133]:
sub_3 = output.copy()

In [141]:
mean_pred = (sub_1.y.values + sub_2.y.values + sub_3.y.values) / 3

In [142]:
output_mean = pd.DataFrame({"ID": test.index, "y": mean_pred})

In [143]:
output_mean.to_csv("submissions_mean_{}.csv".format(datetime.datetime.today()), index=False)

In [ ]:


In [144]:
sub_1 = pd.read_csv("submission_baseLine.csv")

In [145]:
sub_2 = pd.read_csv("submissions_2017-05-31 15:48:40.546392.csv")

In [146]:
mean_pred = (sub_1.y.values + sub_2.y.values ) / 2

In [148]:
output_mean = pd.DataFrame({"ID": test.index, "y": mean_pred})

In [149]:
output_mean.to_csv("submissions_mean_2_{}.csv".format(datetime.datetime.today()), index=False)

In [ ]:


In [ ]: