In [1]:
import numpy as np
import pandas as pd 
import xgboost as xgb
import keras.backend as K

from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import RMSprop
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.metrics import r2_score


Using TensorFlow backend.

In [2]:
train = pd.read_csv("data/train.csv", index_col="ID")
test = pd.read_csv("data/test.csv", index_col="ID")

In [3]:
for c in train.columns:
    if train[c].dtype == "object":
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [4]:
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)


/Users/datitran/anaconda/envs/kaggle/lib/python3.5/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)

In [5]:
#train = train.iloc[:, :9].copy()
#test = test.iloc[:, :8].copy()

In [6]:
for i in range(1, n_comp+1):
    train["pca_" + str(i)] = pca2_results_train[:,i-1]
    test["pca_" + str(i)] = pca2_results_test[:, i-1]
    
    train["ica_" + str(i)] = ica2_results_train[:,i-1]
    test["ica_" + str(i)] = ica2_results_test[:, i-1]
    
y_train = train["y"]
y_mean = np.mean(y_train)

In [7]:
x_train = train.drop("y", axis=1).values.astype(np.float32)
x_test = test.values.astype(np.float32)

In [8]:
x_train.shape, x_test.shape


Out[8]:
((4209, 396), (4209, 396))

In [9]:
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [10]:
def r2_keras(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true-y_pred )) 
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [32]:
rmsprop = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08, decay=0.0)

def model():
    model = Sequential()
    model.add(Dense(units=, activation="relu",input_dim=x_train.shape[1]))
    model.add(Dense(units=396, activation="relu"))
    model.add(Dense(units=, activation="relu"))
    model.add(Dense(units=512, activation="relu"))
    model.add(Dense(units=512, activation="relu"))
    model.add(Dense(units=512, activation="relu"))
    model.add(Dense(units=1, activation="linear"))
    model.compile(loss="mse", optimizer=rmsprop, metrics=[r2_keras])
    #model.summary()
    return model

In [12]:
callbacks = [
    EarlyStopping(monitor="val_r2_keras", patience=20)
    #ModelCheckpoint("weights.{epoch:02d}-{val_loss:.2f}.hdf5")
]

estimator = KerasRegressor(
    build_fn=model, 
    nb_epoch=100, 
    batch_size=32,
    verbose=0
)

In [ ]:
estimator.fit(x_train_scaled, y_train, batch_size=32, epochs=200, verbose=2, callbacks=callbacks, validation_split=0.02)

In [ ]:
y_pred_train = estimator.predict(x_train_scaled)

In [ ]:
prediction = estimator.predict(x_test_scaled)

In [ ]:
prediction

In [ ]:
print("the R2 score is : {}".format(r2_score(y_train, y_pred_train)))

Ensemble the model


In [33]:
r2_score_list = []
prediction_list = []
for i in range(1):
    estimator.fit(x_train_scaled, y_train, batch_size=32, 
                  epochs=50, verbose=2, callbacks=callbacks,
                  validation_split=0.1, shuffle=False)
    y_pred_train = estimator.predict(x_train_scaled)
    prediction = estimator.predict(x_test_scaled)
    prediction_list.append(prediction)
    r2_value = r2_score(y_train, y_pred_train)
    print("Number: {}, R^2: {}".format(i, r2_value))
    r2_score_list.append((i, r2_value))


Train on 3788 samples, validate on 421 samples
Epoch 1/50
0s - loss: 8497.0696 - r2_keras: -5.9802e+01 - val_loss: 5688.8226 - val_r2_keras: -4.7048e+01
Epoch 2/50
0s - loss: 3063.7014 - r2_keras: -2.0552e+01 - val_loss: 596.3211 - val_r2_keras: -3.9942e+00
Epoch 3/50
0s - loss: 265.6156 - r2_keras: -7.4332e-01 - val_loss: 156.3795 - val_r2_keras: -2.6558e-01
Epoch 4/50
0s - loss: 154.8358 - r2_keras: 0.0207 - val_loss: 104.6440 - val_r2_keras: 0.1623
Epoch 5/50
0s - loss: 121.1724 - r2_keras: 0.2567 - val_loss: 77.3172 - val_r2_keras: 0.3868
Epoch 6/50
0s - loss: 103.4057 - r2_keras: 0.3806 - val_loss: 63.6953 - val_r2_keras: 0.4983
Epoch 7/50
0s - loss: 93.7359 - r2_keras: 0.4478 - val_loss: 56.8824 - val_r2_keras: 0.5542
Epoch 8/50
0s - loss: 88.1637 - r2_keras: 0.4862 - val_loss: 53.2024 - val_r2_keras: 0.5848
Epoch 9/50
0s - loss: 84.7483 - r2_keras: 0.5097 - val_loss: 50.9846 - val_r2_keras: 0.6035
Epoch 10/50
0s - loss: 82.5202 - r2_keras: 0.5249 - val_loss: 49.5240 - val_r2_keras: 0.6160
Epoch 11/50
0s - loss: 80.9714 - r2_keras: 0.5355 - val_loss: 48.4918 - val_r2_keras: 0.6250
Epoch 12/50
0s - loss: 79.8321 - r2_keras: 0.5432 - val_loss: 47.7326 - val_r2_keras: 0.6316
Epoch 13/50
0s - loss: 78.9550 - r2_keras: 0.5491 - val_loss: 47.1445 - val_r2_keras: 0.6368
Epoch 14/50
0s - loss: 78.2526 - r2_keras: 0.5538 - val_loss: 46.6723 - val_r2_keras: 0.6409
Epoch 15/50
0s - loss: 77.6721 - r2_keras: 0.5577 - val_loss: 46.2824 - val_r2_keras: 0.6444
Epoch 16/50
0s - loss: 77.1796 - r2_keras: 0.5609 - val_loss: 45.9422 - val_r2_keras: 0.6473
Epoch 17/50
0s - loss: 76.7505 - r2_keras: 0.5637 - val_loss: 45.6478 - val_r2_keras: 0.6499
Epoch 18/50
0s - loss: 76.3729 - r2_keras: 0.5661 - val_loss: 45.4004 - val_r2_keras: 0.6520
Epoch 19/50
0s - loss: 76.0353 - r2_keras: 0.5683 - val_loss: 45.1856 - val_r2_keras: 0.6539
Epoch 20/50
0s - loss: 75.7311 - r2_keras: 0.5702 - val_loss: 45.0010 - val_r2_keras: 0.6554
Epoch 21/50
0s - loss: 75.4521 - r2_keras: 0.5719 - val_loss: 44.8445 - val_r2_keras: 0.6568
Epoch 22/50
0s - loss: 75.1957 - r2_keras: 0.5735 - val_loss: 44.7094 - val_r2_keras: 0.6579
Number: 0, R^2: 0.527942307065526

In [34]:
value = np.zeros(len(prediction_list[0]))
for i in prediction_list:
    value += i

In [35]:
average_prediction = value / len(prediction_list)

In [36]:
average_prediction


Out[36]:
array([  73.09261322,   89.23135376,   73.39955902, ...,   93.74902344,
        107.94994354,   92.85961151])

In [ ]:
output = pd.DataFrame({"id": test.index, "y": average_prediction})

In [ ]:
output.to_csv("submission_neural_network_average.csv", index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]: