In [103]:
import datetime
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
import pandas as pd
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
In [3]:
train = pd.read_csv("data/train.csv", index_col="ID")
test = pd.read_csv("data/test.csv", index_col="ID")
In [4]:
train["data"] = "train"
test["data"] = "test"
combined_data = pd.concat([train, test])
encoded = pd.get_dummies(combined_data[["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]])
drop_cat = combined_data.drop(["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"], axis=1)
combined_data_clean = drop_cat.join(encoded)
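In [ ]:
# Optional sanity check, a sketch not in the original run: after one-hot encoding the
# categorical columns X0-X8, the only remaining object-typed column should be the
# "data" train/test flag added above.
combined_data_clean.select_dtypes(include=["object"]).columns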
In [5]:
train_data = combined_data_clean[combined_data_clean.data == "train"].copy()
test_data = combined_data_clean[combined_data_clean.data == "test"].copy()
train_data.drop("data", axis=1, inplace=True)
test_data.drop(["data", "y"], axis=1, inplace=True)
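In [ ]:
# Optional sketch (not in the original run): confirm the split of the combined frame
# restored the original row counts of train and test.
assert train_data.shape[0] == train.shape[0]
assert test_data.shape[0] == test.shape[0]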
In [6]:
train_data.columns
Out[6]:
In [7]:
test_data.columns
Out[7]:
In [8]:
y_train = train_data["y"].astype(np.float32)
x_train = train_data.drop("y", axis=1).astype(np.float32)
x_test = test_data.astype(np.float32)
In [9]:
x_train.shape
Out[9]:
In [61]:
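# Reshape each row into a (n_features, 1) array so the RNN sees one feature per timestep.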
train_reshaped = np.array([i.reshape((-1, 1)) for i in x_train.values])
train_reshaped = train_reshaped.astype(np.float32)
train_reshaped.shape
Out[61]:
In [ ]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])]
vif["features"] = x_train.columns
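In [ ]:
# Hedged sketch, not part of the original run: inspect the most collinear features.
# With many one-hot encoded binaries the VIF values can be very large or inf,
# which motivates the PCA step below.
vif.sort_values("VIF Factor", ascending=False).head(10)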
In [88]:
n_comp = 128
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)
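In [ ]:
# Optional sketch (not in the original run): how much of the variance the 128 components retain.
pca.explained_variance_ratio_.sum()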
In [89]:
train_reshaped = np.array([i.reshape((-1, 1)) for i in pca2_results_train])
train_reshaped = train_reshaped.astype(np.float32)
train_reshaped.shape
Out[89]:
In [122]:
# Idea: Simple model
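# input_shape=(None, 1): sequences of arbitrary length with one value per timestep,
# matching the (n_components, 1) reshape of each PCA-transformed row above.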
model = Sequential()
model.add(Bidirectional(SimpleRNN(128, return_sequences=True, activation="relu"), input_shape=(None, 1)))
model.add(Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")))
model.add(Bidirectional(SimpleRNN(32, return_sequences=False, activation="relu")))
model.add(Dropout(0.5))
model.add(Dense(1, activation="linear"))
model.compile(optimizer="rmsprop", loss="mse")
model.summary()
In [105]:
# Idea: Funnel -> reduce information after each layer / deep model
model = Sequential()
model.add(Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu"), input_shape=(None, 1)))
model.add(Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")))
model.add(TimeDistributed(Dense(32, activation="relu")))
model.add(Bidirectional(SimpleRNN(32, return_sequences=True, activation="relu")))
model.add(Bidirectional(SimpleRNN(32, return_sequences=True, activation="relu")))
model.add(TimeDistributed(Dense(16, activation="relu")))
model.add(Bidirectional(SimpleRNN(16, return_sequences=False, activation="relu")))
model.add(Dropout(0.5))
model.add(Dense(1, activation="linear"))
model.compile(optimizer="rmsprop", loss="mse")
model.summary()
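In [ ]:
# Hedged sketch, not from the original notebook: LSTM is imported above but never used.
# A smaller LSTM version of the same funnel idea might look like this (illustration only,
# never trained here).
model_lstm = Sequential()
model_lstm.add(Bidirectional(LSTM(64, return_sequences=True, activation="relu"), input_shape=(None, 1)))
model_lstm.add(TimeDistributed(Dense(32, activation="relu")))
model_lstm.add(Bidirectional(LSTM(32, return_sequences=False, activation="relu")))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1, activation="linear"))
model_lstm.compile(optimizer="rmsprop", loss="mse")
model_lstm.summary()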
In [123]:
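# Early stopping watches the training loss; the checkpoint filename records the val_loss
# computed on the 2% validation split passed to fit below.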
early_stop = EarlyStopping(monitor="loss", patience=10)
file_path = "weights.{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(file_path)
model_run = model.fit(train_reshaped, y_train, epochs=100, validation_split=0.02, callbacks=[early_stop, checkpoint])
In [124]:
y_pred_train = model.predict(train_reshaped)
In [125]:
print("the R2 score is : {}".format(r2_score(y_train, y_pred_train)))
In [126]:
test_reshaped = np.array([i.reshape((-1, 1)) for i in pca2_results_test])
test_reshaped = test_reshaped.astype(np.float32)
test_reshaped.shape
Out[126]:
In [127]:
y_pred_test = model.predict(test_reshaped)
In [128]:
output = pd.DataFrame({"ID": test.index, "y": y_pred_test.reshape(-1)})
In [129]:
output.head()
Out[129]:
In [130]:
output.to_csv("submissions_{}.csv".format(datetime.datetime.today()), index=False)
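In [ ]:
# Hedged alternative, an assumption rather than the original behaviour: datetime.datetime.today()
# embeds spaces and colons in the filename, which some filesystems reject; a strftime-formatted
# timestamp produces a safer name.
safe_stamp = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
output.to_csv("submissions_{}.csv".format(safe_stamp), index=False)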
In [ ]:
In [131]:
sub_1 = pd.read_csv("submission_baseLine.csv")
In [132]:
sub_2 = pd.read_csv("submissions_2017-05-31 15:48:40.546392.csv")
In [133]:
sub_3 = output.copy()
In [141]:
mean_pred = (sub_1.y.values + sub_2.y.values + sub_3.y.values) / 3
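In [ ]:
# Optional sketch (not in the original run): the simple average above assumes all three
# submissions list the IDs in the same order.
assert (sub_1.ID.values == sub_2.ID.values).all()
assert (sub_1.ID.values == sub_3.ID.values).all()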
In [142]:
output_mean = pd.DataFrame({"ID": test.index, "y": mean_pred})
In [143]:
output_mean.to_csv("submissions_mean_{}.csv".format(datetime.datetime.today()), index=False)
In [ ]:
In [144]:
sub_1 = pd.read_csv("submission_baseLine.csv")
In [145]:
sub_2 = pd.read_csv("submissions_2017-05-31 15:48:40.546392.csv")
In [146]:
mean_pred = (sub_1.y.values + sub_2.y.values ) / 2
In [148]:
output_mean = pd.DataFrame({"ID": test.index, "y": mean_pred})
In [149]:
output_mean.to_csv("submissions_mean_2_{}.csv".format(datetime.datetime.today()), index=False)