In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import PCA, FastICA
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
In [2]:
# Read the train and test splits; the "ID" column is used as the row index of
# each frame rather than being kept as a regular column.
train, test = (
    pd.read_csv(csv_path, index_col="ID")
    for csv_path in ("data/train.csv", "data/test.csv")
)
In [3]:
# Integer-encode every object-dtype column. Each encoder is fitted on the
# concatenated train and test values so both frames share a single mapping and
# the test frame cannot contain a label the encoder has never seen.
object_cols = [col for col in train.columns if train[col].dtype == "object"]
for col in object_cols:
    encoder = LabelEncoder()
    encoder.fit(list(train[col].values) + list(test[col].values))
    train[col] = encoder.transform(list(train[col].values))
    test[col] = encoder.transform(list(test[col].values))
In [4]:
# Number of components extracted by each decomposition.
n_comp = 10

# Both decompositions are fitted on the training features only (target "y"
# removed) and then applied unchanged to the test frame.
train_features = train.drop(columns=["y"])

# Principal component analysis.
pca = PCA(random_state=42, n_components=n_comp)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# Independent component analysis.
ica = FastICA(random_state=42, n_components=n_comp)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)
In [ ]:
# Disabled alternative, kept as an inert string literal: it would build
# x_train / x_test from the PCA and ICA components alone instead of appending
# the components to the original columns. None of the code inside the string
# runs.
"""
pca_train_df = pd.DataFrame(pca2_results_train).add_prefix("pca_")
ica_train_df = pd.DataFrame(ica2_results_train).add_prefix("ica_")
x_train = pd.concat([pca_train_df, ica_train_df], axis=1)
pca_test_df = pd.DataFrame(pca2_results_test).add_prefix("pca_")
ica_test_df = pd.DataFrame(ica2_results_test).add_prefix("ica_")
x_test = pd.concat([pca_test_df, ica_test_df], axis=1)
"""
In [5]:
# Append each PCA/ICA component to both frames as columns "pca_1".."pca_N" and
# "ica_1".."ica_N" (1-based names, interleaved pca/ica per component).
for col_idx in range(n_comp):
    suffix = str(col_idx + 1)
    train["pca_" + suffix] = pca2_results_train[:, col_idx]
    test["pca_" + suffix] = pca2_results_test[:, col_idx]
    train["ica_" + suffix] = ica2_results_train[:, col_idx]
    test["ica_" + suffix] = ica2_results_test[:, col_idx]

# Regression target and its mean (used later as the XGBoost base score).
y_train = train["y"]
y_mean = np.mean(y_train)
In [6]:
# Wrap the frames in XGBoost's DMatrix container: the training matrix carries
# the target as its label, the test matrix has features only.
train_matrix_input = train.drop(columns="y")
dtrain = xgb.DMatrix(train_matrix_input, label=y_train)
dtest = xgb.DMatrix(data=test)
In [7]:
# Wall-clock start of the run; the last cell prints the time elapsed since here.
startTime = datetime.now()
In [10]:
# Train one XGBoost regressor per random seed, score each model on the training
# data, and write one submission CSV per seed. The training R^2 is embedded in
# the file name and also collected in r2_list.
r2_list = []  # (seed, training R^2) tuples, one per trained model
seed_nr = 10  # seeds 0 .. seed_nr - 1 are used

# DataFrame.to_csv does not create missing parent directories; create the
# output directory up front so a fresh checkout does not fail only after the
# first model has finished training.
os.makedirs("xgb_various_seeds", exist_ok=True)

# The training labels are identical for every seed, so fetch them once.
train_labels = dtrain.get_label()

for seed in range(seed_nr):
    xgb_params = {
        # NOTE(review): "n_trees" does not appear to be a documented XGBoost
        # booster parameter; the number of trees is governed by
        # num_boost_round below. Kept as-is pending confirmation.
        "n_trees": 500,
        "eta": 0.005,
        "max_depth": 4,
        "subsample": 0.95,
        # NOTE(review): newer XGBoost releases name this objective
        # "reg:squarederror" -- confirm against the installed version.
        "objective": "reg:linear",
        "eval_metric": "rmse",
        "base_score": y_mean,  # start predictions from the target mean
        "silent": 1,
        "seed": seed,  # the only setting that varies between models
    }
    # Number of rounds was previously derived from cross-validation:
    # cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=2000, early_stopping_rounds=50,
    #                    verbose_eval=200, show_stdv=False)
    # num_boost_rounds = len(cv_output)  # old 900
    num_boost_rounds = 900

    # "silent" is overridden to 0 for the actual training call, so the value in
    # xgb_params only matters for the (disabled) cross-validation above.
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

    # r2_score's signature is (y_true, y_pred) and R^2 is not symmetric in its
    # arguments, so the true labels must come first.
    r2_value = r2_score(train_labels, model.predict(dtrain))

    y_pred = model.predict(dtest)
    # NOTE(review): the ID column is written as lowercase "id" -- confirm this
    # matches the header the submission format expects.
    output = pd.DataFrame({"id": test.index, "y": y_pred})
    output.to_csv("xgb_various_seeds/submission_{seed}_{r2}.csv".format(seed=seed, r2=r2_value), index=False)
    r2_list.append((seed, r2_value))
In [ ]:
# Report the wall-clock time elapsed since startTime was recorded.
elapsed = datetime.now() - startTime
print(elapsed)
In [ ]: