In [48]:
import datetime
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
import itertools
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
In [9]:
# Load the Kaggle (Mercedes-Benz-style) train/test sets, using the ID column
# as the index so it survives into the submission file later.
# NOTE(review): relative "data/" path — assumes the notebook is launched from
# the project root; confirm before a fresh run.
train = pd.read_csv("data/train.csv", index_col="ID")
test = pd.read_csv("data/test.csv", index_col="ID")
In [10]:
# Integer-encode every categorical (object-dtype) column in place.
# Each encoder is fitted on the union of train and test values so labels that
# appear only in the test set still get a valid code at transform time.
# (Indentation restored — the exported source had the loop body flattened.)
for col in train.columns:
    if train[col].dtype == "object":
        encoder = preprocessing.LabelEncoder()
        encoder.fit(list(train[col].values) + list(test[col].values))
        train[col] = encoder.transform(list(train[col].values))
        test[col] = encoder.transform(list(test[col].values))
In [11]:
# Dimensionality-reduction features: project the feature matrix with five
# decomposition / random-projection methods and append the first `n_comp`
# components of each as new columns on both train and test
# (columns named "<method>_1" ... "<method>_12", same as before).
#
# FIX(review): the original repeated the same fit/transform stanza five times
# and recomputed train.drop(["y"], axis=1) for every method; collapsed into a
# single loop over a dict of reducers. The per-method intermediate arrays
# (pca2_results_train, ...) were not referenced by any later cell and are no
# longer kept as separate globals.
n_comp = 12

# `drop` returns a copy, so appending columns to `train` inside the loop does
# not change the matrix the remaining reducers are fitted on. `test` must be
# snapshotted for the same reason (new columns are appended to it below).
X_train = train.drop(["y"], axis=1)
X_test = test.copy()

reducers = {
    "tsvd": TruncatedSVD(n_components=n_comp, random_state=42),
    "pca": PCA(n_components=n_comp, random_state=42),
    "ica": FastICA(n_components=n_comp, random_state=42),
    "grp": GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=42),
    "srp": SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=42),
}

for name, reducer in reducers.items():
    components_train = reducer.fit_transform(X_train)  # fit on train only
    components_test = reducer.transform(X_test)
    for i in range(1, n_comp + 1):
        train[name + "_" + str(i)] = components_train[:, i - 1]
        test[name + "_" + str(i)] = components_test[:, i - 1]
In [14]:
# Target vector and its mean; the mean is fed to xgboost as base_score below
# so boosting starts from the average target instead of the default 0.5.
y_train = train["y"]
y_mean = y_train.mean()
y_mean
Out[14]:
In [42]:
# Hyperparameters shared by xgb.cv and xgb.train below.
xgb_params = {
"n_trees": 500,  # NOTE(review): "n_trees" is not a standard xgb.train parameter — likely ignored; rounds are controlled by num_boost_round. Verify.
"eta": 0.005,  # very small learning rate, paired with many boosting rounds
"max_depth": 4,
"subsample": 0.95,  # row subsampling ratio per tree
"objective": "reg:linear",  # NOTE(review): renamed "reg:squarederror" in xgboost >= 1.0 — confirm installed version still accepts this
"eval_metric": "rmse",
"base_score": y_mean,  # initial prediction = mean of the training target
"silent": 1  # NOTE(review): deprecated in newer xgboost in favor of "verbosity"
}
In [16]:
# Wrap features (original + appended decomposition columns) in XGBoost's
# DMatrix format; dtrain carries the label, dtest does not.
dtrain = xgb.DMatrix(train.drop("y", axis=1), y_train)
dtest = xgb.DMatrix(test)
In [17]:
# Cross-validation to pick the number of boosting rounds (RMSE, early stopping).
# FIX(review): the original continuation line was commented out, leaving the
# xgb.cv(...) call unterminated — a SyntaxError on Restart & Run All. Restored
# the keyword arguments (they match the duplicate CV cell further down).
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=2000, early_stopping_rounds=50,
                   verbose_eval=50, show_stdv=False)
In [27]:
# Round counts tried by hand: the CV-selected count, 750, 1250, 1500.
# 1250 is the manually chosen compromise — NOTE(review): a magic number;
# consider deriving it from cv_output instead.
num_boost_rounds = 1250
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
In [28]:
# Top-20 feature importances of the trained booster (cell output is the Axes).
xgb.plot_importance(model, max_num_features=20)
Out[28]:
In [31]:
# Predictions on the *training* set, used below for an in-sample R^2.
y_pred = model.predict(dtrain)
In [32]:
# NOTE(review): R^2 on the data the model was trained on — optimistic; it does
# not estimate generalization (use the CV RMSE above for that).
r2_score(y_train, y_pred)
Out[32]:
In [43]:
# Re-run CV, then train one booster per candidate round count, recording the
# in-sample R^2 and the test-set predictions for each.
# (Indentation of the loop body restored — the export had flattened it.)
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=2000, early_stopping_rounds=50,
                   verbose_eval=50, show_stdv=False)

# Candidates: the CV-selected round count plus three manual choices.
num_boost = [len(cv_output), 750, 1250, 1500]
r2_value = []
prediction = []
for n_rounds in num_boost:
    print(n_rounds)
    # `model` is intentionally a global: the cell below reuses the last one.
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=n_rounds)
    train_pred = model.predict(dtrain)
    test_pred = model.predict(dtest)
    # NOTE(review): training-set R^2 — optimistic, see comment on the cell above.
    r2_value.append(r2_score(y_train, train_pred))
    prediction.append(test_pred)
In [44]:
# Test predictions from the *last* model of the loop above (1500 rounds),
# not from the CV-selected round count.
y_predict = model.predict(dtest)
In [45]:
# Cell output: the raw prediction array.
y_predict
Out[45]:
In [46]:
# Cell output: in-sample R^2 per candidate round count (same order as num_boost).
r2_value
Out[46]:
In [60]:
# Average the four models' test predictions element-wise: each prediction
# array becomes a column after transposing, then we take the row-wise mean.
mean_values = pd.DataFrame(prediction).T.mean(axis=1)
In [61]:
# Build the submission frame; test.index and mean_values are paired
# positionally (both have one row per test sample, in the same order).
output = pd.DataFrame({"ID": test.index, "y": mean_values})
In [63]:
# Submission: ensemble average of the four boosters' test predictions.
output.to_csv("submissions_xgb_average.csv", index=False)
In [65]:
# Inspect the test frame (now includes the appended decomposition columns).
# NOTE(review): this dumps the entire frame into the notebook — prefer
# test.head() to keep the saved file small.
test
Out[65]:
In [ ]:
In [ ]: