In [43]:
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA, IncrementalPCA, FastICA
import matplotlib.pyplot as plt
# Apply the "ggplot" matplotlib style to the figures drawn in this notebook.
plt.style.use("ggplot")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [44]:
# Read the train and test CSVs, using the "ID" column as the row index of each.
train, test = (
    pd.read_csv(csv_path, index_col="ID")
    for csv_path in ("data/train.csv", "data/test.csv")
)
In [45]:
# Integer-encode every object-dtype column. Each encoder is fitted on the
# union of the train and test values so both frames share one mapping.
categorical_cols = [col for col in train.columns if train[col].dtype == 'object']
for col in categorical_cols:
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# Report the resulting frame shapes.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))
In [46]:
from sklearn.decomposition import PCA, FastICA, TruncatedSVD

# Number of components extracted by each decomposition.
n_comp = 24

# Fit the decompositions on the original feature columns only. Excluding the
# target and any previously appended `pca_*` / `ica_*` columns keeps this cell
# idempotent: re-running it no longer feeds its own outputs back into the fit.
# Indexing `test` with the same list also guarantees both frames present the
# columns in the same order.
derived_prefixes = ("pca_", "ica_")
feature_cols = [
    col for col in train.columns
    if col != "y" and not str(col).startswith(derived_prefixes)
]
X_train = train[feature_cols]
X_test = test[feature_cols]

# tSVD
# NOTE(review): the tSVD results are computed but never appended to the
# datasets below -- confirm whether that omission is intentional.
tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
tsvd_results_train = tsvd.fit_transform(X_train)
tsvd_results_test = tsvd.transform(X_test)

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(X_train)
pca2_results_test = pca.transform(X_test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(X_train)
ica2_results_test = ica.transform(X_test)

# Append decomposition components to datasets (columns are numbered from 1).
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

# Training target and its mean.
y_train = train["y"]
y_mean = np.mean(y_train)
In [47]:
# Mean of the training target; used below as XGBoost's `base_score`.
y_mean = np.average(y_train)
# Bare expression so the notebook displays the value.
y_mean
Out[47]:
In [48]:
# Booster configuration shared by the xgb.cv and xgb.train calls.
# The number of boosting rounds is controlled by the `num_boost_round`
# argument of those calls, not by this dict. The former "n_trees" entry is not
# an XGBoost parameter: it had no effect and contradicted the round counts
# actually used, so it was removed.
# NOTE(review): "silent" and "reg:linear" are deprecated in newer XGBoost
# releases (see "verbosity" and "reg:squarederror") -- confirm the installed
# version before changing them.
xgb_params = {
    "eta": 0.005,
    "max_depth": 4,
    "subsample": 0.95,
    "objective": "reg:linear",
    "eval_metric": "rmse",
    "base_score": y_mean,  # start boosting from the mean of the target
    "silent": 1,
    "seed": 0
}
In [49]:
# Wrap the data in XGBoost's DMatrix containers: training features (target
# column removed) with their labels, and the unlabeled test features.
train_features = train.drop("y", axis=1)
dtrain = xgb.DMatrix(train_features, label=y_train)
dtest = xgb.DMatrix(test)
In [8]:
# Cross-validate the booster for up to 1000 rounds, stopping early after 50
# rounds without improvement and logging progress every 50 rounds.
cv_output = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=False,
)
In [9]:
# Plot the mean train and test RMSE per boosting round; the trailing
# semicolon suppresses the Axes repr in the cell output.
rmse_curves = cv_output[["train-rmse-mean", "test-rmse-mean"]]
rmse_curves.plot();
In [50]:
# Train the final booster on the full training set for a fixed number of
# rounds, using the shared parameters with "silent" overridden to 0.
num_boost_rounds = 1500
verbose_params = dict(xgb_params)
verbose_params["silent"] = 0
model = xgb.train(verbose_params, dtrain, num_boost_round=num_boost_rounds)
In [51]:
# Show the 20 most important features of the trained booster.
xgb.plot_importance(booster=model, max_num_features=20)
Out[51]:
In [52]:
# Predict on the training matrix itself (in-sample predictions).
y_pred = model.predict(dtrain)
In [53]:
# R^2 of the predictions against the training target. This is an in-sample
# score, so it does not measure generalization.
r2_score(y_true=y_train, y_pred=y_pred)
Out[53]:
In [54]:
# Predict the target for the test matrix; these values go into the submission.
y_predict = model.predict(dtest)
In [55]:
# Display the test index (the "ID" column chosen as index when loading).
test.index
Out[55]:
In [56]:
# Assemble the submission table: one row per test ID with its predicted y.
submission_columns = {"ID": test.index, "y": y_predict}
output = pd.DataFrame(submission_columns, columns=["ID", "y"])
In [57]:
# Preview the first rows of the submission table.
output.head()
Out[57]:
In [58]:
# Write the submission CSV with a filesystem-safe timestamp suffix. Formatting
# the raw datetime object would embed spaces and colons in the file name
# (e.g. "2017-06-20 12:34:56.789"), and colons are invalid in Windows paths.
timestamp = datetime.datetime.today().strftime("%Y%m%d_%H%M%S")
output.to_csv("submissions_feature_reduction_{}.csv".format(timestamp), index=False)
In [ ]:
In [ ]: