In [48]:
import datetime
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

import itertools
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

In [9]:
train = pd.read_csv("data/train.csv", index_col="ID")
test = pd.read_csv("data/test.csv", index_col="ID")

In [10]:
for c in train.columns:
    if train[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))
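
The encoder is fit on the union of train and test values so that categories appearing only in the test set still get an integer code at transform time. A minimal toy sketch of that behaviour (made-up category strings, not competition data):

le = preprocessing.LabelEncoder()
le.fit(["az", "bc", "az", "t"] + ["w"])   # union of train and test categories
le.transform(["az", "bc", "t"])           # array([0, 1, 2])
le.transform(["w"])                       # array([3]) -- no "unseen label" error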

In [11]:
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=42)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=42)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]


/Users/datitran/anaconda/envs/kaggle/lib/python3.5/site-packages/sklearn/decomposition/fastica_.py:116: UserWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
  warnings.warn('FastICA did not converge. Consider increasing '
/Users/datitran/anaconda/envs/kaggle/lib/python3.5/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)
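
The FastICA warning above means the algorithm hit its default iteration limit before converging. A minimal sketch of one way to address it, meant to replace the ICA block above rather than run in addition to it (max_iter and tol are standard FastICA parameters; the values here are illustrative):

ica = FastICA(n_components=n_comp, max_iter=1000, tol=1e-3, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)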

In [14]:
y_train = train["y"]
y_mean = np.average(y_train)
y_mean


Out[14]:
100.66931812782134

In [42]:
xgb_params = {
    "n_trees": 500,
    "eta": 0.005,              # small learning rate, offset by a large number of boosting rounds
    "max_depth": 4,
    "subsample": 0.95,
    "objective": "reg:linear",
    "eval_metric": "rmse",
    "base_score": y_mean,      # start boosting from the mean of y instead of the default 0.5
    "silent": 1
}

In [16]:
dtrain = xgb.DMatrix(train.drop("y", axis=1), y_train)
dtest = xgb.DMatrix(test)
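
A quick shape check (illustrative; num_row and num_col are DMatrix accessors): both matrices should expose the same 436 feature columns now that the decomposition components have been appended.

dtrain.num_row(), dtrain.num_col()
dtest.num_row(), dtest.num_col()   # the test set has 4209 rows and 436 columns (see the table at the end)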

In [17]:
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=2000, early_stopping_rounds=50,
                   verbose_eval=50, show_stdv=False)


[0]	train-rmse:12.64	test-rmse:12.638
[50]	train-rmse:11.092	test-rmse:11.1459
[100]	train-rmse:10.0189	test-rmse:10.1351
[150]	train-rmse:9.28968	test-rmse:9.47303
[200]	train-rmse:8.80062	test-rmse:9.05063
[250]	train-rmse:8.47291	test-rmse:8.78791
[300]	train-rmse:8.25098	test-rmse:8.62768
[350]	train-rmse:8.0908	test-rmse:8.53291
[400]	train-rmse:7.95357	test-rmse:8.47836
[450]	train-rmse:7.82536	test-rmse:8.4491
[500]	train-rmse:7.70952	test-rmse:8.43563
[550]	train-rmse:7.61704	test-rmse:8.42952
[600]	train-rmse:7.53256	test-rmse:8.42824
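
Because early_stopping_rounds is set, the returned DataFrame is truncated at the best iteration, so its length can be used directly as the number of boosting rounds. A minimal sketch (column names as produced by this version of xgboost):

best_rounds = len(cv_output)                       # 576 in the run below
best_rmse = cv_output["test-rmse-mean"].iloc[-1]
print(best_rounds, best_rmse)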

In [27]:
# candidate numbers of boosting rounds: the CV optimum (~576 from above), 750, 1250, 1500
num_boost_rounds = 1250
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [28]:
xgb.plot_importance(model, max_num_features=20)


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x110d824e0>
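
The rendered figure is not preserved in this export, but the same information can be pulled out numerically (a sketch; get_fscore is the Booster method for weight-based importance counts):

importance = model.get_fscore()
sorted(importance.items(), key=lambda kv: kv[1], reverse=True)[:20]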

In [31]:
y_pred = model.predict(dtrain)

In [32]:
r2_score(y_train, y_pred)


Out[32]:
0.68658199455340752

In [43]:
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=2000, early_stopping_rounds=50, 
                   verbose_eval=50, show_stdv=False)
num_boost = [len(cv_output), 750, 1250, 1500]
r2_value = []
prediction = []

for i in num_boost:
    print(i)
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=i)
    y_pred_train = model.predict(dtrain)
    y_pred_test = model.predict(dtest)
    r2_value.append(r2_score(y_train, y_pred_train))
    prediction.append(y_pred_test)


[0]	train-rmse:12.64	test-rmse:12.638
[50]	train-rmse:11.092	test-rmse:11.1459
[100]	train-rmse:10.0189	test-rmse:10.1351
[150]	train-rmse:9.28968	test-rmse:9.47303
[200]	train-rmse:8.80062	test-rmse:9.05063
[250]	train-rmse:8.47291	test-rmse:8.78791
[300]	train-rmse:8.25098	test-rmse:8.62768
[350]	train-rmse:8.0908	test-rmse:8.53291
[400]	train-rmse:7.95357	test-rmse:8.47836
[450]	train-rmse:7.82536	test-rmse:8.4491
[500]	train-rmse:7.70952	test-rmse:8.43563
[550]	train-rmse:7.61704	test-rmse:8.42952
[600]	train-rmse:7.53256	test-rmse:8.42824
576
750
1250
1500

Prediction


In [44]:
y_predict = model.predict(dtest)

In [45]:
y_predict


Out[45]:
array([  85.18135071,   97.18682861,   80.05259705, ...,   93.39948273,
        109.8515625 ,   93.75300598], dtype=float32)

In [46]:
r2_value


Out[46]:
[0.61848415826468184,
 0.64248259587514533,
 0.68658199455340752,
 0.70297983205481929]
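
Pairing each round count with its in-sample R² (a one-line sketch) makes the pattern explicit: training R² rises monotonically with more boosting rounds, which reflects a tighter fit to the training data rather than better generalisation, hence the averaging across all four models below.

list(zip(num_boost, r2_value))
# [(576, 0.618...), (750, 0.642...), (1250, 0.687...), (1500, 0.703...)]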

In [60]:
mean_values = pd.DataFrame(prediction).T.mean(axis=1)
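
An equivalent, arguably simpler way to average the four prediction arrays (a sketch; it should give the same values as the pandas expression above):

mean_values_np = np.mean(prediction, axis=0)
np.allclose(mean_values, mean_values_np)   # expected to be True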

In [61]:
output = pd.DataFrame({"ID": test.index, "y": mean_values})

In [63]:
output.to_csv("submissions_xgb_average.csv", index=False)
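
A quick sanity check before uploading (illustrative; 4209 is the number of test rows shown in the table below):

output.shape    # expected (4209, 2)
output.head()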

In [65]:
test


Out[65]:
X0 X1 X2 X3 X4 X5 X6 X8 X10 X11 ... pca_11 ica_11 tsvd_11 grp_11 srp_11 pca_12 ica_12 tsvd_12 grp_12 srp_12
ID
1 24 23 38 5 3 26 0 22 0 0 ... -1.714782 0.003638 1.250838 7.046348 1.271178 3.610562 -0.021043 2.593055 26.800662 2.542356
2 46 3 9 0 3 9 6 24 0 0 ... 0.170500 -0.013091 0.158002 7.160356 5.084712 -0.177144 -0.024816 -1.128290 8.807027 5.084712
3 24 23 19 5 3 0 9 9 0 0 ... -0.982514 0.012243 0.828812 3.644108 3.813534 3.307781 0.006142 1.492705 6.044545 2.542356
4 24 13 38 5 3 32 11 13 0 0 ... -1.891332 -0.007817 1.165267 1.205195 2.542356 3.900539 -0.003266 3.635485 29.311298 2.542356
5 49 20 19 2 3 31 8 12 0 0 ... 0.372387 0.016720 -0.412456 6.412039 2.542356 0.203514 -0.005647 0.534005 18.929148 3.813534
8 51 1 9 4 3 30 6 18 0 0 ... 2.187761 -0.017054 -1.890669 8.139587 5.084712 0.996816 -0.012483 -0.702531 13.857533 3.813534
10 50 3 5 3 3 30 3 24 0 0 ... -2.170699 -0.012379 1.900214 14.435411 6.355890 -0.364884 -0.027055 0.455819 16.827351 1.271178
11 32 20 5 2 3 14 3 0 0 0 ... -0.309360 0.021801 -0.046182 3.499716 3.813534 0.205489 0.025721 0.900369 7.723744 3.813534
12 15 13 43 2 3 14 9 13 0 0 ... -1.558666 0.001382 3.041812 -3.430354 6.355890 -0.332851 -0.001156 -3.055479 20.151320 3.813534
14 41 23 19 5 3 13 5 21 0 0 ... -1.377017 0.020245 1.678514 9.761558 2.542356 -1.638675 -0.021474 -0.730287 12.687294 -1.271178
15 15 13 43 2 3 13 3 13 0 0 ... -1.530564 0.000197 3.105998 -1.746055 6.355890 -0.351794 -0.000613 -3.435102 19.017919 3.813534
16 23 3 26 0 3 13 11 17 0 0 ... -0.330472 -0.022255 0.298431 1.775172 6.355890 0.933991 -0.008615 0.151225 15.636658 2.542356
17 11 19 29 5 3 13 7 14 0 0 ... 1.155564 0.009838 -0.939427 -1.204039 3.813534 -1.017130 -0.000437 -1.578835 14.693384 0.000000
19 41 23 5 6 3 13 6 9 0 0 ... -0.514857 0.030249 0.177333 8.020388 2.542356 -0.576932 0.004293 0.626014 7.117652 1.271178
20 34 0 11 5 3 13 11 19 0 0 ... -0.221697 -0.018617 0.104881 8.835412 5.084712 0.211328 -0.014430 0.411417 10.055139 1.271178
21 11 19 29 5 3 13 7 18 0 0 ... 1.131660 0.009929 -0.934108 0.387623 3.813534 -1.058603 -0.009120 -1.521472 14.412868 -1.271178
22 15 13 43 2 3 13 3 7 0 0 ... -2.167571 0.002949 3.595895 -3.639964 5.084712 0.363775 0.010967 -2.657896 18.514652 5.084712
23 11 19 29 5 3 12 7 14 0 0 ... 1.156986 0.009881 -0.940505 -1.209123 3.813534 -1.016838 -0.000450 -1.585819 14.220430 0.000000
26 52 10 9 0 3 12 11 2 0 0 ... -0.296851 -0.006671 0.739267 -0.810420 7.627067 0.249610 0.022585 -1.393593 8.973012 3.813534
28 51 19 19 5 3 12 3 2 0 0 ... 0.502432 0.002563 0.054841 1.167066 5.084712 -0.146192 0.021908 -1.603956 8.719236 0.000000
29 11 19 29 5 3 12 7 14 0 0 ... 1.156986 0.009881 -0.940505 -1.209123 3.813534 -1.016838 -0.000450 -1.585819 14.220430 0.000000
33 11 19 29 5 3 12 7 14 0 0 ... 1.156986 0.009881 -0.940505 -1.209123 3.813534 -1.016838 -0.000450 -1.585819 14.220430 0.000000
35 52 23 9 4 3 12 9 12 0 0 ... 1.335471 0.017153 -0.982201 7.264861 5.084712 0.666863 -0.000245 -1.195631 7.681748 3.813534
41 11 19 29 5 3 16 7 14 0 0 ... 1.151297 0.009708 -0.936190 -1.188785 3.813534 -1.018005 -0.000395 -1.557885 16.112245 0.000000
42 51 3 9 2 3 16 6 10 0 0 ... 0.038288 -0.016736 -0.181836 4.278207 5.084712 -0.299314 0.005153 0.001296 10.184704 1.271178
43 15 13 43 2 3 16 9 22 0 0 ... -1.235619 0.001615 2.587240 -0.182639 5.084712 -0.817065 -0.020117 -2.864849 21.627612 1.271178
45 23 1 37 2 3 16 6 19 0 0 ... 2.861170 -0.021386 -2.835569 1.809420 5.084712 0.433416 -0.013857 -0.372506 15.526004 2.542356
46 23 0 19 3 3 16 11 7 0 0 ... 1.072904 -0.020223 -0.919104 -0.335273 2.542356 0.172243 0.011601 -0.174251 10.632803 1.271178
51 32 20 37 2 3 16 6 0 0 0 ... 0.638531 0.019319 -1.253382 -3.524436 3.813534 -0.571897 0.022759 1.167750 16.311535 3.813534
53 24 17 19 5 3 16 6 0 0 0 ... -1.824343 0.004396 2.122083 0.193996 5.084712 4.228805 0.025169 0.740460 11.686264 5.084712
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8361 22 23 11 2 3 1 6 12 0 0 ... 0.991896 0.024041 -1.136656 5.580933 3.813534 0.173490 0.000314 0.350638 5.620973 5.084712
8363 9 13 19 5 3 1 8 2 0 0 ... 0.037404 0.007784 0.108064 -1.361012 1.271178 -0.630004 0.022961 -0.619932 6.217948 1.271178
8364 46 3 37 0 3 1 9 22 0 0 ... 0.700086 -0.013676 -1.192935 1.464615 3.813534 -1.542641 -0.023987 0.477139 10.957880 2.542356
8365 52 3 5 0 3 1 3 4 0 0 ... -1.561792 -0.013687 1.263724 4.795447 6.355890 -0.023743 0.016696 0.545716 1.832626 1.271178
8366 30 10 19 0 3 1 11 9 0 0 ... -0.331556 -0.014091 0.453208 0.073163 5.084712 0.282212 0.007661 -0.451066 6.956368 -1.271178
8370 50 1 19 0 3 1 9 21 0 0 ... -0.017286 -0.019971 -0.008425 8.639244 6.355890 0.744845 -0.023673 0.558851 4.852846 6.355890
8372 12 16 5 5 3 1 8 6 0 0 ... 0.313459 0.014531 -0.318786 2.679686 5.084712 -1.129103 0.017213 -0.918784 1.901126 0.000000
8376 52 1 37 2 3 1 8 18 0 0 ... 2.067564 -0.015646 -2.055505 2.399907 5.084712 -0.392982 -0.016272 -0.266275 6.389545 1.271178
8377 50 16 45 2 3 1 9 7 0 0 ... -0.558295 -0.002543 1.058515 -3.736250 5.084712 -1.033771 0.008687 -1.165563 14.206997 0.000000
8379 50 19 19 5 3 1 3 5 0 0 ... -0.412253 0.007865 0.845536 4.883085 5.084712 -0.902703 0.012608 -1.710462 3.419406 1.271178
8380 45 3 37 0 3 1 3 15 0 0 ... 0.772512 -0.023266 -1.018588 1.522533 3.813534 -0.655265 -0.006393 0.116385 11.043013 1.271178
8381 10 9 19 5 3 1 6 16 0 0 ... 0.172892 0.002054 0.107133 3.417428 2.542356 -1.261781 -0.006692 -0.978471 7.339528 1.271178
8386 45 1 9 2 3 1 9 22 0 0 ... 2.744113 -0.015741 -2.501008 7.991507 5.084712 1.138809 -0.020907 -0.605984 1.514459 5.084712
8388 23 1 19 2 3 1 11 14 0 0 ... 1.068149 -0.019977 -0.950221 4.240683 5.084712 1.228884 -0.004383 0.077862 4.263758 1.271178
8389 32 17 11 2 3 1 9 7 0 0 ... 0.410450 0.019066 -0.740641 3.138669 5.084712 0.539227 0.008495 1.134056 4.244269 5.084712
8391 49 6 19 0 3 1 6 21 0 0 ... -1.198979 -0.008478 1.108685 8.682647 5.084712 -0.074491 -0.023994 0.638499 7.138845 2.542356
8394 52 23 0 1 3 1 9 1 0 0 ... 0.318295 0.012021 0.011588 6.271418 6.355890 0.079390 0.025691 -0.817155 1.029160 0.000000
8396 9 13 19 5 3 1 9 4 0 0 ... 0.032410 0.012235 -0.048793 0.018301 1.271178 -0.309524 0.016275 -0.391421 5.602780 2.542356
8398 23 19 37 5 3 1 6 15 0 0 ... 1.664172 0.004929 -1.326393 0.406803 3.813534 -0.737913 -0.003642 -1.796293 11.362108 1.271178
8400 10 6 19 0 3 1 9 0 0 0 ... -0.694680 -0.002265 0.566937 -4.391322 2.542356 -0.675452 0.025959 0.109204 6.112038 0.000000
8401 11 16 30 5 3 1 3 6 0 0 ... 1.294331 0.003090 -0.761852 -1.305413 6.355890 -0.996526 0.018464 -2.537947 8.559801 1.271178
8404 11 19 29 5 3 1 11 20 0 0 ... 1.313694 0.008450 -1.098121 -0.159546 2.542356 -1.093286 -0.013183 -1.251245 9.146901 0.000000
8407 41 13 19 5 3 1 3 0 0 0 ... -1.592752 0.010834 1.532786 3.464167 1.271178 -0.822397 0.021415 0.119642 4.496499 1.271178
8408 36 10 19 0 3 1 11 13 0 0 ... -0.728555 -0.010491 0.815011 1.528973 5.084712 -0.377640 -0.002526 -0.377508 7.341222 -1.271178
8409 36 16 44 3 3 1 9 11 0 0 ... 0.065513 -0.000182 0.531252 -4.071465 1.271178 -0.674307 0.001571 -1.229955 15.366136 0.000000
8410 9 9 19 5 3 1 9 4 0 0 ... -0.131410 0.006054 0.125723 0.040165 2.542356 -0.198499 0.016121 -0.368980 5.014935 2.542356
8411 46 1 9 3 3 1 9 24 0 0 ... 0.389208 -0.012173 -0.258061 11.596712 7.627067 0.950585 -0.027055 -0.160181 3.511625 6.355890
8413 51 23 19 5 3 1 3 22 0 0 ... -0.006041 0.008858 0.295984 11.141230 3.813534 -0.286020 -0.022914 -1.094539 5.732252 1.271178
8414 10 23 19 0 3 1 2 16 0 0 ... 0.992178 0.023309 -1.051221 4.692869 3.813534 -0.017100 -0.008829 -0.217748 6.462538 5.084712
8416 46 1 9 2 3 1 6 17 0 0 ... 2.301100 -0.011585 -1.999656 7.128865 3.813534 0.547094 -0.011092 -0.886460 1.149453 5.084712

4209 rows × 436 columns

