Cross Validation with XGBoost


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
import xgboost

Load only the users with known destination


In [2]:
# Read the cached training-users table (path is relative to the notebook).
train_users = pd.read_csv(filepath_or_buffer='../cache/train_users.csv')

Replace NaN values with -1.


In [3]:
# Rebind instead of mutating in place: -1 acts as the sentinel for missing
# values, and the cell stays a plain assignment that is safe to re-run.
train_users = train_users.fillna(-1)

Select proper X and y. The labels should be encoded into integers to be usable by XGBoost:


In [4]:
# Target column to predict.
y_train = train_users['country_destination']

# Build the feature matrix from a copy without the target and the 'id' column.
# train_users itself is left untouched, so re-running this cell does not fail
# with a KeyError on columns that were already dropped.
x_train = train_users.drop(['country_destination', 'id'], axis=1).values

# Map the destination labels to integer class ids; keep the fitted encoder
# around so predictions can be mapped back with inverse_transform.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

To use XGBoost's native API, the data must be wrapped in a DMatrix, which the next command builds:


In [5]:
# Wrap the features and the integer-encoded labels in xgboost's DMatrix container.
train_data = xgboost.DMatrix(data=x_train, label=encoded_y_train)

To track the model's performance as training advances, we define the scoring function; for this competition it is NDCG@5:


In [6]:
def ndcg5_score(preds, dtrain, k=5):
    """Mean NDCG@k of class-probability predictions, usable as an xgboost ``feval``.

    Every sample has exactly one relevant class (its label), so the ideal DCG
    is 1 and the NDCG of a sample reduces to ``1 / log2(rank + 1)`` when the
    label is among the ``k`` highest-scored classes (rank is 1-based) and 0
    otherwise.

    Parameters
    ----------
    preds : array-like
        Per-class scores, either shaped ``(n_samples, n_classes)`` or as a
        flat array of ``n_samples * n_classes`` values in row-major order.
    dtrain : object exposing ``get_label()``
        Provides the true class index of every sample.
    k : int, optional
        Number of top-ranked classes taken into account (default 5).

    Returns
    -------
    tuple
        ``('ndcg<k>', score)``; with the default ``k`` the name is ``'ndcg5'``.
        For an empty evaluation set the score is ``0.0``.
    """
    metric_name = 'ndcg%d' % k
    labels = np.asarray(dtrain.get_label())
    n_samples = labels.shape[0]

    # Nothing to rank: avoid indexing into an empty ranking matrix.
    if n_samples == 0:
        return metric_name, 0.0

    # Accept flat predictions as well by restoring one row per sample.
    scores = np.asarray(preds).reshape(n_samples, -1)

    # Class indices ordered by decreasing score, truncated to the k best.
    top = np.argsort(scores, axis=1)[:, ::-1][:, :k]

    # hits[i, j] is 1 when the j-th ranked class of sample i is its true label.
    hits = (top == labels[:, np.newaxis]).astype(int)

    # Positional discount log2(rank + 1) for ranks 1..k.
    discounts = np.log2(np.arange(2, hits.shape[1] + 2))
    score = np.mean(np.sum(hits / discounts, axis=1))
    return metric_name, score

Finally, we set the model parameters and run the model with 10-fold cross-validation to check the reliability of the results:


In [7]:
# Booster parameters handed to the native xgboost API.
# NOTE(review): 'n_estimators' and 'missing' are scikit-learn-wrapper style
# arguments; with xgboost.cv the number of trees comes from num_boost_round,
# so these two keys are presumably ignored here -- confirm before relying on them.
param = {
    'max_depth': 10,
    'learning_rate': 1,
    'n_estimators': 5,
    'objective': 'multi:softprob',
    'num_class': 12,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'missing': None,
    'silent': True,
    'nthread': 4,
    'seed': 42
}

# Boosting rounds evaluated in every fold.
num_round = 10
# xgboost.cv defaults to 3 folds; request the intended 10 explicitly.
num_folds = 10

# Passing the seed to cv as well makes the fold assignment reproducible.
xgboost.cv(
    param,
    train_data,
    num_boost_round=num_round,
    nfold=num_folds,
    metrics=['mlogloss'],
    feval=ndcg5_score,
    seed=param['seed'],
)


Out[7]:
test-ndcg5-mean test-ndcg5-std train-ndcg5-mean train-ndcg5-std
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
5 0 0 0 0
6 0 0 0 0
7 0 0 0 0
8 0 0 0 0
9 0 0 0 0