Demand prediction baseline solution

Victor Kantor, xead.wl@gmail.com


In [1]:
import pandas as pd
from sklearn import model_selection, metrics
import numpy as np

train = pd.read_csv("train.tsv")
test = pd.read_csv("test.tsv")
sample_submission = pd.read_csv("sample_submission.tsv")

In [2]:
def score_func(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return (np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))).mean() * 200.
    return s

scorer = metrics.make_scorer(score_func=score_func, greater_is_better=False)

In [61]:
frac = 1. # fraction of learning examples used for model fitting

# sample the train set if your don't want to deel with all examples
train = train.sample(frac=frac, random_state=42)

X = train.drop(['Num','y'], axis=1)
y = train['y']
print len(X), len(y)


36229 36229

In [62]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
#model = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=43)
#model = xgboost.XGBRegressor(n_estimators=100, max_depth=3)
model = best
model.fit(X, y)

preds = model.predict(test.drop(['Num'], axis=1))

print len(preds)
print len(sample_submission)


2016
2016
CPU times: user 2min 31s, sys: 584 ms, total: 2min 31s
Wall time: 41.2 s

In [63]:
sample_submission['y'] = preds

In [64]:
sample_submission.head(5)


Out[64]:
Num y
0 348622 2135.611572
1 348623 34481.781250
2 348624 341513.687500
3 348625 35057.746094
4 348626 253.368301

In [65]:
# In GBM you can get some negative predictions:
print sample_submission[sample_submission['y'] < 0]


         Num          y
203   348825 -13.050495
654   349947 -86.815025
1106  350619 -42.518654

In [66]:
sample_submission['y'] = sample_submission['y'].map(lambda x: x if x > 0 else 0.0)

In [67]:
sample_submission.to_csv("baseline_submission.csv", sep=',', index=False)

In [167]:
model_selection.cross_val_score(model, X, y, scoring=scorer)


Out[167]:
array([-51.53960397, -53.21808074, -52.58545046])

In [160]:
def find(params):
    model = xgboost.XGBRegressor(n_estimators=params[0], max_depth=params[1])
    return model_selection.cross_val_score(model, X, y).mean()

In [54]:
params = []
params.append({
    'n_estimators': [112, 114, 116],
    'learning_rate': [0.1]
})

params2 = {
    'n_estimators' : [115],
    'learning_rate': [0.1],
    'max_depth': [2, 6, 10, 14, 18],
    'min_child_weight' : [1, 3, 5]
}

params3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}

params4 = {
    'gamma': [0.3],
    'n_estimators' : [115],
    'learning_rate': [0.1],
    'max_depth': [18],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}

In [55]:
a = model_selection.GridSearchCV(xgboost.XGBRegressor(silent=False), params4, 
                                 scoring=scorer, cv=model_selection.TimeSeriesSplit(), 
                                 fit_params={'eval_metric' : 'mae'})

In [56]:
%%time
a.fit(X, y)


CPU times: user 1h 51min 44s, sys: 45.4 s, total: 1h 52min 29s
Wall time: 32min 16s
Out[56]:
GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1),
       fit_params={'eval_metric': 'mae'}, iid=True, n_jobs=1,
       param_grid={'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'learning_rate': [0.1], 'n_estimators': [115], 'subsample': [0.6, 0.7, 0.8, 0.9], 'max_depth': [18], 'gamma': [0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(score_func, greater_is_better=False), verbose=0)

In [57]:
a.best_params_


Out[57]:
{'colsample_bytree': 0.9,
 'gamma': 0.3,
 'learning_rate': 0.1,
 'max_depth': 18,
 'n_estimators': 115,
 'subsample': 0.9}

In [45]:
z = dict(a.best_params_, **params3)

In [58]:
best = xgboost.XGBRegressor(**a.best_params_)

In [59]:
best


Out[59]:
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0.3, learning_rate=0.1, max_delta_step=0, max_depth=18,
       min_child_weight=1, missing=None, n_estimators=115, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9)

In [43]:
z = dict(params3, **params2)

In [44]:
z


Out[44]:
{'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
 'learning_rate': [0.1],
 'max_depth': [2, 6, 10, 14, 18],
 'min_child_weight': [1, 3, 5],
 'n_estimators': [115]}

In [ ]: