In [2]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import featurize_and_to_numpy
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_regressor


Populating the interactive namespace from numpy and matplotlib

In [4]:
# Load the augmented train and test sets through the project helper. The
# %time magic prints how long the load takes. What "augmented" adds to the raw
# data is defined in soln.dataset and is not visible from this notebook.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 7.48 s, sys: 144 ms, total: 7.63 s
Wall time: 7.72 s

In [5]:
# XGBoost booster parameters; handed to xgb.train in the cross-validation cell.
# NOTE(review): newer XGBoost releases rename 'reg:linear' to
# 'reg:squarederror' and replace 'silent' with 'verbosity' -- confirm against
# the installed version before upgrading.
params = {
    'objective': 'reg:linear',  # squared-error regression objective
    'eta': 0.02,  # learning rate (step-size shrinkage per boosting round)
    'min_child_weight': 6,  # minimum summed instance weight required in a child node
    'subsample': 0.7,  # fraction of training rows sampled for each tree
    'colsample_bytree': 0.6,  # fraction of feature columns sampled for each tree
    # NOTE(review): scale_pos_weight is documented by XGBoost as the
    # positive/negative class balance weight for unbalanced classification.
    # Its effect under a regression objective is unclear -- confirm it is
    # actually wanted here.
    'scale_pos_weight': 0.8,
    'silent': 1,  # suppress XGBoost's training log messages
    'max_depth': 8,  # maximum depth of each tree
    'max_delta_step': 2,  # cap on each tree's leaf weight update
}

In [9]:
featurizer = AllCategoricalsFeaturizer()
num_rounds = 1000

train_rmsles = []
test_rmsles = []

for i, split in enumerate(generate_xv_splits(aug_train_set)):
    print "---------------------- split {}".format(i)
    %time split_np = featurize_and_to_numpy(featurizer, *split)
    X_train_np, y_train_np, X_test_np, y_test_np = split_np
    xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
    xgtest = xgb.DMatrix(X_test_np)

    %time model = xgb.train(params.items(), xgtrain, num_rounds)
    %time y_train_pred = model.predict(xgtrain)
    train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
    %time y_test_pred = model.predict(xgtest)
    test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))

    print "train_rmsle {}; test_rmsle {}".format(train_rmsle, test_rmsle)
    train_rmsles.append(train_rmsle)
    test_rmsles.append(test_rmsle)

print
print "------------------------------ averages:".format(i)
print "    train RMSLE avg {} std {}".format(np.mean(train_rmsles), np.std(train_rmsles))
# print "    train RMSLEs: {}".format(train_rmsles)
print "    test RMSLE avg {} std {}".format(np.mean(test_rmsles), np.std(test_rmsles))
# print "    test RMSLEs: {}".format(test_rmsles)
print


---------------------- split 0
CPU times: user 2.44 s, sys: 392 ms, total: 2.84 s
Wall time: 2.87 s
CPU times: user 2min 9s, sys: 248 ms, total: 2min 10s
Wall time: 1min 17s
CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 5.34 ms
CPU times: user 700 ms, sys: 0 ns, total: 700 ms
Wall time: 476 ms
train_rmsle 0.131497068386; test_rmsle 0.224741115394
---------------------- split 1
CPU times: user 2.36 s, sys: 520 ms, total: 2.88 s
Wall time: 2.89 s
CPU times: user 2min 7s, sys: 340 ms, total: 2min 8s
Wall time: 1min 15s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 8.45 ms
CPU times: user 720 ms, sys: 0 ns, total: 720 ms
Wall time: 417 ms
train_rmsle 0.129737217496; test_rmsle 0.210744142982
---------------------- split 2
CPU times: user 2.43 s, sys: 428 ms, total: 2.86 s
Wall time: 2.89 s
CPU times: user 2min 17s, sys: 424 ms, total: 2min 17s
Wall time: 1min 29s
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 14.5 ms
CPU times: user 728 ms, sys: 0 ns, total: 728 ms
Wall time: 501 ms
train_rmsle 0.128698215106; test_rmsle 0.235402625192
---------------------- split 3
CPU times: user 2.38 s, sys: 532 ms, total: 2.91 s
Wall time: 2.98 s
CPU times: user 2min 9s, sys: 408 ms, total: 2min 10s
Wall time: 1min 17s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 10.1 ms
CPU times: user 736 ms, sys: 4 ms, total: 740 ms
Wall time: 464 ms
train_rmsle 0.129471366875; test_rmsle 0.212931804071
---------------------- split 4
CPU times: user 2.44 s, sys: 320 ms, total: 2.76 s
Wall time: 2.77 s
CPU times: user 2min 8s, sys: 376 ms, total: 2min 9s
Wall time: 1min 15s
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 25.1 ms
CPU times: user 700 ms, sys: 4 ms, total: 704 ms
Wall time: 403 ms
train_rmsle 0.129757517941; test_rmsle 0.22167281356
---------------------- split 5
CPU times: user 2.39 s, sys: 484 ms, total: 2.88 s
Wall time: 2.9 s
CPU times: user 2min 11s, sys: 348 ms, total: 2min 11s
Wall time: 1min 21s
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 19.1 ms
CPU times: user 688 ms, sys: 0 ns, total: 688 ms
Wall time: 410 ms
train_rmsle 0.131557352301; test_rmsle 0.196064770099
---------------------- split 6
CPU times: user 2.4 s, sys: 428 ms, total: 2.82 s
Wall time: 2.85 s
CPU times: user 2min 11s, sys: 248 ms, total: 2min 11s
Wall time: 1min 19s
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 33.1 ms
CPU times: user 720 ms, sys: 4 ms, total: 724 ms
Wall time: 451 ms
train_rmsle 0.129510982744; test_rmsle 0.250619980426
---------------------- split 7
CPU times: user 2.54 s, sys: 460 ms, total: 3 s
Wall time: 3.03 s
CPU times: user 2min 20s, sys: 508 ms, total: 2min 20s
Wall time: 1min 36s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 6.17 ms
CPU times: user 792 ms, sys: 4 ms, total: 796 ms
Wall time: 946 ms
train_rmsle 0.129149378055; test_rmsle 0.237027722482
---------------------- split 8
CPU times: user 2.51 s, sys: 464 ms, total: 2.97 s
Wall time: 3.31 s
CPU times: user 2min 16s, sys: 504 ms, total: 2min 16s
Wall time: 1min 27s
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 14.1 ms
CPU times: user 704 ms, sys: 0 ns, total: 704 ms
Wall time: 410 ms
train_rmsle 0.129724331773; test_rmsle 0.21963796411
---------------------- split 9
CPU times: user 2.38 s, sys: 496 ms, total: 2.88 s
Wall time: 2.91 s
CPU times: user 2min 8s, sys: 328 ms, total: 2min 8s
Wall time: 1min 14s
CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 41.7 ms
CPU times: user 788 ms, sys: 16 ms, total: 804 ms
Wall time: 692 ms
train_rmsle 0.13132345535; test_rmsle 0.223434909418

------------------------------ averages:
    train RMSLE avg 0.130042688603 std 0.00097675526429
    test RMSLE avg 0.223227784773 std 0.0145131427263


In [ ]: