In [1]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import featurize_and_to_numpy
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_regressor


Populating the interactive namespace from numpy and matplotlib

In [2]:
regressors = [
    # DummyRegressor(strategy='constant', constant=0.0),
    # DummyRegressor(strategy='mean'),
    # RandomForestRegressor(n_estimators=20),
    RandomForestRegressor(n_estimators=20, max_features=0.4),
    # RandomForestRegressor(n_estimators=100),
]

In [3]:
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 7.53 s, sys: 136 ms, total: 7.66 s
Wall time: 7.85 s

In [4]:
featurizer = AllCategoricalsFeaturizer()

train_rmsles = []
test_rmsles = []
for reg in regressors:
    train_rmsles.append([])
    test_rmsles.append([])

for i, split in enumerate(generate_xv_splits(aug_train_set)):
    print "---------------------- split {}".format(i)
    %time split_np = featurize_and_to_numpy(featurizer, *split)

    for reg_id, reg in enumerate(regressors):
        %time train_rmsle, test_rmsle = eval_regressor(reg, *split_np)
        print "reg_id {}: train_rmsle {}; test_rmsle {}".format(reg_id, train_rmsle, test_rmsle)
        train_rmsles[reg_id].append(train_rmsle)
        test_rmsles[reg_id].append(test_rmsle)

print
print "------------------------------ averages".format(i)

for reg_id, reg in enumerate(regressors):
    print "{}:".format(reg)
    print "    train RMSLE avg {} std {}".format(np.mean(train_rmsles[reg_id]), np.std(train_rmsles[reg_id]))
    print "    train RMSLEs: {}".format(train_rmsles)
    print "    test RMSLE avg {} std {}".format(np.mean(test_rmsles[reg_id]), np.std(test_rmsles[reg_id]))
    print "    test RMSLEs: {}".format(test_rmsles)
    print


---------------------- split 0
CPU times: user 2.5 s, sys: 576 ms, total: 3.08 s
Wall time: 3.34 s
CPU times: user 24.2 s, sys: 56 ms, total: 24.3 s
Wall time: 24.6 s
reg_id 0: train_rmsle 0.0871158415041; test_rmsle 0.259160235699
---------------------- split 1
CPU times: user 2.44 s, sys: 504 ms, total: 2.95 s
Wall time: 3 s
CPU times: user 23.5 s, sys: 128 ms, total: 23.7 s
Wall time: 23.8 s
reg_id 0: train_rmsle 0.0865913981979; test_rmsle 0.240741781859
---------------------- split 2
CPU times: user 2.42 s, sys: 468 ms, total: 2.89 s
Wall time: 2.93 s
CPU times: user 24.5 s, sys: 116 ms, total: 24.6 s
Wall time: 25.3 s
reg_id 0: train_rmsle 0.087197583272; test_rmsle 0.255161897852
---------------------- split 3
CPU times: user 2.38 s, sys: 476 ms, total: 2.86 s
Wall time: 2.88 s
CPU times: user 24.1 s, sys: 112 ms, total: 24.2 s
Wall time: 24.4 s
reg_id 0: train_rmsle 0.0878039033572; test_rmsle 0.226122173927
---------------------- split 4
CPU times: user 2.43 s, sys: 496 ms, total: 2.93 s
Wall time: 2.95 s
CPU times: user 24.4 s, sys: 140 ms, total: 24.6 s
Wall time: 24.8 s
reg_id 0: train_rmsle 0.0873579932669; test_rmsle 0.25039548187
---------------------- split 5
CPU times: user 2.49 s, sys: 480 ms, total: 2.97 s
Wall time: 3.01 s
CPU times: user 25 s, sys: 164 ms, total: 25.2 s
Wall time: 27.9 s
reg_id 0: train_rmsle 0.0893027975119; test_rmsle 0.235225218159
---------------------- split 6
CPU times: user 2.5 s, sys: 528 ms, total: 3.03 s
Wall time: 3.49 s
CPU times: user 24.8 s, sys: 168 ms, total: 24.9 s
Wall time: 30.1 s
reg_id 0: train_rmsle 0.087852783991; test_rmsle 0.278991961761
---------------------- split 7
CPU times: user 2.63 s, sys: 548 ms, total: 3.18 s
Wall time: 3.9 s
CPU times: user 26.1 s, sys: 56 ms, total: 26.2 s
Wall time: 32.4 s
reg_id 0: train_rmsle 0.0890031524975; test_rmsle 0.268045953555
---------------------- split 8
CPU times: user 2.5 s, sys: 476 ms, total: 2.97 s
Wall time: 4.43 s
CPU times: user 25.4 s, sys: 148 ms, total: 25.6 s
Wall time: 31.3 s
reg_id 0: train_rmsle 0.08813050125; test_rmsle 0.242321527626
---------------------- split 9
CPU times: user 2.59 s, sys: 512 ms, total: 3.1 s
Wall time: 3.86 s
CPU times: user 25 s, sys: 160 ms, total: 25.1 s
Wall time: 29.4 s
reg_id 0: train_rmsle 0.0899060312205; test_rmsle 0.250150514488

------------------------------ averages
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.4, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE avg 0.0880261986069 std 0.0010113745691
    train RMSLEs: [[0.08711584150406966, 0.086591398197862574, 0.087197583271980683, 0.087803903357209584, 0.087357993266905229, 0.089302797511880444, 0.087852783991036118, 0.089003152497464127, 0.088130501249974263, 0.089906031220546884]]
    test RMSLE avg 0.25063167468 std 0.0148527863482
    test RMSLEs: [[0.25916023569929952, 0.24074178185872358, 0.25516189785212834, 0.22612217392713832, 0.25039548186967869, 0.23522521815930697, 0.27899196176142049, 0.2680459535546808, 0.24232152762602738, 0.25015051448809256]]


In [26]:
# Same thing with n_estimators=100


---------------------- split 0
CPU times: user 1.45 s, sys: 220 ms, total: 1.67 s
Wall time: 1.7 s
CPU times: user 3min 22s, sys: 156 ms, total: 3min 22s
Wall time: 3min 23s
reg_id 0: train_rmsle 0.0832996589187; test_rmsle 0.29448884641
---------------------- split 1
CPU times: user 1.44 s, sys: 388 ms, total: 1.83 s
Wall time: 1.85 s
CPU times: user 4min 3s, sys: 180 ms, total: 4min 3s
Wall time: 4min 6s
reg_id 0: train_rmsle 0.083701706416; test_rmsle 0.257979085022
---------------------- split 2
CPU times: user 1.44 s, sys: 324 ms, total: 1.77 s
Wall time: 1.79 s
CPU times: user 3min 25s, sys: 204 ms, total: 3min 25s
Wall time: 3min 28s
reg_id 0: train_rmsle 0.0836523980288; test_rmsle 0.267516413845
---------------------- split 3
CPU times: user 1.46 s, sys: 412 ms, total: 1.87 s
Wall time: 1.9 s
CPU times: user 4min, sys: 96 ms, total: 4min
Wall time: 4min 3s
reg_id 0: train_rmsle 0.0842293078426; test_rmsle 0.275240917052
---------------------- split 4
CPU times: user 1.4 s, sys: 352 ms, total: 1.76 s
Wall time: 1.78 s
CPU times: user 3min 37s, sys: 144 ms, total: 3min 37s
Wall time: 3min 39s
reg_id 0: train_rmsle 0.0830748472724; test_rmsle 0.288745540672
---------------------- split 5
CPU times: user 1.46 s, sys: 440 ms, total: 1.9 s
Wall time: 1.95 s
CPU times: user 3min 44s, sys: 76 ms, total: 3min 44s
Wall time: 3min 47s
reg_id 0: train_rmsle 0.0820464168436; test_rmsle 0.316766697986
---------------------- split 6
CPU times: user 1.5 s, sys: 372 ms, total: 1.87 s
Wall time: 1.89 s
CPU times: user 3min 51s, sys: 260 ms, total: 3min 51s
Wall time: 3min 59s
reg_id 0: train_rmsle 0.0834378693627; test_rmsle 0.303055548076
---------------------- split 7
CPU times: user 1.5 s, sys: 508 ms, total: 2 s
Wall time: 2.13 s
CPU times: user 3min 54s, sys: 176 ms, total: 3min 54s
Wall time: 3min 57s
reg_id 0: train_rmsle 0.0827663433699; test_rmsle 0.38010391987
---------------------- split 8
CPU times: user 1.4 s, sys: 492 ms, total: 1.9 s
Wall time: 1.93 s
CPU times: user 3min 44s, sys: 468 ms, total: 3min 45s
Wall time: 4min 13s
reg_id 0: train_rmsle 0.0840000465362; test_rmsle 0.286268726809
---------------------- split 9
CPU times: user 1.5 s, sys: 260 ms, total: 1.76 s
Wall time: 2.07 s
CPU times: user 3min 18s, sys: 596 ms, total: 3min 19s
Wall time: 3min 33s
reg_id 0: train_rmsle 0.086768786033; test_rmsle 0.2658708101

------------------------------ averages
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE avg 0.0836977380624 std 0.00118387978577
    train RMSLEs: [[0.083299658918725816, 0.08370170641603461, 0.083652398028790556, 0.084229307842565784, 0.083074847272394808, 0.082046416843627554, 0.083437869362724071, 0.082766343369909984, 0.08400004653618591, 0.086768786033034423]]
    test RMSLE avg 0.293603650584 std 0.0335295416675
    test RMSLEs: [[0.29448884640989287, 0.25797908502213301, 0.26751641384485253, 0.2752409170522625, 0.28874554067246933, 0.31676669798603435, 0.30305554807646173, 0.3801039198699897, 0.28626872680932641, 0.26587081009955721]]