In [1]:
%pylab inline

from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln import expert_params
from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.experts import get_predictions
from soln.experts import train_and_save_expert
from soln.experts import xv_eval_experts
from soln.utils import eval_model
from soln.utils import train_model

pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Timed load of the feature-augmented train/test DataFrames (~14 s per the
# captured output below).
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.5 s, sys: 184 ms, total: 13.6 s
Wall time: 13.8 s

In [3]:
from itertools import islice
# Take the fold_number-th split from the cross-validation generator without
# materializing the earlier splits.
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape


CPU times: user 112 ms, sys: 24 ms, total: 136 ms
Wall time: 138 ms
(27270, 53) (27270,) (2943, 53) (2943,)

In [4]:
# Baseline: Train a single model on everything.

baseline_params = {
    'objective': 'reg:linear',  # squared-error regression objective
    'silent': 1,                # suppress xgboost per-round logging
    'num_rounds': 1000,         # not a native xgboost key; presumably consumed by train_model — TODO confirm
    'gamma': 0.0,               # min loss reduction required to split
    'eta': 0.02,                # learning rate (low, paired with many rounds)
    'max_depth': 8,
    'min_child_weight': 6,
    'subsample': 0.7,           # row subsample ratio per tree
    'colsample_bytree': 0.6,    # feature subsample ratio per tree
}

def all_get_indices(X):
    """Row selector for train_model/eval_model that keeps every row of X.

    Returns a boolean mask of length len(X) with every entry True.
    """
    return np.full(len(X), True, dtype=bool)

# Fit the categorical featurizer and train the single baseline model on the
# full training fold (all rows selected via all_get_indices).
baseline_featurizer = AllCategoricalsFeaturizer()
%time baseline = train_model(baseline_params, all_get_indices, baseline_featurizer, X_train, y_train)


CPU times: user 2min 42s, sys: 1.76 s, total: 2min 43s
Wall time: 1min 42s

In [5]:
# Evaluate the baseline on the training fold (fit quality) and on the
# held-out fold (generalization).
baseline_train_results = eval_model(baseline['model'], all_get_indices, baseline_featurizer, X_train, y_train)
baseline_test_results = eval_model(baseline['model'], all_get_indices, baseline_featurizer, X_test, y_test)
print "Train on everything, test on everything:"
# Shape prints are a sanity check that the selector kept every row.
print baseline['X_train'].shape
print baseline_train_results['X_eval'].shape
print baseline_test_results['X_eval'].shape
print "train RMSLE", baseline_train_results['rmsle']
print "test RMSLE", baseline_test_results['rmsle']


Train on everything, test on everything:
(27270, 53)
(27270, 53)
(2943, 53)
train RMSLE 0.124960740984
test RMSLE 0.227403087285

In [79]:
# Bagging:

bag_params = {
    'objective': 'reg:linear',
    'silent': 1,
    'num_rounds': 1000,
    'gamma': 0.0,
    'eta': 0.02,
    'max_depth': 8,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

all_taids = np.unique(X_train.tube_assembly_id.values)
print "X_train has {} rows and {} unique taids".format(len(X_train), len(all_taids))

n_bags = 9
bags = []
for i in xrange(n_bags):
    print "----- bag {}:".format(i)
    
    n_bag_taids = 0.9 * len(all_taids)
    bag_taids = np.random.choice(all_taids, size=n_bag_taids, replace=False)
    unique_bag_taids = np.unique(bag_taids)
    bag_is = X_train.tube_assembly_id.isin(bag_taids)
    bag_X_train = X_train[bag_is].reset_index(drop=True)
    bag_y_train = y_train[bag_is].reset_index(drop=True)
    print "this bag has {} rows ({} of all) and {} ({} of all) unique taids".format(
        len(bag_X_train), 1.0 * len(bag_X_train) / len(X_train),
        len(unique_bag_taids), 1.0 * len(unique_bag_taids) / len(all_taids))

    featurizer = AllCategoricalsFeaturizer()
    %time bag = train_model(bag_params, all_get_indices, featurizer, bag_X_train, bag_y_train)

    train_results = eval_model(bag['model'], all_get_indices, featurizer, bag_X_train, bag_y_train)
    test_results = eval_model(bag['model'], all_get_indices, featurizer, X_test, y_test)
    print "train RMSLE", train_results['rmsle']
    print "test RMSLE", test_results['rmsle']

    store_bag = {
        'taids': bag_taids,
        'is': bag_is,
        'featurizer': featurizer,
        'model': bag['model'],
        'train_results': train_results,
        'test_results': test_results,
    }
    bags.append(store_bag)


X_train has 27270 rows and 7960 unique taids
----- bag 0:
this bag has 24486 rows (0.897909790979 of all) and 7164 (0.9 of all) unique taids
CPU times: user 3min 5s, sys: 1.08 s, total: 3min 6s
Wall time: 2min 24s
train RMSLE 0.123432578577
test RMSLE 0.229418813724
----- bag 1:
this bag has 24586 rows (0.901576824349 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 55s, sys: 988 ms, total: 2min 56s
Wall time: 2min 10s
train RMSLE 0.122111587117
test RMSLE 0.227541088433
----- bag 2:
this bag has 24717 rows (0.906380638064 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 32s, sys: 900 ms, total: 2min 33s
Wall time: 1min 37s
train RMSLE 0.121361805697
test RMSLE 0.229519295813
----- bag 3:
this bag has 24601 rows (0.902126879355 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 31s, sys: 840 ms, total: 2min 32s
Wall time: 1min 35s
train RMSLE 0.12122519513
test RMSLE 0.228024319671
----- bag 4:
this bag has 24556 rows (0.900476714338 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 38s, sys: 680 ms, total: 2min 39s
Wall time: 1min 48s
train RMSLE 0.123122970152
test RMSLE 0.225918550374
----- bag 5:
this bag has 24489 rows (0.89801980198 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 21s, sys: 736 ms, total: 2min 21s
Wall time: 1min 19s
train RMSLE 0.120961715396
test RMSLE 0.23123137468
----- bag 6:
this bag has 24612 rows (0.902530253025 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 20s, sys: 640 ms, total: 2min 21s
Wall time: 1min 18s
train RMSLE 0.121177518095
test RMSLE 0.230244874745
----- bag 7:
this bag has 24573 rows (0.901100110011 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 19s, sys: 692 ms, total: 2min 20s
Wall time: 1min 16s
train RMSLE 0.120974039334
test RMSLE 0.227374436674
----- bag 8:
this bag has 24593 rows (0.901833516685 of all) and 7164 (0.9 of all) unique taids
CPU times: user 2min 18s, sys: 652 ms, total: 2min 19s
Wall time: 1min 16s
train RMSLE 0.1206663763
test RMSLE 0.22666532693

In [80]:
print "best bag RMSLE:", np.min([bag['test_results']['rmsle'] for bag in bags])
print "worst bag RMSLE:", np.max([bag['test_results']['rmsle'] for bag in bags])
print "mean bag RMSLE:", np.mean([bag['test_results']['rmsle'] for bag in bags])
print "median bag RMSLE:", np.median([bag['test_results']['rmsle'] for bag in bags])


best bag RMSLE: 0.225918550374
worst bag RMSLE: 0.23123137468
mean bag RMSLE: 0.22843756456
median bag RMSLE: 0.228024319671

In [81]:
y_pred_all = np.vstack([bag['test_results']['y_eval_pred'].T for bag in bags]).T
y_pred_avg = np.mean(y_pred_all, axis=1)
y_pred_median = np.median(y_pred_all, axis=1)
print "mean-combined RMSLE:", np.sqrt(mean_squared_error(y_test, y_pred_avg))
print "median-combined RMSLE:", np.sqrt(mean_squared_error(y_test, y_pred_median))


mean-combined RMSLE: 0.225020920942
median-combined RMSLE: 0.22524214179

In [ ]: