In [1]:
%pylab inline

from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model

pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [2]:
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.5 s, sys: 188 ms, total: 13.7 s
Wall time: 14.2 s

In [75]:
from itertools import islice
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape


CPU times: user 68 ms, sys: 4 ms, total: 72 ms
Wall time: 73.1 ms
(27270, 53) (27270,) (2943, 53) (2943,)
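
generate_xv_splits evidently yields one (X_train, y_train, X_test, y_test) tuple per cross-validation fold, and islice just picks out fold 0. A quick way to sanity-check every fold (a sketch assuming only the tuple shape seen above):

In [ ]:
# Iterate over all folds instead of slicing out a single one.
for i, (X_tr, y_tr, X_te, y_te) in enumerate(generate_xv_splits(aug_train_set)):
    print i, X_tr.shape, X_te.shape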

In [76]:
# Layer 1: Everything.

layer1_params = {
    'objective': 'reg:linear',
    'silent': 1,
    'num_rounds': 1000,
    'gamma': 0.0,
    'eta': 0.02,
    'max_depth': 8,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

def layer1_get_indices(X):
    return np.ones(len(X), dtype=bool)
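
A minimal sketch of the contract train_model appears to follow, inferred from how it is called below; the featurizer fit/transform API and the handling of num_rounds are assumptions, and the real implementation lives in soln.utils.

In [ ]:
# Hypothetical sketch of train_model, NOT the soln.utils implementation:
# filter rows for this layer, featurize them, and fit an xgboost booster.
def sketch_train_model(params, get_indices, featurizer, X_train, y_train):
    params = dict(params)
    num_rounds = params.pop('num_rounds')     # assumed: stripped before xgb.train
    mask = get_indices(X_train)               # boolean row filter for this layer
    X_layer = X_train[mask]
    y_layer = y_train[mask]
    featurizer.fit(X_layer)                   # assumed featurizer API
    X_feats = featurizer.transform(X_layer)   # assumed featurizer API
    dtrain = xgb.DMatrix(X_feats, label=y_layer)
    booster = xgb.train(params, dtrain, num_boost_round=num_rounds)
    return {'model': booster, 'X_train': X_layer}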

In [77]:
layer1_featurizer = AllCategoricalsFeaturizer()
%time layer1 = train_model(layer1_params, layer1_get_indices, layer1_featurizer, X_train, y_train)


CPU times: user 2min 39s, sys: 884 ms, total: 2min 40s
Wall time: 1min 36s

In [78]:
layer1_train_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_train, y_train)
layer1_test_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_test, y_test)
print "Train on everything, test on everything:"
print layer1['X_train'].shape
print layer1_train_results['X_eval'].shape
print layer1_test_results['X_eval'].shape
print "train RMSLE", layer1_train_results['rmsle']
print "test RMSLE", layer1_test_results['rmsle']


Train on everything, test on everything:
(27270, 53)
(27270, 53)
(2943, 53)
train RMSLE 0.124960740984
test RMSLE 0.227403087285
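
The otherwise-unused mean_squared_error import suggests how eval_model likely scores: if y already holds log-transformed costs, RMSLE reduces to plain RMSE in log space. A hedged sketch (the log transform and the booster/feature inputs are assumptions, not confirmed by soln.utils):

In [ ]:
# Hypothetical: RMSLE as plain RMSE over log-space targets and predictions.
def sketch_rmsle(booster, X_feats, y_log):
    preds = booster.predict(xgb.DMatrix(X_feats))
    return np.sqrt(mean_squared_error(y_log, preds))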

In [79]:
# TODO: infrastructure for storing the trained layer1 model for each fold, instead of retraining it every time.

In [80]:
# New layer: supplier S-0041 only (same xgboost params as layer 1).

new_layer_params = {
    'objective': 'reg:linear',
    'silent': 1,
    'num_rounds': 1000,
    'gamma': 0.0,
    'eta': 0.02,
    'max_depth': 8,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

def new_layer_get_indices(X):
    return (X.supplier == 'S-0041')

In [81]:
new_layer_featurizer = AllCategoricalsFeaturizer()
%time new_layer = train_model(new_layer_params, new_layer_get_indices, new_layer_featurizer, X_train, y_train)


CPU times: user 18.9 s, sys: 76 ms, total: 18.9 s
Wall time: 11 s

In [82]:
print "Train on everything, test on new layer:"
tmp = eval_model(layer1['model'], new_layer_get_indices, layer1_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']


Train on everything, test on new layer:
(331, 53)
test RMSLE 0.158136060823

In [83]:
new_layer_train_results = eval_model(new_layer['model'], new_layer_get_indices, new_layer_featurizer, X_train, y_train)
new_layer_test_results = eval_model(new_layer['model'], new_layer_get_indices, new_layer_featurizer, X_test, y_test)
print "Train on new layer, test on new layer:"
print new_layer['X_train'].shape
print new_layer_train_results['X_eval'].shape
print new_layer_test_results['X_eval'].shape
print "train RMSLE", new_layer_train_results['rmsle']
print "test RMSLE", new_layer_test_results['rmsle']


Train on new layer, test on new layer:
(2992, 53)
(2992, 53)
(331, 53)
train RMSLE 0.0480268262385
test RMSLE 0.134651303418
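
The specialist beats the generalist on its own slice (0.135 vs 0.158 test RMSLE), which is the whole point of layering. A sketch of how the blend could be scored end to end, assuming the eval_model result dicts expose per-row predictions under a hypothetical 'y_pred' key aligned with the masks (not shown above):

In [ ]:
# Hypothetical blend: specialist predictions for S-0041 rows, generalist
# predictions everywhere else, scored as one test set.
mask = new_layer_get_indices(X_test).values          # boolean mask over test rows
blended = layer1_test_results['y_pred'].copy()       # 'y_pred' is assumed, not confirmed
blended[mask] = new_layer_test_results['y_pred']
print "blended test RMSLE", np.sqrt(mean_squared_error(y_test, blended))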

In [48]:
aug_test_set.supplier.value_counts(normalize=True)


Out[48]:
S-0066    0.679577
S-0041    0.103456
S-0072    0.077261
S-0054    0.031784
S-0026    0.025930
S-0013    0.020241
S-0058    0.017298
S-0064    0.014685
S-0062    0.007673
S-0014    0.004101
S-0030    0.003737
S-0104    0.002712
S-0081    0.001654
S-0105    0.001191
S-0042    0.001091
S-0059    0.000827
S-0005    0.000661
S-0070    0.000628
S-0031    0.000562
S-0008    0.000463
S-0080    0.000397
S-0027    0.000364
S-0074    0.000331
S-0088    0.000298
S-0043    0.000232
S-0056    0.000232
S-0018    0.000198
S-0073    0.000198
S-0092    0.000198
S-0060    0.000198
S-0011    0.000165
S-0033    0.000165
S-0050    0.000132
S-0009    0.000132
S-0051    0.000132
S-0015    0.000132
S-0090    0.000099
S-0012    0.000099
S-0061    0.000099
S-0007    0.000066
S-0004    0.000066
S-0040    0.000066
S-0108    0.000033
S-0077    0.000033
S-0069    0.000033
S-0091    0.000033
S-0076    0.000033
S-0078    0.000033
S-0087    0.000033
S-0006    0.000033
S-0028    0.000033
S-0023    0.000033
S-0039    0.000033
S-0036    0.000033
S-0046    0.000033
S-0068    0.000033
dtype: float64
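
S-0041 is the second most common supplier in the test set at about 10%, which is why it makes a reasonable first specialist layer. A sketch for shortlisting further candidates (the 1% cutoff is illustrative, not tuned):

In [ ]:
# Suppliers frequent enough in the test set to justify their own layer.
freqs = aug_test_set.supplier.value_counts(normalize=True)
print freqs[freqs > 0.01].index.tolist()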

In [ ]: