In [9]:
%pylab inline

from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model

# Lift pandas' column display limit so wide DataFrames are shown with every column.
pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [25]:
# Obtain the augmented train and test sets from soln.dataset; %time reports the cost of the call.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.7 s, sys: 92 ms, total: 13.8 s
Wall time: 14.5 s

In [162]:
from itertools import islice
# Zero-based position of the split to take from generate_xv_splits.
fold_number = 0
# islice skips the first `fold_number` splits; next() then pulls exactly one
# (X_train, y_train, X_test, y_test) tuple.
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
# Sanity check: shapes of the four pieces of the chosen split.
print X_train.shape, y_train.shape, X_test.shape, y_test.shape


CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 76.7 ms
(27270, 53) (27270,) (2943, 53) (2943,)

In [163]:
# Layer 1: Everything.

# Settings handed to train_model for layer 1. The key names follow xgboost's
# parameter naming; presumably train_model forwards them to xgboost -- confirm
# in soln.utils.
layer1_params = {
    # Learning task and logging.
    'objective': 'reg:linear',
    'silent': 1,

    # Boosting schedule ('num_rounds' is presumably consumed by train_model
    # as the number of boosting rounds -- confirm).
    'num_rounds': 1000,
    'eta': 0.02,

    # Tree-growth constraints.
    'max_depth': 8,
    'min_child_weight': 6,
    'gamma': 0.0,

    # Row / column subsampling.
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

def layer1_get_indices(X):
    """Return an all-True boolean mask with one entry per row of ``X``.

    Layer 1 works on everything, so no row is filtered out.
    """
    row_count = len(X)
    return np.ones((row_count,), dtype=bool)

In [164]:
# Featurizer instance passed to both train_model and eval_model for layer 1.
layer1_featurizer = AllCategoricalsFeaturizer()
# Train layer 1 on the rows selected by layer1_get_indices (all of them).
# Later cells read layer1['model'] and layer1['X_train'] from the result.
%time layer1 = train_model(layer1_params, layer1_get_indices, layer1_featurizer, X_train, y_train)


CPU times: user 2min 49s, sys: 1.08 s, total: 2min 50s
Wall time: 1min 55s

In [165]:
# Evaluate the layer 1 model on the training fold and on the held-out fold,
# using the same row selector (all rows) and featurizer it was trained with.
layer1_train_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_train, y_train)
layer1_test_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_test, y_test)
print "Train on everything, test on everything:"
# Shapes of the training matrix and of the two evaluated subsets.
print layer1['X_train'].shape
print layer1_train_results['X_eval'].shape
print layer1_test_results['X_eval'].shape
# Error as reported by eval_model under its 'rmsle' key.
print "train RMSLE", layer1_train_results['rmsle']
print "test RMSLE", layer1_test_results['rmsle']


Train on everything, test on everything:
(27270, 53)
(27270, 53)
(2943, 53)
train RMSLE 0.124960740984
test RMSLE 0.227403087285

In [166]:
# Layer 2: Uncommon brackets.

# Settings handed to train_model for layer 2 (same values as layer 1's).
# The key names follow xgboost's parameter naming; presumably train_model
# forwards them to xgboost -- confirm in soln.utils.
layer2_params = {
    # Learning task and logging.
    'objective': 'reg:linear',
    'silent': 1,

    # Boosting schedule ('num_rounds' is presumably consumed by train_model
    # as the number of boosting rounds -- confirm).
    'num_rounds': 1000,
    'eta': 0.02,

    # Tree-growth constraints.
    'max_depth': 8,
    'min_child_weight': 6,
    'gamma': 0.0,

    # Row / column subsampling.
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

# Bracketing patterns treated as "common". layer2_get_indices selects every row
# whose bracketing_pattern is NOT one of these tuples. The empty tuple () is not
# listed, so empty-pattern rows count as uncommon.
common_brackets = [
    (1, 2, 5, 10, 25, 50, 100, 250),
    (1, 6, 20),
    (1, 2, 3, 5, 10, 20),
    (1, 2, 5, 10, 25, 50, 100),
    (5, 19, 20),
]

def layer2_get_indices(X):
    """Return a boolean mask selecting rows with an uncommon bracketing pattern.

    A row is selected when its ``bracketing_pattern`` value is not a member of
    the module-level ``common_brackets`` list.
    """
    has_common_pattern = X.bracketing_pattern.isin(common_brackets)
    return ~has_common_pattern

In [167]:
# Separate featurizer instance for layer 2 (not shared with layer 1).
layer2_featurizer = AllCategoricalsFeaturizer()
# Train layer 2 only on the rows layer2_get_indices selects (uncommon bracketing patterns).
%time layer2 = train_model(layer2_params, layer2_get_indices, layer2_featurizer, X_train, y_train)


CPU times: user 1min 3s, sys: 204 ms, total: 1min 3s
Wall time: 45.9 s

In [168]:
# Baseline for layer 2: score the layer 1 model (trained on all rows) on just
# the uncommon-bracket rows of the test fold.
print "Train on everything, test on uncommon brackets:"
# `tmp` is a scratch name that later cells reassign.
tmp = eval_model(layer1['model'], layer2_get_indices, layer1_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']


Train on everything, test on uncommon brackets:
(987, 53)
test RMSLE 0.369369099906

In [169]:
# Evaluate the layer 2 model on the uncommon-bracket rows of the training fold
# and of the held-out fold.
layer2_train_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_train, y_train)
layer2_test_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_test, y_test)
print "Train on uncommon brackets, test on uncommon brackets:"
# Shapes of the training matrix and of the two evaluated subsets.
print layer2['X_train'].shape
print layer2_train_results['X_eval'].shape
print layer2_test_results['X_eval'].shape
# Error as reported by eval_model under its 'rmsle' key.
print "train RMSLE", layer2_train_results['rmsle']
print "test RMSLE", layer2_test_results['rmsle']


Train on uncommon brackets, test on uncommon brackets:
(8221, 53)
(8221, 53)
(987, 53)
train RMSLE 0.163275589812
test RMSLE 0.345135857409

In [170]:
# Start from layer 1's test-fold predictions; copy=True requests an independent copy.
y_test_pred = pd.Series(layer1_test_results['y_eval_pred'], copy=True)
# Replace the entries selected by layer 2's 'eval_is' with layer 2's predictions.
# Presumably 'eval_is' is the boolean row selector eval_model applied -- confirm in soln.utils.
y_test_pred[layer2_test_results['eval_is']] = layer2_test_results['y_eval_pred']
# NOTE(review): this is a plain root-mean-squared error; it equals an RMSLE only
# if y_test and the predictions are already log-transformed upstream -- confirm.
rmsle = np.sqrt(mean_squared_error(y_test.values, y_test_pred.values))
print "Layer 1 and layer 2 together:"
print y_test_pred.shape
print "test RMSLE", rmsle


Layer 1 and layer 2 together:
(2943,)
test RMSLE 0.214255159159

In [171]:
# Layer 3: Empty bracket.

# Settings handed to train_model for layer 3 (same values as layers 1 and 2).
# The key names follow xgboost's parameter naming; presumably train_model
# forwards them to xgboost -- confirm in soln.utils.
layer3_params = {
    # Learning task and logging.
    'objective': 'reg:linear',
    'silent': 1,

    # Boosting schedule ('num_rounds' is presumably consumed by train_model
    # as the number of boosting rounds -- confirm).
    'num_rounds': 1000,
    'eta': 0.02,

    # Tree-growth constraints.
    'max_depth': 8,
    'min_child_weight': 6,
    'gamma': 0.0,

    # Row / column subsampling.
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

def layer3_get_indices(X):
    """Return a boolean mask selecting rows with an empty bracketing pattern.

    A row is selected when its ``bracketing_pattern`` value compares equal to
    the empty tuple ``()``.
    """
    empty_pattern = ()
    has_empty_pattern = (X.bracketing_pattern == empty_pattern)
    return has_empty_pattern

In [172]:
# Separate featurizer instance for layer 3 (not shared with layers 1 or 2).
layer3_featurizer = AllCategoricalsFeaturizer()
# Train layer 3 only on the rows layer3_get_indices selects (empty bracketing pattern).
%time layer3 = train_model(layer3_params, layer3_get_indices, layer3_featurizer, X_train, y_train)


CPU times: user 33.8 s, sys: 172 ms, total: 33.9 s
Wall time: 23.7 s

In [173]:
# Baseline for layer 3: score the layer 1 model (trained on all rows) on just
# the empty-bracket rows of the test fold.
print "Train on everything, test on empty bracket:"
# `tmp` is a scratch name; it is overwritten on every use.
tmp = eval_model(layer1['model'], layer3_get_indices, layer1_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']


Train on everything, test on empty bracket:
(493, 53)
test RMSLE 0.402430706857

In [174]:
# Second baseline for layer 3: score the layer 2 model (trained on uncommon
# brackets) on just the empty-bracket rows of the test fold.
print "Train on uncommon brackets, test on empty bracket:"
# `tmp` is a scratch name; it is overwritten on every use.
tmp = eval_model(layer2['model'], layer3_get_indices, layer2_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']


Train on uncommon brackets, test on empty bracket:
(493, 53)
test RMSLE 0.378887498903

In [175]:
# Evaluate the layer 3 model on the empty-bracket rows of the training fold
# and of the held-out fold.
layer3_train_results = eval_model(layer3['model'], layer3_get_indices, layer3_featurizer, X_train, y_train)
layer3_test_results = eval_model(layer3['model'], layer3_get_indices, layer3_featurizer, X_test, y_test)
print "Train on empty bracket, test on empty bracket:"
# Shapes of the training matrix and of the two evaluated subsets.
print layer3['X_train'].shape
print layer3_train_results['X_eval'].shape
print layer3_test_results['X_eval'].shape
# Error as reported by eval_model under its 'rmsle' key.
print "train RMSLE", layer3_train_results['rmsle']
print "test RMSLE", layer3_test_results['rmsle']


Train on empty bracket, test on empty bracket:
(4249, 53)
(4249, 53)
(493, 53)
train RMSLE 0.146880176789
test RMSLE 0.377893012301

In [176]:
# Start from layer 1's test-fold predictions; copy=True requests an independent copy.
y_test_pred = pd.Series(layer1_test_results['y_eval_pred'], copy=True)
# Overlay layer 2's predictions on the rows its 'eval_is' selects, then layer 3's.
# Order matters: () is not in common_brackets, so empty-pattern rows are also
# layer 2 rows, and the later layer 3 assignment is the one that sticks for them.
# Presumably 'eval_is' is the boolean row selector eval_model applied -- confirm in soln.utils.
y_test_pred[layer2_test_results['eval_is']] = layer2_test_results['y_eval_pred']
y_test_pred[layer3_test_results['eval_is']] = layer3_test_results['y_eval_pred']
# NOTE(review): this is a plain root-mean-squared error; it equals an RMSLE only
# if y_test and the predictions are already log-transformed upstream -- confirm.
rmsle = np.sqrt(mean_squared_error(y_test.values, y_test_pred.values))
print "Layer 1 and layer 2 and layer 3 together:"
print y_test_pred.shape
print "test RMSLE", rmsle


Layer 1 and layer 2 and layer 3 together:
(2943,)
test RMSLE 0.213960742254

In [ ]:


In [ ]: