In [9]:
%pylab inline
from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb
from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model
# Lift pandas' cap on displayed columns (None = no limit) so wide frames render in full.
pd.set_option('display.max_columns', None)
In [25]:
# Obtain the augmented train and test sets from the project helper; %time reports how long it takes.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
In [162]:
from itertools import islice
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
In [163]:
# Layer 1: Everything.
# Hyperparameters passed to train_model for the layer-1 model.
layer1_params = {
    'objective': 'reg:linear',   # xgboost regression objective
    'silent': 1,                 # xgboost: suppress running messages
    'num_rounds': 1000,          # presumably the number of boosting rounds used by train_model -- TODO confirm
    'gamma': 0.0,                # xgboost: minimum loss reduction required to split a leaf
    'eta': 0.02,                 # xgboost: learning rate (step-size shrinkage)
    'max_depth': 8,              # xgboost: maximum tree depth
    'min_child_weight': 6,       # xgboost: minimum sum of instance weight in a child
    'subsample': 0.7,            # xgboost: fraction of rows sampled per tree
    'colsample_bytree': 0.6,     # xgboost: fraction of columns sampled per tree
}
def layer1_get_indices(X):
    """Row selector for layer 1: keep every row of ``X``.

    Args:
        X: any sized container of rows (only ``len(X)`` is used).

    Returns:
        A boolean NumPy array of length ``len(X)`` with every entry True.
    """
    # `np` is put into the namespace by the `%pylab inline` magic in the first cell.
    return np.ones(len(X), dtype=bool)
In [164]:
# Featurizer instance dedicated to layer 1.
layer1_featurizer = AllCategoricalsFeaturizer()
# Train layer 1 on the training part of the fold; layer1_get_indices selects every row.
# The result is indexed by key later in the notebook ('model', 'X_train').
%time layer1 = train_model(layer1_params, layer1_get_indices, layer1_featurizer, X_train, y_train)
In [165]:
# Score the layer-1 model on the training part and on the held-out part of the fold,
# using the same row selector and featurizer it was trained with.
layer1_train_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_train, y_train)
layer1_test_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_test, y_test)

# Report: header, shape of the training matrix, shapes of both evaluation matrices, then scores.
# Each print(...) takes a single argument, so the output is unchanged on Python 2 and the cell
# also parses on Python 3; the one-element tuple makes %s apply str() to the score whatever its type.
print("Train on everything, test on everything:")
print(layer1['X_train'].shape)
print(layer1_train_results['X_eval'].shape)
print(layer1_test_results['X_eval'].shape)
print("train RMSLE %s" % (layer1_train_results['rmsle'],))
print("test RMSLE %s" % (layer1_test_results['rmsle'],))
In [166]:
# Layer 2: Uncommon brackets.
# Hyperparameters passed to train_model for the layer-2 model.
layer2_params = {
    'objective': 'reg:linear',   # xgboost regression objective
    'silent': 1,                 # xgboost: suppress running messages
    'num_rounds': 1000,          # presumably the number of boosting rounds used by train_model -- TODO confirm
    'gamma': 0.0,                # xgboost: minimum loss reduction required to split a leaf
    'eta': 0.02,                 # xgboost: learning rate (step-size shrinkage)
    'max_depth': 8,              # xgboost: maximum tree depth
    'min_child_weight': 6,       # xgboost: minimum sum of instance weight in a child
    'subsample': 0.7,            # xgboost: fraction of rows sampled per tree
    'colsample_bytree': 0.6,     # xgboost: fraction of columns sampled per tree
}
# Bracketing patterns regarded as "common". Each entry is a tuple so that it can be
# matched against the tuple-valued `bracketing_pattern` column.
common_brackets = [
    (1, 2, 5, 10, 25, 50, 100, 250),
    (1, 6, 20),
    (1, 2, 3, 5, 10, 20),
    (1, 2, 5, 10, 25, 50, 100),
    (5, 19, 20),
]
def layer2_get_indices(X):
    """Row selector for layer 2: rows whose bracketing pattern is not a common one.

    Args:
        X: frame-like object exposing a ``bracketing_pattern`` column.

    Returns:
        The negated result of ``X.bracketing_pattern.isin(common_brackets)``: True for
        rows whose pattern does not appear in the module-level ``common_brackets`` list.
    """
    return ~X.bracketing_pattern.isin(common_brackets)
In [167]:
# Featurizer instance dedicated to layer 2.
layer2_featurizer = AllCategoricalsFeaturizer()
# Train layer 2 with layer2_get_indices as the row selector, i.e. the rows whose
# bracketing pattern is not in common_brackets.
%time layer2 = train_model(layer2_params, layer2_get_indices, layer2_featurizer, X_train, y_train)
In [168]:
# Baseline for layer 2: how does the layer-1 model score on just the uncommon-bracket
# rows of the held-out data?
# Single-argument print(...) keeps the Python 2 output unchanged and also parses on Python 3.
print("Train on everything, test on uncommon brackets:")
# `tmp` is a scratch name that later cells overwrite; nothing should rely on it afterwards.
tmp = eval_model(layer1['model'], layer2_get_indices, layer1_featurizer, X_test, y_test)
print(tmp['X_eval'].shape)
print("test RMSLE %s" % (tmp['rmsle'],))
In [169]:
# Score the layer-2 model on the rows picked by layer2_get_indices, for both the training
# part and the held-out part of the fold.
layer2_train_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_train, y_train)
layer2_test_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_test, y_test)

# Report: header, shape of the training matrix, shapes of both evaluation matrices, then scores.
# Each print(...) takes a single argument, so the output is unchanged on Python 2 and the cell
# also parses on Python 3; the one-element tuple makes %s apply str() to the score whatever its type.
print("Train on uncommon brackets, test on uncommon brackets:")
print(layer2['X_train'].shape)
print(layer2_train_results['X_eval'].shape)
print(layer2_test_results['X_eval'].shape)
print("train RMSLE %s" % (layer2_train_results['rmsle'],))
print("test RMSLE %s" % (layer2_test_results['rmsle'],))
In [170]:
# Blend the layers: start from a copy of the layer-1 predictions, then overwrite the rows
# flagged by layer 2's evaluation mask with the layer-2 predictions.
# NOTE(review): this assumes 'eval_is' lines up with the layer-1 prediction vector and that
# the layer-2 predictions are in the same row order as the selected rows -- confirm against
# eval_model.
y_test_pred = pd.Series(layer1_test_results['y_eval_pred'], copy=True)
y_test_pred[layer2_test_results['eval_is']] = layer2_test_results['y_eval_pred']
# Plain RMSE between targets and blended predictions, compared by position (.values).
# NOTE(review): it is reported as RMSLE, so the targets are presumably already
# log-transformed -- confirm.
rmsle = np.sqrt(mean_squared_error(y_test.values, y_test_pred.values))
# Single-argument print(...) keeps the Python 2 output unchanged and also parses on Python 3.
print("Layer 1 and layer 2 together:")
print(y_test_pred.shape)
print("test RMSLE %s" % (rmsle,))
In [171]:
# Layer 3: Empty bracket.
# Hyperparameters passed to train_model for the layer-3 model.
layer3_params = {
    'objective': 'reg:linear',   # xgboost regression objective
    'silent': 1,                 # xgboost: suppress running messages
    'num_rounds': 1000,          # presumably the number of boosting rounds used by train_model -- TODO confirm
    'gamma': 0.0,                # xgboost: minimum loss reduction required to split a leaf
    'eta': 0.02,                 # xgboost: learning rate (step-size shrinkage)
    'max_depth': 8,              # xgboost: maximum tree depth
    'min_child_weight': 6,       # xgboost: minimum sum of instance weight in a child
    'subsample': 0.7,            # xgboost: fraction of rows sampled per tree
    'colsample_bytree': 0.6,     # xgboost: fraction of columns sampled per tree
}
def layer3_get_indices(X):
    """Row selector for layer 3: rows whose bracketing pattern is the empty tuple.

    Args:
        X: frame-like object exposing a ``bracketing_pattern`` column of tuples.

    Returns:
        The element-wise result of ``X.bracketing_pattern == ()``: True where the
        pattern equals the empty tuple.
    """
    # The empty tuple is compared against each element of the column.
    return (X.bracketing_pattern == ())
In [172]:
# Featurizer instance dedicated to layer 3.
layer3_featurizer = AllCategoricalsFeaturizer()
# Train layer 3 with layer3_get_indices as the row selector, i.e. the rows whose
# bracketing pattern is the empty tuple.
%time layer3 = train_model(layer3_params, layer3_get_indices, layer3_featurizer, X_train, y_train)
In [173]:
# Baseline for layer 3: how does the layer-1 model score on just the empty-bracket rows
# of the held-out data?
# Single-argument print(...) keeps the Python 2 output unchanged and also parses on Python 3.
print("Train on everything, test on empty bracket:")
# `tmp` is a scratch name that other cells overwrite; nothing should rely on it afterwards.
tmp = eval_model(layer1['model'], layer3_get_indices, layer1_featurizer, X_test, y_test)
print(tmp['X_eval'].shape)
print("test RMSLE %s" % (tmp['rmsle'],))
In [174]:
# Second baseline for layer 3: the layer-2 model scored on just the empty-bracket rows
# of the held-out data.
# Single-argument print(...) keeps the Python 2 output unchanged and also parses on Python 3.
print("Train on uncommon brackets, test on empty bracket:")
# `tmp` is a scratch name that other cells overwrite; nothing should rely on it afterwards.
tmp = eval_model(layer2['model'], layer3_get_indices, layer2_featurizer, X_test, y_test)
print(tmp['X_eval'].shape)
print("test RMSLE %s" % (tmp['rmsle'],))
In [175]:
# Score the layer-3 model on the rows picked by layer3_get_indices, for both the training
# part and the held-out part of the fold.
layer3_train_results = eval_model(layer3['model'], layer3_get_indices, layer3_featurizer, X_train, y_train)
layer3_test_results = eval_model(layer3['model'], layer3_get_indices, layer3_featurizer, X_test, y_test)

# Report: header, shape of the training matrix, shapes of both evaluation matrices, then scores.
# Each print(...) takes a single argument, so the output is unchanged on Python 2 and the cell
# also parses on Python 3; the one-element tuple makes %s apply str() to the score whatever its type.
print("Train on empty bracket, test on empty bracket:")
print(layer3['X_train'].shape)
print(layer3_train_results['X_eval'].shape)
print(layer3_test_results['X_eval'].shape)
print("train RMSLE %s" % (layer3_train_results['rmsle'],))
print("test RMSLE %s" % (layer3_test_results['rmsle'],))
In [176]:
# Blend all three layers: start from a copy of the layer-1 predictions, overwrite the rows
# flagged by layer 2's evaluation mask, then overwrite the rows flagged by layer 3's mask.
# Layer 3 is applied last, so it wins on any row that both masks select.
# NOTE(review): this assumes each 'eval_is' lines up with the layer-1 prediction vector and
# that each layer's predictions are in the same row order as its selected rows -- confirm
# against eval_model.
y_test_pred = pd.Series(layer1_test_results['y_eval_pred'], copy=True)
y_test_pred[layer2_test_results['eval_is']] = layer2_test_results['y_eval_pred']
y_test_pred[layer3_test_results['eval_is']] = layer3_test_results['y_eval_pred']
# Plain RMSE between targets and blended predictions, compared by position (.values).
# NOTE(review): it is reported as RMSLE, so the targets are presumably already
# log-transformed -- confirm.
rmsle = np.sqrt(mean_squared_error(y_test.values, y_test_pred.values))
# Single-argument print(...) keeps the Python 2 output unchanged and also parses on Python 3.
print("Layer 1 and layer 2 and layer 3 together:")
print(y_test_pred.shape)
print("test RMSLE %s" % (rmsle,))
In [ ]:
In [ ]: