In [1]:
%pylab inline
from itertools import islice

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
In [2]:
# Build the train and test sets via get_augmented_train_and_test_set().
# The %time line magic reports how long the call takes.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
In [75]:
from itertools import islice
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
In [76]:
# Layer 1: one model covering every row (see layer1_get_indices).
layer1_params = {
    # Learning task and logging.
    'objective': 'reg:linear',
    'silent': 1,
    # Not a booster parameter of xgboost itself; presumably read by
    # train_model as the number of boosting rounds -- TODO confirm in soln.utils.
    'num_rounds': 1000,
    # Tree growth / regularisation.
    'gamma': 0.0,
    'eta': 0.02,
    'max_depth': 8,
    'min_child_weight': 6,
    # Row and column subsampling ratios.
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}
def layer1_get_indices(X):
    """Return a boolean mask that selects every row of ``X``.

    Args:
        X: Any sized collection of rows; only ``len(X)`` is used.

    Returns:
        A ``numpy.ndarray`` of dtype ``bool`` and length ``len(X)`` whose
        entries are all ``True``.
    """
    return np.ones(len(X), dtype=bool)
In [77]:
# Featurizer instance for layer 1; the same object is passed to eval_model
# when this layer is scored.
layer1_featurizer = AllCategoricalsFeaturizer()
# Train layer 1 via train_model, passing layer1_get_indices (an all-True mask)
# as the row-selection callback. %time reports how long training takes.
%time layer1 = train_model(layer1_params, layer1_get_indices, layer1_featurizer, X_train, y_train)
In [78]:
# Score the layer-1 model on the training split and on the held-out split,
# using the same row selector and featurizer that were passed to train_model.
layer1_train_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_train, y_train)
layer1_test_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_test, y_test)

# Single-argument print(...) is valid on both Python 2 and Python 3 and emits
# exactly the text the former print statements did.
print("Train on everything, test on everything:")
print(layer1['X_train'].shape)
print(layer1_train_results['X_eval'].shape)
print(layer1_test_results['X_eval'].shape)
print("train RMSLE %s" % (layer1_train_results['rmsle'],))
print("test RMSLE %s" % (layer1_test_results['rmsle'],))
In [79]:
# TODO: add infrastructure to store the trained layer1 model for each fold, instead of retraining it every time.
In [80]:
# New layer: a model restricted to the rows picked by new_layer_get_indices.
new_layer_params = {
    # Learning task and logging.
    'objective': 'reg:linear',
    'silent': 1,
    # Not a booster parameter of xgboost itself; presumably read by
    # train_model as the number of boosting rounds -- TODO confirm in soln.utils.
    'num_rounds': 1000,
    # Tree growth / regularisation.
    'gamma': 0.0,
    'eta': 0.02,
    'max_depth': 8,
    'min_child_weight': 6,
    # Row and column subsampling ratios.
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}
def new_layer_get_indices(X, supplier='S-0041'):
    """Return a boolean mask of the rows of ``X`` that belong to one supplier.

    Args:
        X: Object exposing a ``supplier`` attribute that supports elementwise
            ``==`` (for example a DataFrame with a ``supplier`` column).
        supplier: Supplier id to select. Defaults to ``'S-0041'`` so callers
            that pass only ``X`` get the same mask as before.

    Returns:
        The elementwise result of ``X.supplier == supplier``.
    """
    return X.supplier == supplier
In [81]:
# Separate featurizer instance for the new layer; the same object is passed to
# eval_model when this layer is scored.
new_layer_featurizer = AllCategoricalsFeaturizer()
# Train the new layer via train_model, passing new_layer_get_indices as the
# row-selection callback (presumably train_model fits only on the selected
# rows -- confirm in soln.utils). %time reports how long training takes.
%time new_layer = train_model(new_layer_params, new_layer_get_indices, new_layer_featurizer, X_train, y_train)
In [82]:
# Baseline for the new layer: score the layer-1 model (with its own featurizer)
# on the held-out split, using new_layer_get_indices as the row selector.
# Single-argument print(...) is valid on both Python 2 and Python 3 and emits
# exactly the text the former print statements did.
print("Train on everything, test on new layer:")
tmp = eval_model(layer1['model'], new_layer_get_indices, layer1_featurizer, X_test, y_test)
print(tmp['X_eval'].shape)
print("test RMSLE %s" % (tmp['rmsle'],))
In [83]:
# Score the new-layer model on the training split and on the held-out split,
# using the same row selector and featurizer that were passed to train_model.
new_layer_train_results = eval_model(new_layer['model'], new_layer_get_indices, new_layer_featurizer, X_train, y_train)
new_layer_test_results = eval_model(new_layer['model'], new_layer_get_indices, new_layer_featurizer, X_test, y_test)

# Single-argument print(...) is valid on both Python 2 and Python 3 and emits
# exactly the text the former print statements did.
print("Train on new layer, test on new layer:")
print(new_layer['X_train'].shape)
print(new_layer_train_results['X_eval'].shape)
print(new_layer_test_results['X_eval'].shape)
print("train RMSLE %s" % (new_layer_train_results['rmsle'],))
print("test RMSLE %s" % (new_layer_test_results['rmsle'],))
In [48]:
# Fraction of test-set rows per supplier (normalize=True yields proportions
# instead of raw counts).
aug_test_set['supplier'].value_counts(normalize=True)
Out[48]:
In [ ]: