In [1]:
%pylab inline
from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb
from soln import expert_params
from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.experts import get_predictions
from soln.experts import train_and_save_expert
from soln.experts import xv_eval_experts
from soln.utils import eval_model
from soln.utils import train_model
pd.set_option('display.max_columns', None)
In [2]:
# Build the augmented train and test sets (timed with %time).
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
In [3]:
# Pull a single cross-validation split (fold 0) out of the split generator;
# islice skips ahead to the requested fold without materializing earlier ones.
from itertools import islice
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
In [4]:
# Baseline: Train a single model on everything.
baseline_params = {
'objective': 'reg:linear',
'silent': 1,
'num_rounds': 1000,
'gamma': 0.0,
'eta': 0.02,
'max_depth': 8,
'min_child_weight': 6,
'subsample': 0.7,
'colsample_bytree': 0.6,
}
def all_get_indices(X):
    """Row selector that includes everything: an all-True boolean mask over X."""
    return np.full(len(X), True, dtype=bool)
# Featurize all categorical columns and train the baseline model on the
# whole training fold (all_get_indices selects every row).
baseline_featurizer = AllCategoricalsFeaturizer()
%time baseline = train_model(baseline_params, all_get_indices, baseline_featurizer, X_train, y_train)
In [5]:
# Evaluate the baseline on both the training fold and the held-out fold.
baseline_train_results = eval_model(baseline['model'], all_get_indices, baseline_featurizer, X_train, y_train)
baseline_test_results = eval_model(baseline['model'], all_get_indices, baseline_featurizer, X_test, y_test)
print "Train on everything, test on everything:"
# Shapes printed as a quick sanity check on the evaluated matrices.
print baseline['X_train'].shape
print baseline_train_results['X_eval'].shape
print baseline_test_results['X_eval'].shape
print "train RMSLE", baseline_train_results['rmsle']
print "test RMSLE", baseline_test_results['rmsle']
In [79]:
# Bagging:
bag_params = {
'objective': 'reg:linear',
'silent': 1,
'num_rounds': 1000,
'gamma': 0.0,
'eta': 0.02,
'max_depth': 8,
'min_child_weight': 6,
'subsample': 0.7,
'colsample_bytree': 0.6,
}
all_taids = np.unique(X_train.tube_assembly_id.values)
print "X_train has {} rows and {} unique taids".format(len(X_train), len(all_taids))
n_bags = 9
bags = []
for i in xrange(n_bags):
print "----- bag {}:".format(i)
n_bag_taids = 0.9 * len(all_taids)
bag_taids = np.random.choice(all_taids, size=n_bag_taids, replace=False)
unique_bag_taids = np.unique(bag_taids)
bag_is = X_train.tube_assembly_id.isin(bag_taids)
bag_X_train = X_train[bag_is].reset_index(drop=True)
bag_y_train = y_train[bag_is].reset_index(drop=True)
print "this bag has {} rows ({} of all) and {} ({} of all) unique taids".format(
len(bag_X_train), 1.0 * len(bag_X_train) / len(X_train),
len(unique_bag_taids), 1.0 * len(unique_bag_taids) / len(all_taids))
featurizer = AllCategoricalsFeaturizer()
%time bag = train_model(bag_params, all_get_indices, featurizer, bag_X_train, bag_y_train)
train_results = eval_model(bag['model'], all_get_indices, featurizer, bag_X_train, bag_y_train)
test_results = eval_model(bag['model'], all_get_indices, featurizer, X_test, y_test)
print "train RMSLE", train_results['rmsle']
print "test RMSLE", test_results['rmsle']
store_bag = {
'taids': bag_taids,
'is': bag_is,
'featurizer': featurizer,
'model': bag['model'],
'train_results': train_results,
'test_results': test_results,
}
bags.append(store_bag)
In [80]:
print "best bag RMSLE:", np.min([bag['test_results']['rmsle'] for bag in bags])
print "worst bag RMSLE:", np.max([bag['test_results']['rmsle'] for bag in bags])
print "mean bag RMSLE:", np.mean([bag['test_results']['rmsle'] for bag in bags])
print "median bag RMSLE:", np.median([bag['test_results']['rmsle'] for bag in bags])
In [81]:
y_pred_all = np.vstack([bag['test_results']['y_eval_pred'].T for bag in bags]).T
y_pred_avg = np.mean(y_pred_all, axis=1)
y_pred_median = np.median(y_pred_all, axis=1)
print "mean-combined RMSLE:", np.sqrt(mean_squared_error(y_test, y_pred_avg))
print "median-combined RMSLE:", np.sqrt(mean_squared_error(y_test, y_pred_median))
In [ ]: