In [6]:
%pylab inline

from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln import expert_params
from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model

pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib
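For reference, mean_squared_error is imported so the competition metric can be computed by hand; eval_model's 'rmsle' presumably stands for root mean squared log error. A minimal sketch of that metric, assuming targets and predictions are already log1p-transformed (the helper name is mine, not part of soln.utils):

def rmsle_from_log1p(y_true_log, y_pred_log):
    # RMSE on log1p-transformed values equals the RMSLE of the raw values.
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))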

In [2]:
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.3 s, sys: 148 ms, total: 13.5 s
Wall time: 13.6 s

In [3]:
from itertools import islice
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape


CPU times: user 80 ms, sys: 44 ms, total: 124 ms
Wall time: 132 ms
(27270, 53) (27270,) (2943, 53) (2943,)

In [36]:
expert_name = 'uncommon_suppliers_1'
expert_get_indices = expert_params.uncommon_suppliers_1_get_indices
# Note: this rebinds the name expert_params from the module to this expert's params dict;
# later cells use it as a dict (e.g. expert_params['num_rounds']).
expert_params = expert_params.uncommon_suppliers_1_params

In [37]:
featurizer = AllCategoricalsFeaturizer()
%time layer = train_model(expert_params, expert_get_indices, featurizer, X_train, y_train)


CPU times: user 20.8 s, sys: 32 ms, total: 20.9 s
Wall time: 12.7 s

In [38]:
tmp = eval_model(layer['model'], expert_get_indices, featurizer, X_test, y_test)
print "Train on {}, test on {}:".format(expert_name, expert_name)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']


Train on uncommon_suppliers_1, test on uncommon_suppliers_1:
(325, 53)
test RMSLE 0.395571643059
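The cell above trains and evaluates the expert on a single fold (fold_number = 0). A minimal sketch of repeating the same train/eval over every fold yielded by generate_xv_splits and averaging the scores (the loop is my addition; it assumes each fold unpacks exactly as in the islice cell above):

fold_rmsles = []
for fold, (X_tr, y_tr, X_te, y_te) in enumerate(generate_xv_splits(aug_train_set)):
    # Fresh featurizer per fold so encodings are fit on that fold's training data only.
    featurizer_k = AllCategoricalsFeaturizer()
    layer_k = train_model(expert_params, expert_get_indices, featurizer_k, X_tr, y_tr)
    result_k = eval_model(layer_k['model'], expert_get_indices, featurizer_k, X_te, y_te)
    print fold, result_k['rmsle']
    fold_rmsles.append(result_k['rmsle'])
print "mean RMSLE over folds:", np.mean(fold_rmsles)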

In [79]:
# Try training the layer on all instances, but assign higher weights to the layer's own instances.

low_weight = 0.05
high_weight = 8.0
high_weight_is = expert_get_indices(X_train)
weights = pd.Series(np.ones(len(X_train)) * low_weight)
weights[high_weight_is] = high_weight
print weights.value_counts()

featurizer2 = AllCategoricalsFeaturizer()
featurizer2.fit(X_train)
X_train_feats = featurizer2.transform(X_train)
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np, weight=weights.values)


0.05    24413
8.00     2857
dtype: int64

In [80]:
%time model = xgb.train(expert_params.items(), xgtrain, expert_params['num_rounds'])


CPU times: user 2min 33s, sys: 408 ms, total: 2min 33s
Wall time: 1min 26s

In [81]:
test_results = eval_model(model, expert_get_indices, featurizer2, X_test, y_test)
print "Train on weighted, test on s66:"
print test_results['X_eval'].shape
print "test RMSLE", test_results['rmsle']


Train on weighted, test on uncommon_suppliers_1:
(325, 53)
test RMSLE 0.391161014352
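The weight pair (low_weight=0.05, high_weight=8.0) appears to be hand-picked. A minimal sketch of sweeping a few candidate pairs and comparing test RMSLE, reusing X_train_np, y_train_np, high_weight_is, and featurizer2 from the weighted-training cell above (the candidate pairs are illustrative, not tuned values from soln):

for lo, hi in [(0.01, 8.0), (0.05, 4.0), (0.05, 8.0), (0.1, 8.0)]:
    w = pd.Series(np.ones(len(X_train)) * lo)
    w[high_weight_is] = hi
    dtrain = xgb.DMatrix(X_train_np, label=y_train_np, weight=w.values)
    m = xgb.train(expert_params.items(), dtrain, expert_params['num_rounds'])
    res = eval_model(m, expert_get_indices, featurizer2, X_test, y_test)
    print lo, hi, res['rmsle']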
