In [1]:
%pylab inline
from collections import Counter
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import pandas as pd
from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import inverse_log_transform_y
from soln.dataset import log_transform_y
from soln.bracket import brapa
from soln.bracket import fc_vals
from soln.bracket import generate_bracket_csv
from soln.utils import eval_regressor
from soln.utils import print_brackets
from soln.utils import print_feature_importances
In [3]:
# Load the augmented train and test sets (timed). aug_train_set is the input
# to every later cell; aug_test_set is not used by the cells visible here.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
In [4]:
# Build the bracket data from the training set (timed). The next cell reads
# 'bracket.csv', which is presumably the file this call writes -- TODO confirm.
%time generate_bracket_csv(aug_train_set)
In [6]:
bracket = pd.read_csv('bracket.csv')
print bracket.shape
bracket[:5]
Out[6]:
In [60]:
# Check that if we know the true fixed_cost and var_cost,
# we can recover the total cost with tiny error.
print aug_train_set.shape
df = aug_train_set[aug_train_set.bracketing_pattern == brapa]
print df.bracketing_pattern.value_counts()
print df.shape
df = df.merge(bracket, on='tube_assembly_id')
print df.shape
df['pred_cost'] = df['fixed_cost'] / df['adj_quantity'] + df['var_cost']
df['pred_log_cost'] = log_transform_y(df['pred_cost'])
print np.sqrt(mean_squared_error(df.log_cost, df.pred_log_cost))
df['err2'] = (df.log_cost - df.pred_log_cost) ** 2
print df.err2.describe()
df.sort('err2', ascending=False, inplace=True)
df[:10]
Out[60]:
In [39]:
# Get train and test set only for the well-behaved bracket.
X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
X_train['log_cost'] = y_train
X_test['log_cost'] = y_test
print X_train.shape, X_test.shape
X_train = X_train[X_train.bracketing_pattern == brapa]
X_test = X_test[X_test.bracketing_pattern == brapa]
print X_train.shape, X_test.shape
y_train = X_train.pop('log_cost')
y_test = X_test.pop('log_cost')
print X_train.bracketing_pattern.value_counts()
print X_test.bracketing_pattern.value_counts()
print X_train.supplier.value_counts()
print X_test.supplier.value_counts()
In [40]:
# Evaluate original RF on the well-behaved bracket.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)
%time X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
%time X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
reg = RandomForestRegressor(n_estimators=20)
%time train_rmsle, test_rmsle = eval_regressor(reg, X_train_np, y_train_np, X_test_np, y_test_np)
print "{}:".format(reg)
print " train RMSLE {}".format(train_rmsle)
print " test RMSLE {}".format(test_rmsle)
print
In [81]:
print bracket.var_cost.describe()
np.log(bracket.var_cost + 1).hist(bins=100)
Out[81]:
In [90]:
# Get training set only for well-behaved bracket, and only for adj_quantity=250.
X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
X_train['log_cost'] = y_train
X_test['log_cost'] = y_test
print X_train.shape, X_test.shape
X_train = X_train[(X_train.bracketing_pattern == brapa) & (X_train.adj_quantity == 250)]
X_test = X_test[(X_test.bracketing_pattern == brapa) & (X_test.adj_quantity == 250)]
print X_train.shape, X_test.shape
y_train = X_train.pop('log_cost')
y_test = X_test.pop('log_cost')
print X_train.adj_quantity.value_counts()
print X_test.adj_quantity.value_counts()
print X_train.bracketing_pattern.value_counts()
print X_test.bracketing_pattern.value_counts()
print X_train.supplier.value_counts()
print X_test.supplier.value_counts()
In [94]:
# Evaluate RF only on examples with adj_quantity=250.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)
%time X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
%time X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
reg = RandomForestRegressor(n_estimators=20)
%time train_rmsle, test_rmsle = eval_regressor(reg, X_train_np, y_train_np, X_test_np, y_test_np)
print "{}:".format(reg)
print " train RMSLE {}".format(train_rmsle)
print " test RMSLE {}".format(test_rmsle)
print
In [68]:
# Extract pred_var_cost from y_test_pred, which is pred_log_cost for qty=1.
y_test_pred = reg.predict(X_test_np)
print np.sqrt(mean_squared_error(y_test_np, y_test_pred))
df = pd.DataFrame()
df['tube_assembly_id'] = X_test.tube_assembly_id
df['pred_cost_for_qty_1'] = inverse_log_transform_y(y_test_pred)
print df.shape
df = df.merge(bracket)
df['pred_var_cost'] = df.pred_cost_for_qty_1 - df.fixed_cost
print df.shape
df[:5]
Out[68]:
In [112]:
# Try to predict log(var_cost) directly.
X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
X_train['log_cost'] = y_train
X_test['log_cost'] = y_test
print X_train.shape, X_test.shape
X_train = X_train[(X_train.bracketing_pattern == brapa) & (X_train.adj_quantity == 1)]
X_test = X_test[(X_test.bracketing_pattern == brapa) & (X_test.adj_quantity == 1)]
log_cost_train = X_train.pop('log_cost')
log_cost_test = X_test.pop('log_cost')
print X_train.shape, X_test.shape
X_train = X_train.merge(bracket, on='tube_assembly_id')
X_test = X_test.merge(bracket, on='tube_assembly_id')
y_train = log_transform_y(X_train.pop('var_cost'))
y_test = log_transform_y(X_test.pop('var_cost'))
print X_train.shape, X_test.shape, y_train.shape, y_test.shape
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)
%time X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
%time X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
reg = RandomForestRegressor(n_estimators=20)
%time train_rmsle, test_rmsle = eval_regressor(reg, X_train_np, y_train_np, X_test_np, y_test_np)
print "{}:".format(reg)
print " train RMSLE_vc {}".format(train_rmsle)
print " test RMSLE_vc {}".format(test_rmsle)
print
In [116]:
# Extract pred_var_cost for each taid in X_test.
y_test_pred = reg.predict(X_test_np)
print np.sqrt(mean_squared_error(y_test_np, y_test_pred))
df = pd.DataFrame()
df['tube_assembly_id'] = X_test.tube_assembly_id
df = df.merge(bracket, on='tube_assembly_id')
df['pred_var_cost'] = inverse_log_transform_y(y_test_pred)
print df.shape
df[:5]
Out[116]:
In [118]:
# Extrapolate predicted cost for other quantities, assuming true fixed_cost observed.
_, _, X_test_full, y_test_full = next(generate_xv_splits(aug_train_set))
X_test_full['log_cost'] = y_test_full
print X_test_full.shape
X_test_full = X_test_full[X_test_full.bracketing_pattern == brapa]
print X_test_full.shape
X_test_full = X_test_full.merge(df, on='tube_assembly_id')
print X_test_full.shape
X_test_full['pred_cost'] = X_test_full.fixed_cost / X_test_full.adj_quantity + X_test_full.pred_var_cost
X_test_full['pred_log_cost'] = log_transform_y(X_test_full.pred_cost)
X_test_full['err2'] = (X_test_full.log_cost.values - X_test_full.pred_log_cost.values) ** 2
X_test_full.sort('err2', ascending=False, inplace=True)
print X_test_full.shape
print X_test_full.err2.describe()
print np.sqrt(mean_squared_error(X_test_full.log_cost.values, X_test_full.pred_log_cost.values))
X_test_full[:10]
Out[118]:
In [126]:
# Try to predict fixed_cost_class and log(var_cost) independently, then combine the two.
X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
X_train['log_cost'] = y_train
X_test['log_cost'] = y_test
print X_train.shape, X_test.shape
X_train = X_train[(X_train.bracketing_pattern == brapa) & (X_train.adj_quantity == 1)]
X_test = X_test[(X_test.bracketing_pattern == brapa) & (X_test.adj_quantity == 1)]
log_cost_train = X_train.pop('log_cost')
log_cost_test = X_test.pop('log_cost')
print X_train.shape, X_test.shape
X_train = X_train.merge(bracket, on='tube_assembly_id')
X_test = X_test.merge(bracket, on='tube_assembly_id')
X_train.pop('fixed_cost')
X_test.pop('fixed_cost')
log_var_cost_train = log_transform_y(X_train.pop('var_cost'))
log_var_cost_test = log_transform_y(X_test.pop('var_cost'))
fcc_train = X_train.pop('fixed_cost_class')
fcc_test = X_test.pop('fixed_cost_class')
print X_train.shape, X_test.shape
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)
%time X_train_np = X_train_feats.astype(np.float).values
%time X_test_np = X_test_feats.astype(np.float).values
In [136]:
# The classification part: predict fixed_cost_class.
y_train = fcc_train
y_test = fcc_test
y_train_np = y_train.values
y_test_np = y_test.values
clf = RandomForestClassifier(n_estimators=100)
%time clf.fit(X_train_np, y_train_np)
y_train_pred = clf.predict(X_train_np)
print "on train:"
print clf.score(X_train_np, y_train_np)
print confusion_matrix(y_train_np, y_train_pred)
y_test_pred = clf.predict(X_test_np)
pred_fixed_cost_class_test = y_test_pred
print
print "on test:"
print clf.score(X_test_np, y_test_np)
print confusion_matrix(y_test_np, y_test_pred)
print
print "feature importances:"
print_feature_importances(X_train_feats, clf);
In [139]:
# The regression part: predict var_cost.
y_train = log_var_cost_train
y_test = log_var_cost_test
y_train_np = y_train.values
y_test_np = y_test.values
reg = RandomForestRegressor(n_estimators=20)
%time train_rmsle, test_rmsle = eval_regressor(reg, X_train_np, y_train_np, X_test_np, y_test_np)
print "{}:".format(reg)
print " train RMSLE_vc {}".format(train_rmsle)
print " test RMSLE_vc {}".format(test_rmsle)
print
y_test_pred = reg.predict(X_test_np)
pred_log_var_cost_test = y_test_pred
In [141]:
# Inspect the regressor's test-set predictions of the transformed var_cost.
pred_log_var_cost_test
Out[141]:
In [148]:
# Combine predictions.
print accuracy_score(fcc_test, pred_fixed_cost_class_test)
print np.sqrt(mean_squared_error(log_var_cost_test, pred_log_var_cost_test))
df = pd.DataFrame()
df['tube_assembly_id'] = X_test.tube_assembly_id
df['pred_fixed_cost_class'] = pred_fixed_cost_class_test
df['pred_fixed_cost'] = np.array(fc_vals)[df.pred_fixed_cost_class]
df['pred_var_cost'] = inverse_log_transform_y(pred_log_var_cost_test)
print df.shape
df[:5]
Out[148]:
In [150]:
# Evaluate on all quantities.
_, _, X_test_full, y_test_full = next(generate_xv_splits(aug_train_set))
X_test_full['log_cost'] = y_test_full
print X_test_full.shape
X_test_full = X_test_full[X_test_full.bracketing_pattern == brapa]
print X_test_full.shape
X_test_full = X_test_full.merge(bracket, on='tube_assembly_id')
X_test_full = X_test_full.merge(df, on='tube_assembly_id')
print X_test_full.shape
X_test_full['pred_cost'] = X_test_full.pred_fixed_cost / X_test_full.adj_quantity + X_test_full.pred_var_cost
X_test_full['pred_log_cost'] = log_transform_y(X_test_full.pred_cost)
X_test_full['err2'] = (X_test_full.log_cost.values - X_test_full.pred_log_cost.values) ** 2
X_test_full.sort('err2', ascending=False, inplace=True)
print X_test_full.shape
print X_test_full.err2.describe()
print np.sqrt(mean_squared_error(X_test_full.log_cost.values, X_test_full.pred_log_cost.values))
X_test_full[:10]
Out[150]:
In [ ]: