In [1]:
%pylab inline
import pandas as pd
import xgboost as xgb
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances
pd.set_option('display.max_columns', None)
Populating the interactive namespace from numpy and matplotlib
In [2]:
# Build the augmented train and test sets (timed; ~14 s wall in the recorded run).
# aug_train_set is the source for the split cell and the bracket value counts below.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
CPU times: user 13.6 s, sys: 208 ms, total: 13.8 s
Wall time: 14.3 s
In [31]:
# Take only the first split yielded by generate_xv_splits.
# Note that X_test / y_test here are carved out of aug_train_set (a validation
# fold), not taken from aug_test_set.
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
CPU times: user 76 ms, sys: 8 ms, total: 84 ms
Wall time: 97.7 ms
In [6]:
# Approach 1: Keep only the examples with the () bracket.
print X_train.shape, y_train.shape
train_is = (X_train.bracketing_pattern == ())
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape
test_is = (X_test.bracketing_pattern == ())
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape
(27270, 53) (27270,)
(4249, 53) (4249,)
(2943, 53) (2943,)
(493, 53) (493,)
In [17]:
# Approach 2: In train, keep all examples with bracket=() or adj_qty=1.
# In test, keep all examples with bracket=().
print X_train.shape, y_train.shape
train_is = ((X_train.bracketing_pattern == ()) | (X_train.adj_quantity == 1))
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape
test_is = (X_test.bracketing_pattern == ())
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape
# Toss bracketing info.
X_train.bracketing_pattern = 666
X_test.bracketing_pattern = 666
(27270, 53) (27270,)
(7473, 53) (7473,)
(2943, 53) (2943,)
(493, 53) (493,)
In [32]:
# Approach 3: Instead of just the empty bracket, train and test on all uncommon brackets.
common_brackets = [
(1, 2, 5, 10, 25, 50, 100, 250),
(1, 6, 20),
(1, 2, 3, 5, 10, 20),
(1, 2, 5, 10, 25, 50, 100),
(5, 19, 20),
]
print X_train.shape, y_train.shape
train_is = ~X_train.bracketing_pattern.isin(common_brackets)
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape
test_is = ~X_test.bracketing_pattern.isin(common_brackets)
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape
(27270, 53) (27270,)
(8221, 53) (8221,)
(2943, 53) (2943,)
(987, 53) (987,)
In [33]:
# Fit the featurizer on the training rows only, then apply that same fitted
# featurizer to both train and test (both come out with 499 columns in the
# recorded run).
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# List every generated column and its dtype; verbose=True disables truncation.
X_train_feats.info(verbose=True)
CPU times: user 456 ms, sys: 0 ns, total: 456 ms
Wall time: 468 ms
CPU times: user 376 ms, sys: 64 ms, total: 440 ms
Wall time: 443 ms
CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 76 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8221 entries, 0 to 8220
Data columns (total 499 columns):
annual_usage int64
min_order_quantity int64
bracket_pricing bool
quantity int64
diameter float64
wall_thickness float64
length float64
num_bends int64
bend_radius float64
end_a_1x bool
end_a_2x bool
end_x_1x bool
end_x_2x bool
num_boss int64
num_bracket int64
num_other int64
quote_age float64
adj_quantity int64
adj_bracketing bool
physical_volume float64
inner_radius float64
material_volume float64
end_a_forming bool
end_1x_count int64
end_x_forming bool
end_2x_count int64
end_forming_count int64
unique_feature_count float64
orientation_count float64
groove_count float64
total_component_weight float64
component_max_length float64
component_max_overall_length float64
component_max_bolt_pattern_wide float64
component_max_bolt_pattern_long float64
component_max_thickness float64
component_min_thread_pitch float64
component_min_thread_size float64
supplier XXX_other float64
supplier S-0042 float64
supplier S-0005 float64
supplier S-0026 float64
supplier S-0027 float64
supplier S-0072 float64
supplier S-0062 float64
supplier S-0064 float64
supplier S-0043 float64
supplier S-0066 float64
supplier S-0041 float64
supplier S-0105 float64
supplier S-0080 float64
supplier S-0081 float64
supplier S-0104 float64
supplier S-0013 float64
supplier S-0014 float64
supplier S-0070 float64
supplier S-0031 float64
supplier S-0030 float64
supplier S-0058 float64
supplier S-0054 float64
material_id XXX_other float64
material_id nan float64
material_id SP-0046 float64
material_id SP-0041 float64
material_id SP-0033 float64
material_id SP-0048 float64
material_id SP-0034 float64
material_id SP-0035 float64
material_id SP-0037 float64
material_id SP-0030 float64
material_id SP-0019 float64
material_id SP-0008 float64
material_id SP-0038 float64
material_id SP-0039 float64
material_id SP-0029 float64
material_id SP-0028 float64
end_a XXX_other float64
end_a EF-005 float64
end_a NONE float64
end_a EF-002 float64
end_a EF-003 float64
end_a EF-008 float64
end_a EF-009 float64
end_a EF-023 float64
end_a EF-021 float64
end_a EF-012 float64
end_a EF-017 float64
end_a EF-015 float64
end_a EF-019 float64
end_a EF-018 float64
end_x XXX_other float64
end_x NONE float64
end_x EF-002 float64
end_x EF-003 float64
end_x EF-008 float64
end_x EF-009 float64
end_x EF-023 float64
end_x EF-021 float64
end_x EF-012 float64
end_x EF-010 float64
end_x EF-017 float64
end_x EF-015 float64
end_x EF-019 float64
end_x EF-018 float64
specs XXX_other float64
specs SP-0065 float64
specs SP-0002 float64
specs SP-0050 float64
specs SP-0051 float64
specs SP-0057 float64
specs SP-0058 float64
specs SP-0079 float64
specs SP-0024 float64
specs SP-0070 float64
specs SP-0017 float64
specs SP-0072 float64
specs SP-0016 float64
specs SP-0012 float64
specs SP-0013 float64
specs SP-0076 float64
specs SP-0022 float64
specs SP-0021 float64
specs SP-0063 float64
specs SP-0071 float64
specs SP-0088 float64
specs SP-0080 float64
specs SP-0082 float64
specs SP-0062 float64
specs SP-0010 float64
specs SP-0075 float64
specs SP-0026 float64
specs SP-0069 float64
specs SP-0068 float64
specs SP-0005 float64
specs SP-0004 float64
specs SP-0007 float64
specs SP-0009 float64
specs SP-0061 float64
specs SP-0067 float64
specs SP-0029 float64
components XXX_other float64
components C-0217 float64
components C-0215 float64
components C-0214 float64
components C-0211 float64
components C-0210 float64
components C-1867 float64
components C-1860 float64
components C-1229 float64
components C-1677 float64
components C-1898 float64
components C-1355 float64
components C-1354 float64
components C-1352 float64
components C-1672 float64
components C-1670 float64
components C-0122 float64
components C-0071 float64
components C-1017 float64
components C-0250 float64
components C-0318 float64
components C-0422 float64
components C-0855 float64
components C-0550 float64
components C-0228 float64
components C-1866 float64
components C-1850 float64
components C-1313 float64
components C-1312 float64
components C-1619 float64
components C-0577 float64
components C-1536 float64
components C-1625 float64
components C-1624 float64
components C-1627 float64
components C-1626 float64
components C-1621 float64
components C-1620 float64
components C-1623 float64
components C-1622 float64
components C-1629 float64
components C-1628 float64
components C-1745 float64
components C-2030 float64
components C-0045 float64
components C-1663 float64
components C-2032 float64
components C-1817 float64
components C-1956 float64
components C-1954 float64
components C-1218 float64
components C-0002 float64
components C-1889 float64
components C-1910 float64
components C-1881 float64
components C-1880 float64
components C-1369 float64
components C-1541 float64
components C-1661 float64
components C-0095 float64
components C-0003 float64
components C-0001 float64
components C-0007 float64
components C-0165 float64
components C-2043 float64
components C-1781 float64
components C-0548 float64
components C-1963 float64
components C-1848 float64
components C-1845 float64
components C-1846 float64
components C-1244 float64
components C-1243 float64
components C-1242 float64
components C-0579 float64
components C-1459 float64
components C-0120 float64
components C-1183 float64
components C-1439 float64
components C-1614 float64
components C-1615 float64
components C-1750 float64
components C-2005 float64
components C-0699 float64
components C-0751 float64
components C-1505 float64
components C-0058 float64
components C-0057 float64
components C-0199 float64
components C-0674 float64
components C-1577 float64
components C-0823 float64
components C-0580 float64
components C-0275 float64
components C-1209 float64
components C-1208 float64
components C-1203 float64
components C-1386 float64
components C-1385 float64
components C-1375 float64
components C-1374 float64
components C-1373 float64
components C-1475 float64
components C-1476 float64
components C-1477 float64
components C-1555 float64
components C-1554 float64
components C-0388 float64
components C-1725 float64
components C-0333 float64
components C-1654 float64
components C-1655 float64
components C-1650 float64
components C-0448 float64
components C-0449 float64
components C-0520 float64
components C-0444 float64
components C-0445 float64
components C-0208 float64
components C-0209 float64
components C-1976 float64
components C-0004 float64
components C-1877 float64
components C-1233 float64
components C-1235 float64
components C-0227 float64
components C-1425 float64
components C-1421 float64
components C-1420 float64
components C-1586 float64
components C-1565 float64
components C-2019 float64
components C-2017 float64
components C-1630 float64
components C-0062 float64
components C-0562 float64
components C-0244 float64
components C-0369 float64
components C-1768 float64
components C-0844 float64
components C-1398 float64
components C-1901 float64
components C-1907 float64
components C-1994 float64
components C-1417 float64
components C-1411 float64
components C-1637 float64
components C-1635 float64
components C-1632 float64
components C-1633 float64
components C-1739 float64
components C-1631 float64
components C-1638 float64
components C-1639 float64
components C-2028 float64
components C-2029 float64
components C-1643 float64
components C-1642 float64
components C-1641 float64
components C-1640 float64
components C-1647 float64
components C-1646 float64
components C-1645 float64
components C-1644 float64
components C-1649 float64
components C-1648 float64
bracketing_pattern XXX_other float64
bracketing_pattern (10, 15, 20, 25, 30) float64
bracketing_pattern (5, 10, 20, 50, 100) float64
bracketing_pattern (25, 50, 75, 100, 290, 325, 350) float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50) float64
bracketing_pattern (1, 3, 5, 7, 10) float64
bracketing_pattern (1, 2, 3, 4, 6) float64
bracketing_pattern (1, 3, 5, 7, 9) float64
bracketing_pattern (1, 15) float64
bracketing_pattern (10, 15, 20, 30) float64
bracketing_pattern (1, 3, 5, 10, 25) float64
bracketing_pattern (30, 60, 90, 120) float64
bracketing_pattern (5, 10) float64
bracketing_pattern (1, 2, 4, 9, 19) float64
bracketing_pattern (5, 20) float64
bracketing_pattern (1, 8) float64
bracketing_pattern (1, 3, 5, 10, 20) float64
bracketing_pattern (2, 3, 4, 6) float64
bracketing_pattern (15, 25, 35) float64
bracketing_pattern (1, 5, 10, 20) float64
bracketing_pattern (1, 5, 10, 20, 50) float64
bracketing_pattern (1, 3, 5, 10, 15, 25) float64
bracketing_pattern (25, 50, 75, 100) float64
bracketing_pattern (1, 6) float64
bracketing_pattern (2, 5) float64
bracketing_pattern (1, 2, 3, 5, 10) float64
bracketing_pattern (1, 3, 5) float64
bracketing_pattern (1, 2, 5, 10) float64
bracketing_pattern (1, 10) float64
bracketing_pattern (1, 2, 3, 5, 7) float64
bracketing_pattern (1, 3) float64
bracketing_pattern (5, 10, 15) float64
bracketing_pattern (6, 12, 18, 24) float64
bracketing_pattern (3, 5, 7, 9) float64
bracketing_pattern (10, 15, 20) float64
bracketing_pattern (1, 4) float64
bracketing_pattern () float64
bracketing_pattern (1, 3, 5, 10, 15) float64
bracketing_pattern (10, 20, 30, 40) float64
bracketing_pattern (5, 10, 15, 20, 25) float64
bracketing_pattern (10, 15, 20, 25) float64
bracketing_pattern (50, 50) float64
bracketing_pattern (20, 40, 60, 80) float64
bracketing_pattern (4, 10) float64
bracketing_pattern (1, 2, 4) float64
bracketing_pattern (10, 25, 40, 55, 70) float64
bracketing_pattern (5, 10, 25) float64
bracketing_pattern (1, 2, 3, 4, 5) float64
bracketing_pattern (2, 10, 25, 50, 100) float64
bracketing_pattern (1, 20, 50) float64
bracketing_pattern (3, 5, 10) float64
bracketing_pattern (1, 2, 4, 8, 16) float64
bracketing_pattern (2, 3, 4, 5) float64
bracketing_pattern (1, 5, 10, 15, 20) float64
bracketing_pattern (1, 2, 5) float64
bracketing_pattern (8, 16, 24, 32) float64
bracketing_pattern (2, 4, 6, 8) float64
bracketing_pattern (1, 2, 5, 10, 25, 50) float64
bracketing_pattern (1, 2, 3, 4) float64
bracketing_pattern (4, 6, 8, 10) float64
bracketing_pattern (1, 12) float64
bracketing_pattern (4, 15) float64
bracketing_pattern (2, 4, 6, 8, 10) float64
bracketing_pattern (2, 3, 4) float64
bracketing_pattern (1, 3, 5, 10) float64
bracketing_pattern (8, 16, 24, 32, 48) float64
bracketing_pattern (1, 5) float64
bracketing_pattern (3, 4, 5, 6) float64
bracketing_pattern (1, 3, 5, 10, 20, 30) float64
bracketing_pattern (1, 2, 5, 10, 25) float64
bracketing_pattern (1, 30) float64
bracketing_pattern (3, 6, 9, 12) float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50, 100) float64
bracketing_pattern (3, 5, 10, 20) float64
bracketing_pattern (10, 20, 30) float64
bracketing_pattern (1, 2, 5, 25, 35) float64
bracketing_pattern (1, 2) float64
bracketing_pattern (5, 10, 15, 20) float64
bracketing_pattern (5, 10, 20) float64
bracketing_pattern (4, 8, 12, 16) float64
bracketing_pattern (10, 15, 25) float64
ends XXX_other float64
ends EF-005 float64
ends NONE float64
ends EF-001 float64
ends EF-002 float64
ends EF-003 float64
ends EF-008 float64
ends EF-009 float64
ends EF-023 float64
ends EF-021 float64
ends EF-012 float64
ends EF-011 float64
ends EF-010 float64
ends EF-017 float64
ends EF-016 float64
ends EF-015 float64
ends EF-019 float64
ends EF-018 float64
component_groups XXX_other float64
component_groups threaded float64
component_groups sleeve float64
component_groups adaptor float64
component_groups nut float64
component_groups float float64
component_groups boss float64
component_groups other float64
component_groups elbow float64
component_groups straight float64
component_types XXX_other float64
component_types CP-004 float64
component_types CP-006 float64
component_types CP-007 float64
component_types CP-001 float64
component_types CP-002 float64
component_types CP-003 float64
component_types CP-008 float64
component_types CP-009 float64
component_types CP-028 float64
component_types CP-022 float64
component_types CP-026 float64
component_types CP-027 float64
component_types CP-024 float64
component_types CP-025 float64
component_types other float64
component_types CP-012 float64
component_types CP-011 float64
component_types CP-010 float64
component_types CP-016 float64
component_types CP-015 float64
component_types CP-014 float64
component_types CP-019 float64
component_types CP-018 float64
component_end_forms XXX_other float64
component_end_forms A-007 float64
component_end_forms A-006 float64
component_end_forms A-005 float64
component_end_forms A-004 float64
component_end_forms A-003 float64
component_end_forms A-002 float64
component_end_forms A-001 float64
component_end_forms 9999 float64
component_connection_types XXX_other float64
component_connection_types 9999 float64
component_connection_types B-012 float64
component_connection_types B-011 float64
component_connection_types B-004 float64
component_connection_types B-005 float64
component_connection_types B-006 float64
component_connection_types B-007 float64
component_connection_types B-001 float64
component_connection_types B-002 float64
component_part_names XXX_other float64
component_part_names ADAPTER float64
component_part_names LINK float64
component_part_names ELBOW float64
component_part_names CONNECTOR-WELD float64
component_part_names SLEEVE-CRIMP float64
component_part_names FITTING-NUT float64
component_part_names HEAD-FLANGED float64
component_part_names WASHER-FUEL INJ float64
component_part_names SLEEVE-FLARED float64
component_part_names CONNECTOR-BHD float64
component_part_names BOSS float64
component_part_names NUT-ORFS float64
component_part_names CAP-A/C float64
component_part_names PLATE float64
component_part_names NUT-A/C float64
component_part_names ADAPTER-OIL LIN float64
component_part_names NUT-FUEL LINE float64
component_part_names FLANGE float64
component_part_names WASHER-FUEL LIN float64
component_part_names BLOCK float64
component_part_names NUT-FUEL INJ float64
component_part_names TUBE float64
component_part_names NUT-FLARED float64
component_part_names STUD-WELD float64
component_part_names SEAL-O-RING-ORFS float64
component_part_names CLIP float64
component_part_names BRACKET float64
component_part_names VALVE AS.-A/C float64
component_part_names LUG float64
component_part_names NUT float64
component_part_names PIPE float64
component_part_names FITTING float64
component_part_names ADAPTER-A/C float64
component_part_names NUT-INJ LINE float64
component_part_names ADAPTER-EXH PIP float64
component_part_names COLLAR float64
component_part_names SLEEVE float64
component_part_names NUT-WELD float64
component_part_names SEAL-O-RING float64
component_part_names WASHER float64
dtypes: bool(8), float64(480), int64(11)
memory usage: 30.9 MB
In [34]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape
(8221, 499) (987, 499) (8221,) (987,)
In [35]:
# xgboost (imported as `xgb` in the notebook's import cell) training setup.
params = {
    'objective': 'reg:linear',   # squared-error regression
    'eta': 0.02,                 # learning rate
    'min_child_weight': 6,
    'subsample': 0.7,            # row subsampling per tree
    'colsample_bytree': 0.6,     # column subsampling per tree
    'silent': 1,
    'max_depth': 8,
    # Row/column subsampling is random. 0 is xgboost's documented default
    # seed, spelled out here so reproducibility is explicit.
    'seed': 0,
}

# Wrap the arrays in xgboost's DMatrix container. Only the training matrix
# carries labels; the test matrix is used purely for prediction.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)
In [36]:
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle
CPU times: user 55.3 s, sys: 116 ms, total: 55.4 s
Wall time: 33.5 s
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.54 ms
CPU times: user 240 ms, sys: 0 ns, total: 240 ms
Wall time: 141 ms
0.163275589812 0.345135857409
In [30]:
# Ten most frequent bracketing patterns in the full augmented training set.
aug_train_set['bracketing_pattern'].value_counts().head(10)
Out[30]:
(1, 2, 5, 10, 25, 50, 100, 250) 17640
() 4742
(1, 6, 20) 2022
(1, 2, 3, 5, 10, 20) 516
(1, 2, 5, 10, 25, 50, 100) 497
(5, 19, 20) 330
(1, 2, 5, 10, 25, 50) 186
(1, 3, 5, 7, 9) 175
(1, 2, 3, 4, 5) 165
(2, 4, 6, 8) 140
dtype: int64
In [37]:
# Check RMSLE on () bracket.
indices = (X_test.bracketing_pattern == ())
print indices.mean()
bra_y_test = y_test[indices]
bra_y_test_pred = pd.Series(y_test_pred)[indices]
print y_test.shape, y_test_pred.shape
print bra_y_test.shape, bra_y_test_pred.shape
bra_test_rmsle = np.sqrt(mean_squared_error(bra_y_test, bra_y_test_pred))
print bra_test_rmsle
0.499493414387
(987,) (987,)
(493,) (493,)
0.378887498903
In [ ]:
Content source: arorahardeep/kaggle-caterpillar
Similar notebooks: