In [1]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

# Lift pandas' column-display limit (None) so wide frames show every column.
pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Build the augmented train and test sets; %time reports how long the call takes.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.6 s, sys: 208 ms, total: 13.8 s
Wall time: 14.3 s

In [31]:
# Use only the first split yielded by generate_xv_splits as the train/test
# fold for the rest of the notebook.
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))


CPU times: user 76 ms, sys: 8 ms, total: 84 ms
Wall time: 97.7 ms

In [6]:
# Approach 1: Keep only the examples with the () bracket.

print X_train.shape, y_train.shape
train_is = (X_train.bracketing_pattern == ())
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape

print X_test.shape, y_test.shape
test_is = (X_test.bracketing_pattern == ())
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape


(27270, 53) (27270,)
(4249, 53) (4249,)
(2943, 53) (2943,)
(493, 53) (493,)

In [17]:
# Approach 2: In train, keep all examples with bracket=() or adj_qty=1.
# In test, keep all examples with bracket=().

print X_train.shape, y_train.shape
train_is = ((X_train.bracketing_pattern == ()) | (X_train.adj_quantity == 1))
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape

print X_test.shape, y_test.shape
test_is = (X_test.bracketing_pattern == ())
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape

# Toss bracketing info.
X_train.bracketing_pattern = 666
X_test.bracketing_pattern = 666


(27270, 53) (27270,)
(7473, 53) (7473,)
(2943, 53) (2943,)
(493, 53) (493,)

In [32]:
# Approach 3: Instead of just the empty bracket, train and test on all uncommon brackets.

common_brackets = [
    (1, 2, 5, 10, 25, 50, 100, 250),
    (1, 6, 20),
    (1, 2, 3, 5, 10, 20),
    (1, 2, 5, 10, 25, 50, 100),
    (5, 19, 20),
]

print X_train.shape, y_train.shape
train_is = ~X_train.bracketing_pattern.isin(common_brackets)
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape

print X_test.shape, y_test.shape
test_is = ~X_test.bracketing_pattern.isin(common_brackets)
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape


(27270, 53) (27270,)
(8221, 53) (8221,)
(2943, 53) (2943,)
(987, 53) (987,)

In [33]:
# One featurizer is fitted on the training fold only and then applied to both
# folds, with each step timed by %time.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# List every resulting feature column together with its dtype.
X_train_feats.info(verbose=True)


CPU times: user 456 ms, sys: 0 ns, total: 456 ms
Wall time: 468 ms
CPU times: user 376 ms, sys: 64 ms, total: 440 ms
Wall time: 443 ms
CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 76 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8221 entries, 0 to 8220
Data columns (total 499 columns):
annual_usage                                           int64
min_order_quantity                                     int64
bracket_pricing                                        bool
quantity                                               int64
diameter                                               float64
wall_thickness                                         float64
length                                                 float64
num_bends                                              int64
bend_radius                                            float64
end_a_1x                                               bool
end_a_2x                                               bool
end_x_1x                                               bool
end_x_2x                                               bool
num_boss                                               int64
num_bracket                                            int64
num_other                                              int64
quote_age                                              float64
adj_quantity                                           int64
adj_bracketing                                         bool
physical_volume                                        float64
inner_radius                                           float64
material_volume                                        float64
end_a_forming                                          bool
end_1x_count                                           int64
end_x_forming                                          bool
end_2x_count                                           int64
end_forming_count                                      int64
unique_feature_count                                   float64
orientation_count                                      float64
groove_count                                           float64
total_component_weight                                 float64
component_max_length                                   float64
component_max_overall_length                           float64
component_max_bolt_pattern_wide                        float64
component_max_bolt_pattern_long                        float64
component_max_thickness                                float64
component_min_thread_pitch                             float64
component_min_thread_size                              float64
supplier XXX_other                                     float64
supplier S-0042                                        float64
supplier S-0005                                        float64
supplier S-0026                                        float64
supplier S-0027                                        float64
supplier S-0072                                        float64
supplier S-0062                                        float64
supplier S-0064                                        float64
supplier S-0043                                        float64
supplier S-0066                                        float64
supplier S-0041                                        float64
supplier S-0105                                        float64
supplier S-0080                                        float64
supplier S-0081                                        float64
supplier S-0104                                        float64
supplier S-0013                                        float64
supplier S-0014                                        float64
supplier S-0070                                        float64
supplier S-0031                                        float64
supplier S-0030                                        float64
supplier S-0058                                        float64
supplier S-0054                                        float64
material_id XXX_other                                  float64
material_id nan                                        float64
material_id SP-0046                                    float64
material_id SP-0041                                    float64
material_id SP-0033                                    float64
material_id SP-0048                                    float64
material_id SP-0034                                    float64
material_id SP-0035                                    float64
material_id SP-0037                                    float64
material_id SP-0030                                    float64
material_id SP-0019                                    float64
material_id SP-0008                                    float64
material_id SP-0038                                    float64
material_id SP-0039                                    float64
material_id SP-0029                                    float64
material_id SP-0028                                    float64
end_a XXX_other                                        float64
end_a EF-005                                           float64
end_a NONE                                             float64
end_a EF-002                                           float64
end_a EF-003                                           float64
end_a EF-008                                           float64
end_a EF-009                                           float64
end_a EF-023                                           float64
end_a EF-021                                           float64
end_a EF-012                                           float64
end_a EF-017                                           float64
end_a EF-015                                           float64
end_a EF-019                                           float64
end_a EF-018                                           float64
end_x XXX_other                                        float64
end_x NONE                                             float64
end_x EF-002                                           float64
end_x EF-003                                           float64
end_x EF-008                                           float64
end_x EF-009                                           float64
end_x EF-023                                           float64
end_x EF-021                                           float64
end_x EF-012                                           float64
end_x EF-010                                           float64
end_x EF-017                                           float64
end_x EF-015                                           float64
end_x EF-019                                           float64
end_x EF-018                                           float64
specs XXX_other                                        float64
specs SP-0065                                          float64
specs SP-0002                                          float64
specs SP-0050                                          float64
specs SP-0051                                          float64
specs SP-0057                                          float64
specs SP-0058                                          float64
specs SP-0079                                          float64
specs SP-0024                                          float64
specs SP-0070                                          float64
specs SP-0017                                          float64
specs SP-0072                                          float64
specs SP-0016                                          float64
specs SP-0012                                          float64
specs SP-0013                                          float64
specs SP-0076                                          float64
specs SP-0022                                          float64
specs SP-0021                                          float64
specs SP-0063                                          float64
specs SP-0071                                          float64
specs SP-0088                                          float64
specs SP-0080                                          float64
specs SP-0082                                          float64
specs SP-0062                                          float64
specs SP-0010                                          float64
specs SP-0075                                          float64
specs SP-0026                                          float64
specs SP-0069                                          float64
specs SP-0068                                          float64
specs SP-0005                                          float64
specs SP-0004                                          float64
specs SP-0007                                          float64
specs SP-0009                                          float64
specs SP-0061                                          float64
specs SP-0067                                          float64
specs SP-0029                                          float64
components XXX_other                                   float64
components C-0217                                      float64
components C-0215                                      float64
components C-0214                                      float64
components C-0211                                      float64
components C-0210                                      float64
components C-1867                                      float64
components C-1860                                      float64
components C-1229                                      float64
components C-1677                                      float64
components C-1898                                      float64
components C-1355                                      float64
components C-1354                                      float64
components C-1352                                      float64
components C-1672                                      float64
components C-1670                                      float64
components C-0122                                      float64
components C-0071                                      float64
components C-1017                                      float64
components C-0250                                      float64
components C-0318                                      float64
components C-0422                                      float64
components C-0855                                      float64
components C-0550                                      float64
components C-0228                                      float64
components C-1866                                      float64
components C-1850                                      float64
components C-1313                                      float64
components C-1312                                      float64
components C-1619                                      float64
components C-0577                                      float64
components C-1536                                      float64
components C-1625                                      float64
components C-1624                                      float64
components C-1627                                      float64
components C-1626                                      float64
components C-1621                                      float64
components C-1620                                      float64
components C-1623                                      float64
components C-1622                                      float64
components C-1629                                      float64
components C-1628                                      float64
components C-1745                                      float64
components C-2030                                      float64
components C-0045                                      float64
components C-1663                                      float64
components C-2032                                      float64
components C-1817                                      float64
components C-1956                                      float64
components C-1954                                      float64
components C-1218                                      float64
components C-0002                                      float64
components C-1889                                      float64
components C-1910                                      float64
components C-1881                                      float64
components C-1880                                      float64
components C-1369                                      float64
components C-1541                                      float64
components C-1661                                      float64
components C-0095                                      float64
components C-0003                                      float64
components C-0001                                      float64
components C-0007                                      float64
components C-0165                                      float64
components C-2043                                      float64
components C-1781                                      float64
components C-0548                                      float64
components C-1963                                      float64
components C-1848                                      float64
components C-1845                                      float64
components C-1846                                      float64
components C-1244                                      float64
components C-1243                                      float64
components C-1242                                      float64
components C-0579                                      float64
components C-1459                                      float64
components C-0120                                      float64
components C-1183                                      float64
components C-1439                                      float64
components C-1614                                      float64
components C-1615                                      float64
components C-1750                                      float64
components C-2005                                      float64
components C-0699                                      float64
components C-0751                                      float64
components C-1505                                      float64
components C-0058                                      float64
components C-0057                                      float64
components C-0199                                      float64
components C-0674                                      float64
components C-1577                                      float64
components C-0823                                      float64
components C-0580                                      float64
components C-0275                                      float64
components C-1209                                      float64
components C-1208                                      float64
components C-1203                                      float64
components C-1386                                      float64
components C-1385                                      float64
components C-1375                                      float64
components C-1374                                      float64
components C-1373                                      float64
components C-1475                                      float64
components C-1476                                      float64
components C-1477                                      float64
components C-1555                                      float64
components C-1554                                      float64
components C-0388                                      float64
components C-1725                                      float64
components C-0333                                      float64
components C-1654                                      float64
components C-1655                                      float64
components C-1650                                      float64
components C-0448                                      float64
components C-0449                                      float64
components C-0520                                      float64
components C-0444                                      float64
components C-0445                                      float64
components C-0208                                      float64
components C-0209                                      float64
components C-1976                                      float64
components C-0004                                      float64
components C-1877                                      float64
components C-1233                                      float64
components C-1235                                      float64
components C-0227                                      float64
components C-1425                                      float64
components C-1421                                      float64
components C-1420                                      float64
components C-1586                                      float64
components C-1565                                      float64
components C-2019                                      float64
components C-2017                                      float64
components C-1630                                      float64
components C-0062                                      float64
components C-0562                                      float64
components C-0244                                      float64
components C-0369                                      float64
components C-1768                                      float64
components C-0844                                      float64
components C-1398                                      float64
components C-1901                                      float64
components C-1907                                      float64
components C-1994                                      float64
components C-1417                                      float64
components C-1411                                      float64
components C-1637                                      float64
components C-1635                                      float64
components C-1632                                      float64
components C-1633                                      float64
components C-1739                                      float64
components C-1631                                      float64
components C-1638                                      float64
components C-1639                                      float64
components C-2028                                      float64
components C-2029                                      float64
components C-1643                                      float64
components C-1642                                      float64
components C-1641                                      float64
components C-1640                                      float64
components C-1647                                      float64
components C-1646                                      float64
components C-1645                                      float64
components C-1644                                      float64
components C-1649                                      float64
components C-1648                                      float64
bracketing_pattern XXX_other                           float64
bracketing_pattern (10, 15, 20, 25, 30)                float64
bracketing_pattern (5, 10, 20, 50, 100)                float64
bracketing_pattern (25, 50, 75, 100, 290, 325, 350)    float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50)            float64
bracketing_pattern (1, 3, 5, 7, 10)                    float64
bracketing_pattern (1, 2, 3, 4, 6)                     float64
bracketing_pattern (1, 3, 5, 7, 9)                     float64
bracketing_pattern (1, 15)                             float64
bracketing_pattern (10, 15, 20, 30)                    float64
bracketing_pattern (1, 3, 5, 10, 25)                   float64
bracketing_pattern (30, 60, 90, 120)                   float64
bracketing_pattern (5, 10)                             float64
bracketing_pattern (1, 2, 4, 9, 19)                    float64
bracketing_pattern (5, 20)                             float64
bracketing_pattern (1, 8)                              float64
bracketing_pattern (1, 3, 5, 10, 20)                   float64
bracketing_pattern (2, 3, 4, 6)                        float64
bracketing_pattern (15, 25, 35)                        float64
bracketing_pattern (1, 5, 10, 20)                      float64
bracketing_pattern (1, 5, 10, 20, 50)                  float64
bracketing_pattern (1, 3, 5, 10, 15, 25)               float64
bracketing_pattern (25, 50, 75, 100)                   float64
bracketing_pattern (1, 6)                              float64
bracketing_pattern (2, 5)                              float64
bracketing_pattern (1, 2, 3, 5, 10)                    float64
bracketing_pattern (1, 3, 5)                           float64
bracketing_pattern (1, 2, 5, 10)                       float64
bracketing_pattern (1, 10)                             float64
bracketing_pattern (1, 2, 3, 5, 7)                     float64
bracketing_pattern (1, 3)                              float64
bracketing_pattern (5, 10, 15)                         float64
bracketing_pattern (6, 12, 18, 24)                     float64
bracketing_pattern (3, 5, 7, 9)                        float64
bracketing_pattern (10, 15, 20)                        float64
bracketing_pattern (1, 4)                              float64
bracketing_pattern ()                                  float64
bracketing_pattern (1, 3, 5, 10, 15)                   float64
bracketing_pattern (10, 20, 30, 40)                    float64
bracketing_pattern (5, 10, 15, 20, 25)                 float64
bracketing_pattern (10, 15, 20, 25)                    float64
bracketing_pattern (50, 50)                            float64
bracketing_pattern (20, 40, 60, 80)                    float64
bracketing_pattern (4, 10)                             float64
bracketing_pattern (1, 2, 4)                           float64
bracketing_pattern (10, 25, 40, 55, 70)                float64
bracketing_pattern (5, 10, 25)                         float64
bracketing_pattern (1, 2, 3, 4, 5)                     float64
bracketing_pattern (2, 10, 25, 50, 100)                float64
bracketing_pattern (1, 20, 50)                         float64
bracketing_pattern (3, 5, 10)                          float64
bracketing_pattern (1, 2, 4, 8, 16)                    float64
bracketing_pattern (2, 3, 4, 5)                        float64
bracketing_pattern (1, 5, 10, 15, 20)                  float64
bracketing_pattern (1, 2, 5)                           float64
bracketing_pattern (8, 16, 24, 32)                     float64
bracketing_pattern (2, 4, 6, 8)                        float64
bracketing_pattern (1, 2, 5, 10, 25, 50)               float64
bracketing_pattern (1, 2, 3, 4)                        float64
bracketing_pattern (4, 6, 8, 10)                       float64
bracketing_pattern (1, 12)                             float64
bracketing_pattern (4, 15)                             float64
bracketing_pattern (2, 4, 6, 8, 10)                    float64
bracketing_pattern (2, 3, 4)                           float64
bracketing_pattern (1, 3, 5, 10)                       float64
bracketing_pattern (8, 16, 24, 32, 48)                 float64
bracketing_pattern (1, 5)                              float64
bracketing_pattern (3, 4, 5, 6)                        float64
bracketing_pattern (1, 3, 5, 10, 20, 30)               float64
bracketing_pattern (1, 2, 5, 10, 25)                   float64
bracketing_pattern (1, 30)                             float64
bracketing_pattern (3, 6, 9, 12)                       float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50, 100)       float64
bracketing_pattern (3, 5, 10, 20)                      float64
bracketing_pattern (10, 20, 30)                        float64
bracketing_pattern (1, 2, 5, 25, 35)                   float64
bracketing_pattern (1, 2)                              float64
bracketing_pattern (5, 10, 15, 20)                     float64
bracketing_pattern (5, 10, 20)                         float64
bracketing_pattern (4, 8, 12, 16)                      float64
bracketing_pattern (10, 15, 25)                        float64
ends XXX_other                                         float64
ends EF-005                                            float64
ends NONE                                              float64
ends EF-001                                            float64
ends EF-002                                            float64
ends EF-003                                            float64
ends EF-008                                            float64
ends EF-009                                            float64
ends EF-023                                            float64
ends EF-021                                            float64
ends EF-012                                            float64
ends EF-011                                            float64
ends EF-010                                            float64
ends EF-017                                            float64
ends EF-016                                            float64
ends EF-015                                            float64
ends EF-019                                            float64
ends EF-018                                            float64
component_groups XXX_other                             float64
component_groups threaded                              float64
component_groups sleeve                                float64
component_groups adaptor                               float64
component_groups nut                                   float64
component_groups float                                 float64
component_groups boss                                  float64
component_groups other                                 float64
component_groups elbow                                 float64
component_groups straight                              float64
component_types XXX_other                              float64
component_types CP-004                                 float64
component_types CP-006                                 float64
component_types CP-007                                 float64
component_types CP-001                                 float64
component_types CP-002                                 float64
component_types CP-003                                 float64
component_types CP-008                                 float64
component_types CP-009                                 float64
component_types CP-028                                 float64
component_types CP-022                                 float64
component_types CP-026                                 float64
component_types CP-027                                 float64
component_types CP-024                                 float64
component_types CP-025                                 float64
component_types other                                  float64
component_types CP-012                                 float64
component_types CP-011                                 float64
component_types CP-010                                 float64
component_types CP-016                                 float64
component_types CP-015                                 float64
component_types CP-014                                 float64
component_types CP-019                                 float64
component_types CP-018                                 float64
component_end_forms XXX_other                          float64
component_end_forms A-007                              float64
component_end_forms A-006                              float64
component_end_forms A-005                              float64
component_end_forms A-004                              float64
component_end_forms A-003                              float64
component_end_forms A-002                              float64
component_end_forms A-001                              float64
component_end_forms 9999                               float64
component_connection_types XXX_other                   float64
component_connection_types 9999                        float64
component_connection_types B-012                       float64
component_connection_types B-011                       float64
component_connection_types B-004                       float64
component_connection_types B-005                       float64
component_connection_types B-006                       float64
component_connection_types B-007                       float64
component_connection_types B-001                       float64
component_connection_types B-002                       float64
component_part_names XXX_other                         float64
component_part_names ADAPTER                           float64
component_part_names LINK                              float64
component_part_names ELBOW                             float64
component_part_names CONNECTOR-WELD                    float64
component_part_names SLEEVE-CRIMP                      float64
component_part_names FITTING-NUT                       float64
component_part_names HEAD-FLANGED                      float64
component_part_names WASHER-FUEL INJ                   float64
component_part_names SLEEVE-FLARED                     float64
component_part_names CONNECTOR-BHD                     float64
component_part_names BOSS                              float64
component_part_names NUT-ORFS                          float64
component_part_names CAP-A/C                           float64
component_part_names PLATE                             float64
component_part_names NUT-A/C                           float64
component_part_names ADAPTER-OIL LIN                   float64
component_part_names NUT-FUEL LINE                     float64
component_part_names FLANGE                            float64
component_part_names WASHER-FUEL LIN                   float64
component_part_names BLOCK                             float64
component_part_names NUT-FUEL INJ                      float64
component_part_names TUBE                              float64
component_part_names NUT-FLARED                        float64
component_part_names STUD-WELD                         float64
component_part_names SEAL-O-RING-ORFS                  float64
component_part_names CLIP                              float64
component_part_names BRACKET                           float64
component_part_names VALVE AS.-A/C                     float64
component_part_names LUG                               float64
component_part_names NUT                               float64
component_part_names PIPE                              float64
component_part_names FITTING                           float64
component_part_names ADAPTER-A/C                       float64
component_part_names NUT-INJ LINE                      float64
component_part_names ADAPTER-EXH PIP                   float64
component_part_names COLLAR                            float64
component_part_names SLEEVE                            float64
component_part_names NUT-WELD                          float64
component_part_names SEAL-O-RING                       float64
component_part_names WASHER                            float64
dtypes: bool(8), float64(480), int64(11)
memory usage: 30.9 MB

In [34]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape


(8221, 499) (987, 499) (8221,) (987,)

In [35]:
import xgboost as xgb

# Booster settings; they are handed to xgb.train() as params.items() in the
# training cell.
# NOTE(review): newer XGBoost releases spell this objective
# 'reg:squarederror' and replace 'silent' with 'verbosity' -- confirm against
# the installed version before changing anything here.
params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'silent': 1,
    'max_depth': 8,
}

# The training matrix carries the targets as labels; the test matrix is built
# from the features only.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

In [36]:
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle


CPU times: user 55.3 s, sys: 116 ms, total: 55.4 s
Wall time: 33.5 s
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.54 ms
CPU times: user 240 ms, sys: 0 ns, total: 240 ms
Wall time: 141 ms
0.163275589812 0.345135857409

In [30]:
# Ten most frequent bracketing patterns in the augmented training set.
aug_train_set['bracketing_pattern'].value_counts().head(10)


Out[30]:
(1, 2, 5, 10, 25, 50, 100, 250)    17640
()                                  4742
(1, 6, 20)                          2022
(1, 2, 3, 5, 10, 20)                 516
(1, 2, 5, 10, 25, 50, 100)           497
(5, 19, 20)                          330
(1, 2, 5, 10, 25, 50)                186
(1, 3, 5, 7, 9)                      175
(1, 2, 3, 4, 5)                      165
(2, 4, 6, 8)                         140
dtype: int64

In [37]:
# Check RMSLE on () bracket.

indices = (X_test.bracketing_pattern == ())
print indices.mean()
bra_y_test = y_test[indices]
bra_y_test_pred = pd.Series(y_test_pred)[indices]
print y_test.shape, y_test_pred.shape
print bra_y_test.shape, bra_y_test_pred.shape
bra_test_rmsle = np.sqrt(mean_squared_error(bra_y_test, bra_y_test_pred))
print bra_test_rmsle


0.499493414387
(987,) (987,)
(493,) (493,)
0.378887498903

In [ ]: