In [117]:
# %pylab populates the interactive namespace with numpy and matplotlib names;
# `inline` selects the inline plotting backend so figures render in the notebook.
# NOTE(review): because %pylab injects many bare names, the imports below are
# deliberately left after it so that they take precedence on any name clash.
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

# None removes the column limit, so wide frames are displayed in full.
pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [118]:
# Obtain the augmented train and test sets from the project helper; %time
# reports how long the call takes.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 14 s, sys: 24 ms, total: 14.1 s
Wall time: 14.4 s

In [119]:
# Use only the first split yielded by generate_xv_splits (presumably
# cross-validation folds -- confirm in soln.dataset). X_test / y_test here are
# carved out of aug_train_set; aug_test_set is not used by this cell.
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))


CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 130 ms

In [120]:
# Keep only the test examples that have unknown components, i.e. rows whose
# `components` list contains at least one component id that is not in
# `known_cids` (component ids with train_count > 0).
#
# NOTE: this cell rebinds X_test / y_test to the filtered subset. Re-run the
# split cell before re-running this one; otherwise X_test_orig / y_test_orig
# would be rebuilt from the already-filtered data.

from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components
from soln.utils import count_components

comp_types, group_dfs, cluster_dfs = load_raw_components()
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)

# Count tables for each split. count_components presumably returns one row per
# component_id with a 'count' column (inferred from the rename/merge below --
# confirm in soln.utils). The column is renamed so both can sit side by side.
train_counts = count_components(X_train, cinfo_df).rename(
    columns={'count': 'train_count'})
test_counts = count_components(X_test, cinfo_df).rename(
    columns={'count': 'test_count'})

# merge defaults to an inner join, so a component missing from either count
# table would be dropped from all_counts.
all_counts = cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
all_counts = all_counts.merge(train_counts, on='component_id')
all_counts = all_counts.merge(test_counts, on='component_id')

known_cids = set(all_counts.component_id[all_counts.train_count > 0].values)
print('{0} {1}'.format(len(all_counts), len(known_cids)))

# One flag per test row; the generator lets any() stop at the first unknown id.
has_unk = [
    any(cid not in known_cids for cid in cids)
    for cids in X_test.components
]
print('{0} {1}'.format(len(X_test), len(has_unk)))

# Attach the flag to a copy so the frame produced by the split is not mutated
# in place. X_test_orig keeps the extra 'has_unk' column (as before), so it has
# one more column than X_train.
X_test_orig = X_test.copy()
X_test_orig['has_unk'] = has_unk
y_test_orig = y_test

# Row-level breakdown, then the same breakdown per distinct tube assembly.
print(X_test_orig.has_unk.value_counts())
print(X_test_orig.has_unk.value_counts(normalize=True))
tmp_df = X_test_orig[['tube_assembly_id', 'has_unk']].drop_duplicates()
print('{0} {1}'.format(len(X_test_orig), len(tmp_df)))
print(tmp_df.has_unk.value_counts())
print(tmp_df.has_unk.value_counts(normalize=True))

print('{0} {1}'.format(X_train.shape, y_train.shape))
print('{0} {1}'.format(X_test_orig.shape, y_test_orig.shape))

# A plain boolean array selects rows by position, so X and y are filtered
# identically even if their index labels were to differ.
unk_mask = X_test_orig['has_unk'].values
X_test = X_test_orig[unk_mask].drop('has_unk', axis=1).reset_index(drop=True)
y_test = y_test_orig[unk_mask].reset_index(drop=True)
print('{0} {1}'.format(X_test.shape, y_test.shape))


2047 1141
2943 2943
False    2791
True      152
dtype: int64
False    0.948352
True     0.051648
dtype: float64
2943 895
False    828
True      67
dtype: int64
False    0.92514
True     0.07486
dtype: float64
(27270, 53) (27270,)
(2943, 54) (2943,)
(152, 53) (152,)

In [121]:
# Fit the featurizer on the training split only, then apply the fitted
# featurizer to both splits. X_test at this point is whatever the previous
# cell left bound to that name.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# verbose=True prints every column with its dtype (a long listing for wide frames).
X_train_feats.info(verbose=True)


CPU times: user 1.44 s, sys: 0 ns, total: 1.44 s
Wall time: 1.46 s
CPU times: user 1.42 s, sys: 264 ms, total: 1.68 s
Wall time: 1.73 s
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 42 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27270 entries, 0 to 27269
Data columns (total 599 columns):
annual_usage                                           int64
min_order_quantity                                     int64
bracket_pricing                                        bool
quantity                                               int64
diameter                                               float64
wall_thickness                                         float64
length                                                 float64
num_bends                                              int64
bend_radius                                            float64
end_a_1x                                               bool
end_a_2x                                               bool
end_x_1x                                               bool
end_x_2x                                               bool
num_boss                                               int64
num_bracket                                            int64
num_other                                              int64
quote_age                                              float64
adj_quantity                                           int64
adj_bracketing                                         bool
physical_volume                                        float64
inner_radius                                           float64
material_volume                                        float64
end_a_forming                                          bool
end_1x_count                                           int64
end_x_forming                                          bool
end_2x_count                                           int64
end_forming_count                                      int64
unique_feature_count                                   float64
orientation_count                                      float64
groove_count                                           float64
total_component_weight                                 float64
component_max_length                                   float64
component_max_overall_length                           float64
component_max_bolt_pattern_wide                        float64
component_max_bolt_pattern_long                        float64
component_max_thickness                                float64
component_min_thread_pitch                             float64
component_min_thread_size                              float64
supplier XXX_other                                     float64
supplier S-0042                                        float64
supplier S-0005                                        float64
supplier S-0026                                        float64
supplier S-0027                                        float64
supplier S-0072                                        float64
supplier S-0062                                        float64
supplier S-0064                                        float64
supplier S-0043                                        float64
supplier S-0066                                        float64
supplier S-0041                                        float64
supplier S-0105                                        float64
supplier S-0080                                        float64
supplier S-0081                                        float64
supplier S-0104                                        float64
supplier S-0013                                        float64
supplier S-0014                                        float64
supplier S-0070                                        float64
supplier S-0031                                        float64
supplier S-0030                                        float64
supplier S-0058                                        float64
supplier S-0054                                        float64
material_id XXX_other                                  float64
material_id nan                                        float64
material_id SP-0046                                    float64
material_id SP-0041                                    float64
material_id SP-0033                                    float64
material_id SP-0048                                    float64
material_id SP-0034                                    float64
material_id SP-0035                                    float64
material_id SP-0036                                    float64
material_id SP-0037                                    float64
material_id SP-0030                                    float64
material_id SP-0019                                    float64
material_id SP-0008                                    float64
material_id SP-0038                                    float64
material_id SP-0039                                    float64
material_id SP-0029                                    float64
material_id SP-0028                                    float64
end_a XXX_other                                        float64
end_a EF-005                                           float64
end_a NONE                                             float64
end_a EF-002                                           float64
end_a EF-003                                           float64
end_a EF-008                                           float64
end_a EF-009                                           float64
end_a EF-023                                           float64
end_a EF-021                                           float64
end_a EF-013                                           float64
end_a EF-012                                           float64
end_a EF-017                                           float64
end_a EF-016                                           float64
end_a EF-015                                           float64
end_a EF-019                                           float64
end_a EF-018                                           float64
end_x XXX_other                                        float64
end_x NONE                                             float64
end_x EF-002                                           float64
end_x EF-003                                           float64
end_x EF-008                                           float64
end_x EF-009                                           float64
end_x EF-023                                           float64
end_x EF-021                                           float64
end_x EF-006                                           float64
end_x EF-013                                           float64
end_x EF-012                                           float64
end_x EF-010                                           float64
end_x EF-017                                           float64
end_x EF-016                                           float64
end_x EF-015                                           float64
end_x EF-019                                           float64
end_x EF-018                                           float64
specs XXX_other                                        float64
specs SP-0065                                          float64
specs SP-0002                                          float64
specs SP-0050                                          float64
specs SP-0051                                          float64
specs SP-0057                                          float64
specs SP-0025                                          float64
specs SP-0058                                          float64
specs SP-0079                                          float64
specs SP-0024                                          float64
specs SP-0070                                          float64
specs SP-0017                                          float64
specs SP-0072                                          float64
specs SP-0016                                          float64
specs SP-0012                                          float64
specs SP-0013                                          float64
specs SP-0076                                          float64
specs SP-0022                                          float64
specs SP-0021                                          float64
specs SP-0063                                          float64
specs SP-0071                                          float64
specs SP-0088                                          float64
specs SP-0080                                          float64
specs SP-0082                                          float64
specs SP-0062                                          float64
specs SP-0010                                          float64
specs SP-0075                                          float64
specs SP-0026                                          float64
specs SP-0069                                          float64
specs SP-0068                                          float64
specs SP-0005                                          float64
specs SP-0004                                          float64
specs SP-0007                                          float64
specs SP-0009                                          float64
specs SP-0061                                          float64
specs SP-0067                                          float64
specs SP-0029                                          float64
components XXX_other                                   float64
components C-1653                                      float64
components C-0218                                      float64
components C-0217                                      float64
components C-0215                                      float64
components C-0214                                      float64
components C-0211                                      float64
components C-0210                                      float64
components C-1867                                      float64
components C-1860                                      float64
components C-1869                                      float64
components C-0063                                      float64
components C-1229                                      float64
components C-1677                                      float64
components C-1898                                      float64
components C-1355                                      float64
components C-1354                                      float64
components C-1352                                      float64
components C-1670                                      float64
components C-0122                                      float64
components C-0071                                      float64
components C-1017                                      float64
components C-0250                                      float64
components C-0318                                      float64
components C-1779                                      float64
components C-0422                                      float64
components C-1821                                      float64
components C-0855                                      float64
components C-0550                                      float64
components C-0494                                      float64
components C-0228                                      float64
components C-0051                                      float64
components C-1914                                      float64
components C-1910                                      float64
components C-1672                                      float64
components C-1850                                      float64
components C-1313                                      float64
components C-1312                                      float64
components C-1619                                      float64
components C-0577                                      float64
components C-1533                                      float64
components C-1536                                      float64
components C-0133                                      float64
components C-0134                                      float64
components C-1405                                      float64
components C-1625                                      float64
components C-1624                                      float64
components C-1627                                      float64
components C-1626                                      float64
components C-1621                                      float64
components C-1620                                      float64
components C-1623                                      float64
components C-1622                                      float64
components C-1743                                      float64
components C-1629                                      float64
components C-1628                                      float64
components C-1745                                      float64
components C-2030                                      float64
components C-0048                                      float64
components C-1650                                      float64
components C-0045                                      float64
components C-1663                                      float64
components C-0599                                      float64
components C-0616                                      float64
components C-1817                                      float64
components C-1956                                      float64
components C-1954                                      float64
components C-1218                                      float64
components C-1889                                      float64
components C-1445                                      float64
components C-1881                                      float64
components C-1880                                      float64
components C-1885                                      float64
components C-1369                                      float64
components C-1541                                      float64
components C-1547                                      float64
components C-1661                                      float64
components C-1660                                      float64
components C-0095                                      float64
components C-0002                                      float64
components C-0003                                      float64
components C-0001                                      float64
components C-0007                                      float64
components C-0004                                      float64
components C-0165                                      float64
components C-1714                                      float64
components C-1716                                      float64
components C-1711                                      float64
components C-1718                                      float64
components C-1866                                      float64
components C-2043                                      float64
components C-1781                                      float64
components C-1715                                      float64
components C-0434                                      float64
components C-0539                                      float64
components C-0544                                      float64
components C-0548                                      float64
components C-1963                                      float64
components C-1848                                      float64
components C-1845                                      float64
components C-1846                                      float64
components C-1244                                      float64
components C-1243                                      float64
components C-1242                                      float64
components C-0579                                      float64
components C-0102                                      float64
components C-1459                                      float64
components C-0120                                      float64
components C-1183                                      float64
components C-1439                                      float64
components C-1430                                      float64
components C-1434                                      float64
components C-1435                                      float64
components C-1758                                      float64
components C-1614                                      float64
components C-1615                                      float64
components C-1750                                      float64
components C-2008                                      float64
components C-2004                                      float64
components C-2005                                      float64
components C-2006                                      float64
components C-2001                                      float64
components C-2002                                      float64
components C-2003                                      float64
components C-0699                                      float64
components C-0751                                      float64
components C-1505                                      float64
components C-2032                                      float64
components C-0058                                      float64
components C-1502                                      float64
components C-0057                                      float64
components C-0199                                      float64
components C-0052                                      float64
components C-0674                                      float64
components C-1577                                      float64
components C-0826                                      float64
components C-0823                                      float64
components C-1873                                      float64
components C-0580                                      float64
components C-0401                                      float64
components C-0275                                      float64
components C-1808                                      float64
components C-0409                                      float64
components C-1209                                      float64
components C-1208                                      float64
components C-1203                                      float64
components C-1200                                      float64
components C-1206                                      float64
components C-1205                                      float64
components C-0473                                      float64
components C-1386                                      float64
components C-1385                                      float64
components C-1936                                      float64
components C-1375                                      float64
components C-1374                                      float64
components C-1373                                      float64
components C-1475                                      float64
components C-1476                                      float64
components C-1477                                      float64
components C-1555                                      float64
components C-1554                                      float64
components C-0389                                      float64
components C-0388                                      float64
components C-1724                                      float64
components C-1725                                      float64
components C-1727                                      float64
components C-1728                                      float64
components C-0333                                      float64
components C-1654                                      float64
components C-1655                                      float64
components C-1651                                      float64
components C-1652                                      float64
components C-1658                                      float64
components C-1659                                      float64
components C-0703                                      float64
components C-0448                                      float64
components C-0449                                      float64
components C-0520                                      float64
components C-0444                                      float64
components C-0445                                      float64
components C-0208                                      float64
components C-0209                                      float64
components C-1970                                      float64
components C-1976                                      float64
components C-1877                                      float64
components C-1233                                      float64
components C-1230                                      float64
components C-1231                                      float64
components C-1235                                      float64
components C-1332                                      float64
components C-0227                                      float64
components C-1425                                      float64
components C-1421                                      float64
components C-1420                                      float64
components C-1586                                      float64
components C-1428                                      float64
components C-1348                                      float64
components C-1349                                      float64
components C-1344                                      float64
components C-1345                                      float64
components C-1343                                      float64
components C-1565                                      float64
components C-2019                                      float64
components C-2017                                      float64
components C-1630                                      float64
components C-0062                                      float64
components C-0562                                      float64
components C-0244                                      float64
components C-1761                                      float64
components C-0369                                      float64
components C-1768                                      float64
components C-0844                                      float64
components C-1841                                      float64
components C-1398                                      float64
components C-1908                                      float64
components C-1909                                      float64
components C-1901                                      float64
components C-1906                                      float64
components C-1907                                      float64
components C-1998                                      float64
components C-1995                                      float64
components C-1994                                      float64
components C-1417                                      float64
components C-1411                                      float64
components C-1637                                      float64
components C-1635                                      float64
components C-1632                                      float64
components C-1633                                      float64
components C-1739                                      float64
components C-1631                                      float64
components C-1638                                      float64
components C-1639                                      float64
components C-2026                                      float64
components C-2027                                      float64
components C-2028                                      float64
components C-2029                                      float64
components C-1643                                      float64
components C-1642                                      float64
components C-1641                                      float64
components C-1640                                      float64
components C-1647                                      float64
components C-1646                                      float64
components C-1645                                      float64
components C-1644                                      float64
components C-1649                                      float64
components C-1648                                      float64
bracketing_pattern XXX_other                           float64
bracketing_pattern (10, 15, 20, 25, 30)                float64
bracketing_pattern (5, 10, 20, 50, 100)                float64
bracketing_pattern (25, 50, 75, 100, 290, 325, 350)    float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50)            float64
bracketing_pattern (1, 3, 5, 7, 10)                    float64
bracketing_pattern (1, 2, 3, 4, 6)                     float64
bracketing_pattern (5, 19, 20)                         float64
bracketing_pattern (1, 3, 5, 7, 9)                     float64
bracketing_pattern (1, 15)                             float64
bracketing_pattern (10, 15, 20, 30)                    float64
bracketing_pattern (1, 3, 5, 10, 25)                   float64
bracketing_pattern (30, 60, 90, 120)                   float64
bracketing_pattern (5, 10)                             float64
bracketing_pattern (1, 2, 4, 9, 19)                    float64
bracketing_pattern (5, 20)                             float64
bracketing_pattern (1, 3, 5, 10, 20)                   float64
bracketing_pattern (2, 3, 4, 6)                        float64
bracketing_pattern (15, 25, 35)                        float64
bracketing_pattern (1, 5, 10, 20)                      float64
bracketing_pattern (1, 5, 10, 20, 50)                  float64
bracketing_pattern (1, 3, 5, 10, 15, 25)               float64
bracketing_pattern (25, 50, 75, 100)                   float64
bracketing_pattern (3, 5, 7, 9)                        float64
bracketing_pattern (1, 2, 5, 10, 25, 50, 100, 250)     float64
bracketing_pattern (1, 6)                              float64
bracketing_pattern (2, 5)                              float64
bracketing_pattern (1, 2, 3, 5, 10)                    float64
bracketing_pattern (1, 3, 5)                           float64
bracketing_pattern (1, 2, 5, 10)                       float64
bracketing_pattern (1, 10)                             float64
bracketing_pattern (1, 2, 3, 5, 7)                     float64
bracketing_pattern (1, 3)                              float64
bracketing_pattern (5, 10, 15)                         float64
bracketing_pattern (6, 12, 18, 24)                     float64
bracketing_pattern (10, 15, 20)                        float64
bracketing_pattern (1, 4)                              float64
bracketing_pattern ()                                  float64
bracketing_pattern (1, 3, 5, 10, 15)                   float64
bracketing_pattern (1, 8)                              float64
bracketing_pattern (10, 20, 30, 40)                    float64
bracketing_pattern (5, 10, 15, 20, 25)                 float64
bracketing_pattern (10, 15, 20, 25)                    float64
bracketing_pattern (50, 50)                            float64
bracketing_pattern (20, 40, 60, 80)                    float64
bracketing_pattern (4, 10)                             float64
bracketing_pattern (1, 2, 4)                           float64
bracketing_pattern (10, 25, 40, 55, 70)                float64
bracketing_pattern (5, 10, 25)                         float64
bracketing_pattern (1, 2, 3, 4, 5)                     float64
bracketing_pattern (2, 10, 25, 50, 100)                float64
bracketing_pattern (1, 20, 50)                         float64
bracketing_pattern (1, 2, 3, 5, 10, 20)                float64
bracketing_pattern (3, 5, 10)                          float64
bracketing_pattern (1, 2, 4, 8, 16)                    float64
bracketing_pattern (2, 3, 4, 5)                        float64
bracketing_pattern (1, 5, 10, 15, 20)                  float64
bracketing_pattern (1, 2, 5)                           float64
bracketing_pattern (8, 16, 24, 32)                     float64
bracketing_pattern (2, 4, 6, 8)                        float64
bracketing_pattern (1, 2, 5, 10, 25, 50)               float64
bracketing_pattern (1, 2, 3, 4)                        float64
bracketing_pattern (5, 10, 15, 20)                     float64
bracketing_pattern (4, 6, 8, 10)                       float64
bracketing_pattern (1, 12)                             float64
bracketing_pattern (4, 15)                             float64
bracketing_pattern (2, 4, 6, 8, 10)                    float64
bracketing_pattern (2, 3, 4)                           float64
bracketing_pattern (1, 3, 5, 10)                       float64
bracketing_pattern (8, 16, 24, 32, 48)                 float64
bracketing_pattern (1, 5)                              float64
bracketing_pattern (3, 4, 5, 6)                        float64
bracketing_pattern (1, 3, 5, 10, 20, 30)               float64
bracketing_pattern (1, 2, 5, 10, 25)                   float64
bracketing_pattern (1, 30)                             float64
bracketing_pattern (1, 2, 5, 10, 25, 50, 100)          float64
bracketing_pattern (1, 6, 20)                          float64
bracketing_pattern (3, 6, 9, 12)                       float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50, 100)       float64
bracketing_pattern (3, 5, 10, 20)                      float64
bracketing_pattern (10, 20, 30)                        float64
bracketing_pattern (1, 2, 5, 25, 35)                   float64
bracketing_pattern (1, 2)                              float64
bracketing_pattern (5, 10, 20)                         float64
bracketing_pattern (4, 8, 12, 16)                      float64
bracketing_pattern (10, 15, 25)                        float64
ends XXX_other                                         float64
ends EF-005                                            float64
ends NONE                                              float64
ends EF-001                                            float64
ends EF-002                                            float64
ends EF-003                                            float64
ends EF-008                                            float64
ends EF-009                                            float64
ends EF-023                                            float64
ends EF-021                                            float64
ends EF-006                                            float64
ends EF-013                                            float64
ends EF-012                                            float64
ends EF-011                                            float64
ends EF-010                                            float64
ends EF-017                                            float64
ends EF-016                                            float64
ends EF-015                                            float64
ends EF-019                                            float64
ends EF-018                                            float64
component_groups XXX_other                             float64
component_groups threaded                              float64
component_groups sleeve                                float64
component_groups adaptor                               float64
component_groups nut                                   float64
component_groups float                                 float64
component_groups boss                                  float64
component_groups other                                 float64
component_groups hfl                                   float64
component_groups elbow                                 float64
component_groups straight                              float64
component_types XXX_other                              float64
component_types CP-004                                 float64
component_types CP-006                                 float64
component_types CP-007                                 float64
component_types CP-001                                 float64
component_types CP-002                                 float64
component_types CP-003                                 float64
component_types CP-008                                 float64
component_types CP-009                                 float64
component_types CP-028                                 float64
component_types CP-022                                 float64
component_types CP-023                                 float64
component_types CP-026                                 float64
component_types CP-027                                 float64
component_types CP-024                                 float64
component_types CP-025                                 float64
component_types other                                  float64
component_types CP-012                                 float64
component_types CP-011                                 float64
component_types CP-010                                 float64
component_types CP-016                                 float64
component_types CP-015                                 float64
component_types CP-014                                 float64
component_types CP-019                                 float64
component_types CP-018                                 float64
component_end_forms XXX_other                          float64
component_end_forms A-007                              float64
component_end_forms A-006                              float64
component_end_forms A-005                              float64
component_end_forms A-004                              float64
component_end_forms A-003                              float64
component_end_forms A-002                              float64
component_end_forms A-001                              float64
component_end_forms 9999                               float64
component_connection_types XXX_other                   float64
component_connection_types 9999                        float64
component_connection_types B-012                       float64
component_connection_types B-011                       float64
component_connection_types B-004                       float64
component_connection_types B-005                       float64
component_connection_types B-006                       float64
component_connection_types B-007                       float64
component_connection_types B-001                       float64
component_connection_types B-002                       float64
component_part_names XXX_other                         float64
component_part_names ADAPTER                           float64
component_part_names LINK                              float64
component_part_names ELBOW                             float64
component_part_names CONNECTOR-WELD                    float64
component_part_names SLEEVE-CRIMP                      float64
component_part_names FITTING-NUT                       float64
component_part_names HEAD-FLANGED                      float64
component_part_names WASHER-FUEL INJ                   float64
component_part_names SLEEVE-FLARED                     float64
component_part_names CONNECTOR-BHD                     float64
component_part_names BOSS                              float64
component_part_names NUT-ORFS                          float64
component_part_names CAP-A/C                           float64
component_part_names PLATE                             float64
component_part_names NUT-A/C                           float64
component_part_names ADAPTER-OIL LIN                   float64
component_part_names NUT-FUEL LINE                     float64
component_part_names FLANGE                            float64
component_part_names WASHER-FUEL LIN                   float64
component_part_names BLOCK                             float64
component_part_names NUT-FUEL INJ                      float64
component_part_names TUBE                              float64
component_part_names NUT-FLARED                        float64
component_part_names STUD-WELD                         float64
component_part_names SEAL-O-RING-ORFS                  float64
component_part_names CLIP                              float64
component_part_names BRACKET                           float64
component_part_names VALVE AS.-A/C                     float64
component_part_names LUG                               float64
component_part_names NUT                               float64
component_part_names PIPE                              float64
component_part_names FITTING                           float64
component_part_names NUT-FITTING                       float64
component_part_names ADAPTER-A/C                       float64
component_part_names NUT-SWIVEL                        float64
component_part_names NUT-INJ LINE                      float64
component_part_names ADAPTER-EXH PIP                   float64
component_part_names COLLAR                            float64
component_part_names SLEEVE-FITTING                    float64
component_part_names SLEEVE                            float64
component_part_names TUBE AS                           float64
component_part_names NUT-WELD                          float64
component_part_names SEAL-O-RING                       float64
component_part_names WASHER                            float64
dtypes: bool(8), float64(580), int64(11)
memory usage: 123.4 MB

In [ ]:
# Experiment: remove components in the 'straight' group.

assert False

cids_to_remove = set(cinfo_df.component_id[cinfo_df.component_group_id == 'straight'])
print len(cids_to_remove)

for col in list(X_train_feats.columns):
    if col.startswith('components '):
        cid = col[len('components '):]
        if cid in cids_to_remove:
            print "popping", col
            X_train_feats.pop(col)
            X_test_feats.pop(col)

X_train_feats.info(verbose=True)

X_train_feats.shape, X_test_feats.shape

In [122]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape


(27270, 599) (152, 599) (27270,) (152,)

In [123]:
import xgboost as xgb

# Wrap the arrays in xgboost DMatrix objects; only the training matrix carries labels.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

# Booster settings passed to the xgb.train calls below.
params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'silent': 1,
    'max_depth': 8,
}

In [124]:
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle


CPU times: user 2min 39s, sys: 380 ms, total: 2min 39s
Wall time: 1min 37s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 4.96 ms
CPU times: user 44 ms, sys: 0 ns, total: 44 ms
Wall time: 27.6 ms
0.124960740984 0.445949340695

In [62]:
in_test_not_train = all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)]
print len(in_test_not_train)
print in_test_not_train.component_group_id.value_counts()


78
other       50
straight    10
boss         8
threaded     6
elbow        2
adaptor      2
dtype: int64

In [127]:
df = X_test.copy()
df['true_log_cost'] = y_test
df['pred_log_cost'] = y_test_pred
df['err2'] = (df.true_log_cost - df.pred_log_cost) ** 2
print np.sqrt(df.err2.mean())
df.sort('err2', ascending=False, inplace=True)
# df[:10]


0.445949340695

In [131]:
print np.sqrt(df.err2[df.supplier == 'S-0066'].mean())


0.415543115089

In [133]:
print np.sqrt(df.err2[df.bracketing_pattern == (1, 2, 5, 10, 25, 50, 100, 250)].mean())


0.237514483043

In [134]:
print np.sqrt(df.err2[df.bracketing_pattern == ()].mean())


0.57245976886

In [ ]:


In [55]:
# Approach 1: Replace unknown 'straight' components with their nearest known neighbor.

straight = pd.read_csv('straight_vecs.csv')
straight.set_index('component_id', drop=True, inplace=True)
straight_np = straight.astype(np.float).values
print straight.shape
print straight_np.shape

from scipy.cluster.vq import whiten
straight_np_wh = whiten(straight_np)
cid_to_row = {}
for i, cid in enumerate(straight.index):
    cid_to_row[cid] = straight_np_wh[i, :]

unknown_cids = set(in_test_not_train.component_id[in_test_not_train.component_group_id == 'straight'].values)
print unknown_cids

from scipy.spatial.distance import euclidean

cid_to_subst = {}
for cid in unknown_cids:
    cid_row = cid_to_row[cid]
    best_target_cid = None
    best_dist = np.inf
    for target_cid, target_cid_row in cid_to_row.iteritems():
        if target_cid in unknown_cids:
            continue
        dist = euclidean(cid_row, target_cid_row)
        if dist < best_dist:
            best_target_cid = target_cid
            best_dist = dist
    cid_to_subst[cid] = best_target_cid
    print "unknown cid {} mapped to known cid {} with dist {}".format(cid, best_target_cid, best_dist)

cid_to_subst


(361, 26)
(361, 26)
set(['C-0334', 'C-1494', 'C-1999', 'C-0141', 'C-1549', 'C-0621', 'C-1897', 'C-0466', 'C-1785', 'C-0362'])
unknown cid C-0334 mapped to known cid C-0741 with dist 0.424655790434
unknown cid C-1494 mapped to known cid C-1495 with dist 0.0148741682958
unknown cid C-1999 mapped to known cid C-0457 with dist 1.30878282013
unknown cid C-0141 mapped to known cid C-1996 with dist 0.459905470358
unknown cid C-1549 mapped to known cid C-1740 with dist 0.0863014648659
unknown cid C-0621 mapped to known cid C-1900 with dist 1.48704650335
unknown cid C-1897 mapped to known cid C-1344 with dist 0.887588960897
unknown cid C-0466 mapped to known cid C-1433 with dist 0.151038244132
unknown cid C-1785 mapped to known cid C-1329 with dist 0.23612852287
unknown cid C-0362 mapped to known cid C-0038 with dist 0.41829806389
Out[55]:
{'C-0141': 'C-1996',
 'C-0334': 'C-0741',
 'C-0362': 'C-0038',
 'C-0466': 'C-1433',
 'C-0621': 'C-1900',
 'C-1494': 'C-1495',
 'C-1549': 'C-1740',
 'C-1785': 'C-1329',
 'C-1897': 'C-1344',
 'C-1999': 'C-0457'}

In [46]:
# Show the feature rows of one unknown component and its chosen substitute side by side.
cids = ('C-0334', 'C-0741')
pair_mask = straight.index.isin(cids)
straight[pair_mask]


Out[46]:
bolt_pattern_long bolt_pattern_wide head_diameter overall_length thickness groove unique_feature orientation weight MJ-001 MJ-002 MJ-003 MJ-007 MJ-other CP-001 CP-002 CP-003 CP-004 CP-005 CP-006 CP-007 bolt_pattern_long_missing bolt_pattern_wide_missing head_diameter_missing overall_length_missing weight_missing
component_id
C-0334 71.77567 40.841225 47.63 27.906098 9.53 True False False 0.104 0 0 0 0 1 0 0 0 0 0 1 0 True True False True False
C-0741 71.77567 40.841225 50.80 27.906098 8.24 True False False 0.140 0 0 0 0 1 0 0 0 0 0 1 0 True True False True False

In [60]:
X_test_mangled = X_test.copy()
orig_components = X_test_mangled.pop('components')
subst_components = []
for cids in orig_components.values:
    subst_cids = []
    for cid in cids:
        if cid in unknown_cids:
            target_cid = cid_to_subst[cid]
        else:
            target_cid = cid
        subst_cids.append(target_cid)
    subst_components.append(subst_cids)
X_test_mangled['components'] = subst_components

X_test_mangled['orig_components'] = orig_components
print X_test_mangled[['tube_assembly_id', 'components', 'orig_components']][:10]
X_test_mangled.pop('orig_components')
None

# Note that we only make substitutions for 'straight' at the moment,
# so some component lists will remain unchanged...


  tube_assembly_id        components   orig_components
0         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
1         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
2         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
3         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
4         TA-00566  [C-1329, C-1329]  [C-1785, C-1785]
5         TA-00968          [C-1764]          [C-1764]
6         TA-01243          [C-1996]          [C-0141]
7         TA-01243          [C-1996]          [C-0141]
8         TA-01243          [C-1996]          [C-0141]
9         TA-01243          [C-1996]          [C-0141]

In [61]:
print X_test.shape, X_test_mangled.shape
X_test_mangled_feats = featurizer.transform(X_test_mangled)
X_test_mangled_np = X_test_mangled_feats.astype(np.float).values
xgtest_mangled = xgb.DMatrix(X_test_mangled_np)
y_test_mangled_pred = model.predict(xgtest_mangled)
test_mangled_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_mangled_pred))
print test_mangled_rmsle


(152, 50) (152, 50)
0.432077643867

In [110]:
# Experiment: Train on test set, to see if our model can even represent this shit.

num_rounds = 1000
hack_train = xgb.DMatrix(X_test_np, label=y_test_np)
%time model = xgb.train(params.items(), hack_train, num_rounds)
%time y_hack_train_pred = model.predict(hack_train)
hack_train_rmsle = np.sqrt(mean_squared_error(y_test_np, y_hack_train_pred))
print train_rmsle


CPU times: user 1.64 s, sys: 24 ms, total: 1.66 s
Wall time: 1.04 s
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 185 µs
0.123782731596

In [136]:
# How often each bracketing pattern occurs among the test rows.
X_test['bracketing_pattern'].value_counts()


Out[136]:
()                                      51
(1, 2, 5, 10, 25, 50, 100, 250)         40
(15, 30, 50, 60, 100, 200, 300, 400)     8
(1, 6, 20)                               6
(1, 2, 5, 10, 25, 50)                    6
(1, 3, 5, 10, 20, 50)                    6
(2, 5, 10, 15, 20)                       5
(1, 2, 3, 5, 7)                          5
(1, 3, 5, 7, 9)                          5
(1, 2, 3, 5)                             4
(3, 5, 10, 15)                           4
(1, 2, 5)                                3
(10, 20, 50)                             3
(50, 100)                                2
(5, 20)                                  2
(1, 2)                                   2
dtype: int64

In [ ]: