In [124]:
# Setup cell: enable inline plotting, then build the augmented train/test frames
# that the rest of the notebook inspects.
# NOTE(review): `%pylab inline` populates the interactive namespace from numpy and
# matplotlib; later cells rely on that for the bare `plot(...)` call and for `np`.
%pylab inline

import pandas as pd

from soln.dataset import get_augmented_train_and_test_set

# Timed with %time; the recorded run took roughly 15 s of wall time.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
# Report the (rows, columns) shape of each frame, then the train set's column summary.
print aug_train_set.shape
print aug_test_set.shape
aug_train_set.info()


Populating the interactive namespace from numpy and matplotlib
CPU times: user 14.1 s, sys: 332 ms, total: 14.4 s
Wall time: 15.3 s
(30213, 55)
(30235, 53)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30213 entries, 0 to 30212
Data columns (total 55 columns):
tube_assembly_id                   30213 non-null object
supplier                           30213 non-null object
quote_date                         30213 non-null object
annual_usage                       30213 non-null int64
min_order_quantity                 30213 non-null int64
bracket_pricing                    30213 non-null bool
quantity                           30213 non-null int64
log_cost                           30213 non-null float64
material_id                        29984 non-null object
diameter                           30213 non-null float64
wall_thickness                     30213 non-null float64
length                             30213 non-null float64
num_bends                          30213 non-null int64
bend_radius                        30213 non-null float64
end_a_1x                           30213 non-null bool
end_a_2x                           30213 non-null bool
end_x_1x                           30213 non-null bool
end_x_2x                           30213 non-null bool
end_a                              30213 non-null object
end_x                              30213 non-null object
num_boss                           30213 non-null int64
num_bracket                        30213 non-null int64
num_other                          30213 non-null int64
specs                              30213 non-null object
components                         30213 non-null object
quote_age                          30213 non-null float64
adj_quantity                       30213 non-null int64
adj_bracketing                     30213 non-null bool
bracketing_pattern                 30213 non-null object
physical_volume                    30213 non-null float64
inner_radius                       30213 non-null float64
material_volume                    30213 non-null float64
ends                               30213 non-null object
end_a_forming                      30213 non-null bool
end_1x_count                       30213 non-null int64
end_x_forming                      30213 non-null bool
end_2x_count                       30213 non-null int64
end_forming_count                  30213 non-null int64
component_groups                   30213 non-null object
component_types                    30213 non-null object
unique_feature_count               30213 non-null float64
orientation_count                  30213 non-null float64
groove_count                       30213 non-null float64
total_component_weight             30213 non-null float64
component_end_forms                30213 non-null object
component_connection_types         30213 non-null object
component_max_length               30213 non-null float64
component_max_overall_length       30213 non-null float64
component_max_bolt_pattern_wide    30213 non-null float64
component_max_bolt_pattern_long    30213 non-null float64
component_max_thickness            30213 non-null float64
component_min_thread_pitch         30213 non-null float64
component_min_thread_size          30213 non-null float64
component_part_names               30213 non-null object
dev_fold                           30213 non-null int64
dtypes: bool(8), float64(20), int64(12), object(15)
memory usage: 11.3+ MB

In [22]:
# Plot every row's dev_fold value as a dot, in row order, to check that the
# fold assignment is properly shuffled.
plot(aug_train_set['dev_fold'].values, '.')


Out[22]:
[<matplotlib.lines.Line2D at 0xc00c390>]

In [23]:
# Count the rows assigned to each fold; the counts should be roughly equal.
aug_train_set['dev_fold'].value_counts()


Out[23]:
1    3141
3    3121
8    3110
5    3013
9    2997
7    2995
2    2983
6    2965
4    2945
0    2943
dtype: int64

In [27]:
# Check that no taid occurs in more than one fold.
# taid_to_fold remembers the first fold seen for each tube_assembly_id; any row
# whose fold disagrees with that first fold marks the taid as conflicting.
taid_to_fold = {}
multi_fold_taids = set()
arr = aug_train_set[['tube_assembly_id', 'dev_fold']].values
assert arr.shape == (len(aug_train_set), 2)
for taid, fold in arr:
    # setdefault stores the fold on first sight and returns the stored fold afterwards.
    if taid_to_fold.setdefault(taid, fold) != fold:
        multi_fold_taids.add(taid)
# Fail loudly (instead of only printing) so a violation cannot be missed on a full
# re-run; each offending taid is reported once rather than once per row.
assert not multi_fold_taids, "found {} taid(s) with multiple folds, e.g. {}".format(
    len(multi_fold_taids), sorted(multi_fold_taids)[:10])
print("done")


done

In [28]:
# Show rows 500-519 of the assembly id, adjusted quantity and bracketing pattern columns.
bracketing_cols = ['tube_assembly_id', 'adj_quantity', 'bracketing_pattern']
df = aug_train_set[bracketing_cols]
df.iloc[500:520]


Out[28]:
tube_assembly_id adj_quantity bracketing_pattern
500 TA-00222 5 (1, 2, 5, 10, 25, 50, 100, 250)
501 TA-00222 10 (1, 2, 5, 10, 25, 50, 100, 250)
502 TA-00222 25 (1, 2, 5, 10, 25, 50, 100, 250)
503 TA-00222 50 (1, 2, 5, 10, 25, 50, 100, 250)
504 TA-00222 100 (1, 2, 5, 10, 25, 50, 100, 250)
505 TA-00222 250 (1, 2, 5, 10, 25, 50, 100, 250)
506 TA-00228 1 (1, 10)
507 TA-00228 10 (1, 10)
508 TA-00229 5 ()
509 TA-00230 1 ()
510 TA-00239 1 (1, 12)
511 TA-00239 12 (1, 12)
512 TA-00242 1 (1, 6, 20)
513 TA-00242 6 (1, 6, 20)
514 TA-00242 20 (1, 6, 20)
515 TA-00243 1 ()
516 TA-00248 1 ()
517 TA-00249 1 (1, 2, 5, 10, 25, 50, 100, 250)
518 TA-00249 2 (1, 2, 5, 10, 25, 50, 100, 250)
519 TA-00249 5 (1, 2, 5, 10, 25, 50, 100, 250)

In [37]:
# Fold sizes again.
# NOTE(review): the counts recorded below are far less balanced than those recorded
# for the same expression under In [23], so aug_train_set presumably came from a
# different run of the fold assignment here -- confirm which one is current.
aug_train_set['dev_fold'].value_counts()


Out[37]:
9    4695
0    3902
1    3819
2    3769
3    2959
8    2925
4    2781
5    1918
6    1856
7    1589
dtype: int64

In [41]:
# Line plot of dev_fold against the row index.
aug_train_set['dev_fold'].plot()


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0xb638190>

In [44]:
from soln.dataset import generate_xv_splits

# Pull just the first cross-validation split and unpack its four parts.
split = next(generate_xv_splits(aug_train_set))
(X_train, y_train, X_test, y_test) = split
# Print the full frame's shape, then the shape of each part of the split.
print(aug_train_set.shape)
print([part.shape for part in split])


(30213, 30)
[(26311, 28), (26311,), (3902, 28), (3902,)]

In [56]:
from soln.dataset import AllCategoricalsFeaturizer

# Fit the categorical featurizer on the training part of the split, then transform
# that same part; both steps are timed with %time.
featurizer = AllCategoricalsFeaturizer(keep_orig_feats=True)
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
# verbose=True asks info() for the full per-column listing.
# NOTE(review): with keep_orig_feats=True the recorded output still lists the original
# columns alongside the generated "<feature> <value>" columns -- presumably that is
# what the flag controls; confirm in soln.dataset.
X_train_feats.info(verbose=True)


CPU times: user 452 ms, sys: 0 ns, total: 452 ms
Wall time: 470 ms
CPU times: user 644 ms, sys: 200 ms, total: 844 ms
Wall time: 864 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26311 entries, 0 to 26310
Data columns (total 448 columns):
tube_assembly_id                                      object
supplier                                              object
quote_date                                            object
annual_usage                                          int64
min_order_quantity                                    int64
bracket_pricing                                       bool
quantity                                              int64
material_id                                           object
diameter                                              float64
wall_thickness                                        float64
length                                                float64
num_bends                                             int64
bend_radius                                           float64
end_a_1x                                              bool
end_a_2x                                              bool
end_x_1x                                              bool
end_x_2x                                              bool
end_a                                                 object
end_x                                                 object
num_boss                                              int64
num_bracket                                           int64
num_other                                             int64
specs                                                 object
components                                            object
quote_age                                             float64
adj_quantity                                          int64
adj_bracketing                                        bool
bracketing_pattern                                    object
supplier other                                        float64
supplier S-0042                                       float64
supplier S-0005                                       float64
supplier S-0026                                       float64
supplier S-0027                                       float64
supplier S-0072                                       float64
supplier S-0062                                       float64
supplier S-0064                                       float64
supplier S-0043                                       float64
supplier S-0066                                       float64
supplier S-0041                                       float64
supplier S-0105                                       float64
supplier S-0080                                       float64
supplier S-0081                                       float64
supplier S-0104                                       float64
supplier S-0013                                       float64
supplier S-0014                                       float64
supplier S-0070                                       float64
supplier S-0031                                       float64
supplier S-0030                                       float64
supplier S-0058                                       float64
supplier S-0054                                       float64
supplier S-0092                                       float64
material_id other                                     float64
material_id nan                                       float64
material_id SP-0046                                   float64
material_id SP-0041                                   float64
material_id SP-0033                                   float64
material_id SP-0048                                   float64
material_id SP-0034                                   float64
material_id SP-0035                                   float64
material_id SP-0036                                   float64
material_id SP-0037                                   float64
material_id SP-0030                                   float64
material_id SP-0019                                   float64
material_id SP-0008                                   float64
material_id SP-0038                                   float64
material_id SP-0039                                   float64
material_id SP-0029                                   float64
material_id SP-0028                                   float64
end_a other                                           float64
end_a EF-005                                          float64
end_a NONE                                            float64
end_a EF-002                                          float64
end_a EF-003                                          float64
end_a EF-008                                          float64
end_a EF-009                                          float64
end_a EF-023                                          float64
end_a EF-021                                          float64
end_a EF-012                                          float64
end_a EF-017                                          float64
end_a EF-016                                          float64
end_a EF-015                                          float64
end_a EF-019                                          float64
end_a EF-018                                          float64
end_x other                                           float64
end_x EF-005                                          float64
end_x NONE                                            float64
end_x EF-002                                          float64
end_x EF-003                                          float64
end_x EF-008                                          float64
end_x EF-009                                          float64
end_x EF-023                                          float64
end_x EF-021                                          float64
end_x EF-006                                          float64
end_x EF-012                                          float64
end_x EF-010                                          float64
end_x EF-017                                          float64
end_x EF-015                                          float64
end_x EF-019                                          float64
end_x EF-018                                          float64
specs other                                           float64
specs SP-0065                                         float64
specs SP-0050                                         float64
specs SP-0057                                         float64
specs SP-0009                                         float64
specs SP-0058                                         float64
specs SP-0025                                         float64
specs SP-0079                                         float64
specs SP-0002                                         float64
specs SP-0068                                         float64
specs SP-0070                                         float64
specs SP-0017                                         float64
specs SP-0072                                         float64
specs SP-0012                                         float64
specs SP-0013                                         float64
specs SP-0076                                         float64
specs SP-0022                                         float64
specs SP-0021                                         float64
specs SP-0016                                         float64
specs SP-0071                                         float64
specs SP-0088                                         float64
specs SP-0080                                         float64
specs SP-0082                                         float64
specs SP-0010                                         float64
specs SP-0075                                         float64
specs SP-0026                                         float64
specs SP-0069                                         float64
specs SP-0024                                         float64
specs SP-0005                                         float64
specs SP-0004                                         float64
specs SP-0007                                         float64
specs SP-0063                                         float64
specs SP-0062                                         float64
specs SP-0061                                         float64
specs SP-0067                                         float64
specs SP-0029                                         float64
components other                                      float64
components C-1653                                     float64
components C-0218                                     float64
components C-0217                                     float64
components C-0215                                     float64
components C-0214                                     float64
components C-0211                                     float64
components C-0210                                     float64
components C-1867                                     float64
components C-1866                                     float64
components C-1860                                     float64
components C-1554                                     float64
components C-1869                                     float64
components C-1229                                     float64
components C-1898                                     float64
components C-0102                                     float64
components C-1359                                     float64
components C-1355                                     float64
components C-1354                                     float64
components C-1352                                     float64
components C-1577                                     float64
components C-1677                                     float64
components C-0122                                     float64
components C-0071                                     float64
components C-1017                                     float64
components C-1578                                     float64
components C-0250                                     float64
components C-0318                                     float64
components C-1779                                     float64
components C-0422                                     float64
components C-1821                                     float64
components C-0855                                     float64
components C-0227                                     float64
components C-0550                                     float64
components C-0494                                     float64
components C-0228                                     float64
components C-1910                                     float64
components C-1850                                     float64
components C-1313                                     float64
components C-1312                                     float64
components C-1619                                     float64
components C-1538                                     float64
components C-0577                                     float64
components C-1533                                     float64
components C-1536                                     float64
components C-0133                                     float64
components C-0134                                     float64
components C-1405                                     float64
components C-1625                                     float64
components C-1624                                     float64
components C-1627                                     float64
components C-1626                                     float64
components C-1621                                     float64
components C-1620                                     float64
components C-1623                                     float64
components C-1622                                     float64
components C-1743                                     float64
components C-1629                                     float64
components C-1628                                     float64
components C-1745                                     float64
components C-2030                                     float64
components C-1547                                     float64
components C-0048                                     float64
components C-1660                                     float64
components C-1663                                     float64
components C-0599                                     float64
components C-0616                                     float64
components C-1817                                     float64
components C-0448                                     float64
components C-1954                                     float64
components C-1218                                     float64
components C-1889                                     float64
components C-1445                                     float64
components C-1881                                     float64
components C-1880                                     float64
components C-1885                                     float64
components C-0199                                     float64
components C-1369                                     float64
components C-1059                                     float64
components C-1661                                     float64
components C-0095                                     float64
components C-0002                                     float64
components C-0003                                     float64
components C-0001                                     float64
components C-0007                                     float64
components C-0004                                     float64
components C-0165                                     float64
components C-1714                                     float64
components C-1716                                     float64
components C-1711                                     float64
components C-1718                                     float64
components C-1781                                     float64
components C-1715                                     float64
components C-0434                                     float64
components C-0539                                     float64
components C-0544                                     float64
components C-0548                                     float64
components C-1963                                     float64
components C-0679                                     float64
components C-1848                                     float64
components C-1845                                     float64
components C-1846                                     float64
components C-1244                                     float64
components C-1243                                     float64
components C-1242                                     float64
components C-0579                                     float64
components C-1902                                     float64
components C-0120                                     float64
components C-1183                                     float64
components C-1439                                     float64
components C-1430                                     float64
components C-1435                                     float64
components C-1758                                     float64
components C-1614                                     float64
components C-1615                                     float64
components C-1750                                     float64
components C-2008                                     float64
components C-2004                                     float64
components C-2005                                     float64
components C-2006                                     float64
components C-2001                                     float64
components C-2002                                     float64
components C-2003                                     float64
components C-0699                                     float64
components C-0751                                     float64
components C-2032                                     float64
components C-0058                                     float64
components C-1502                                     float64
components C-0051                                     float64
components C-1058                                     float64
components C-0052                                     float64
components C-0674                                     float64
components C-0826                                     float64
components C-0823                                     float64
components C-0580                                     float64
components C-0275                                     float64
components C-0401                                     float64
components C-1808                                     float64
components C-0409                                     float64
components C-1209                                     float64
components C-1203                                     float64
components C-1200                                     float64
components C-1206                                     float64
components C-1205                                     float64
components C-0345                                     float64
components C-1386                                     float64
components C-1385                                     float64
components C-1936                                     float64
components C-1375                                     float64
components C-1374                                     float64
components C-1373                                     float64
components C-1475                                     float64
components C-1476                                     float64
components C-1555                                     float64
components C-0389                                     float64
components C-0388                                     float64
components C-1988                                     float64
components C-1723                                     float64
components C-1727                                     float64
components C-1728                                     float64
components C-0333                                     float64
components C-1654                                     float64
components C-1655                                     float64
components C-1650                                     float64
components C-1651                                     float64
components C-1652                                     float64
components C-1658                                     float64
components C-0703                                     float64
components C-0449                                     float64
components C-0520                                     float64
components C-0444                                     float64
components C-0445                                     float64
components C-0208                                     float64
components C-0209                                     float64
components C-1970                                     float64
components C-1976                                     float64
components C-1873                                     float64
components C-1233                                     float64
components C-1230                                     float64
components C-1235                                     float64
components C-1332                                     float64
components C-1425                                     float64
components C-1421                                     float64
components C-1420                                     float64
components C-1428                                     float64
components C-1348                                     float64
components C-1349                                     float64
components C-1344                                     float64
components C-1345                                     float64
components C-1343                                     float64
components C-1565                                     float64
components C-2019                                     float64
components C-1630                                     float64
components C-0562                                     float64
components C-0244                                     float64
components C-1761                                     float64
components C-0369                                     float64
components C-1768                                     float64
components C-0844                                     float64
components C-1841                                     float64
components C-1398                                     float64
components C-1908                                     float64
components C-1909                                     float64
components C-1906                                     float64
components C-1907                                     float64
components C-1998                                     float64
components C-1995                                     float64
components C-1994                                     float64
components C-1411                                     float64
components C-1637                                     float64
components C-1635                                     float64
components C-1632                                     float64
components C-1633                                     float64
components C-1739                                     float64
components C-1631                                     float64
components C-1638                                     float64
components C-1639                                     float64
components C-2026                                     float64
components C-2027                                     float64
components C-2028                                     float64
components C-2029                                     float64
components C-1643                                     float64
components C-1642                                     float64
components C-1641                                     float64
components C-1640                                     float64
components C-1647                                     float64
components C-1646                                     float64
components C-1645                                     float64
components C-1644                                     float64
components C-1649                                     float64
components C-1648                                     float64
components C-1672                                     float64
bracketing_pattern other                              float64
bracketing_pattern (10, 15, 20, 25, 30)               float64
bracketing_pattern (5, 10, 20, 50, 100)               float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50)           float64
bracketing_pattern (1, 3, 5, 7, 10)                   float64
bracketing_pattern (1, 2, 3, 4, 6)                    float64
bracketing_pattern (5, 19, 20)                        float64
bracketing_pattern (1, 3, 5, 7, 9)                    float64
bracketing_pattern (10, 15, 20, 30)                   float64
bracketing_pattern (1, 3, 5, 10, 25)                  float64
bracketing_pattern (5, 10)                            float64
bracketing_pattern (1, 2, 4, 9, 19)                   float64
bracketing_pattern (5, 20)                            float64
bracketing_pattern (25, 50, 75, 100)                  float64
bracketing_pattern (1, 3, 5, 10, 20)                  float64
bracketing_pattern (2, 3, 4, 6)                       float64
bracketing_pattern (15, 25, 35)                       float64
bracketing_pattern (1, 5, 10, 20)                     float64
bracketing_pattern (1, 3, 5, 10, 15, 25)              float64
bracketing_pattern (17, 30)                           float64
bracketing_pattern (1, 2, 5, 10, 25, 50, 100, 250)    float64
bracketing_pattern (1, 2, 4)                          float64
bracketing_pattern (1, 6)                             float64
bracketing_pattern (1, 2, 3, 5, 10)                   float64
bracketing_pattern (1, 3, 5)                          float64
bracketing_pattern (3, 4, 5, 6)                       float64
bracketing_pattern (1, 10)                            float64
bracketing_pattern (1, 2, 3, 5, 7)                    float64
bracketing_pattern (1, 3)                             float64
bracketing_pattern (5, 10, 15)                        float64
bracketing_pattern (6, 12, 18, 24)                    float64
bracketing_pattern (3, 5, 7, 9)                       float64
bracketing_pattern (10, 15, 20)                       float64
bracketing_pattern (1, 4)                             float64
bracketing_pattern (1, 15)                            float64
bracketing_pattern (1, 2, 3, 5)                       float64
bracketing_pattern (1, 60, 125, 150, 200)             float64
bracketing_pattern ()                                 float64
bracketing_pattern (1, 3, 5, 10, 15)                  float64
bracketing_pattern (1, 8)                             float64
bracketing_pattern (10, 20, 30, 40)                   float64
bracketing_pattern (5, 10, 15, 20, 25)                float64
bracketing_pattern (50, 50)                           float64
bracketing_pattern (20, 40, 60, 80)                   float64
bracketing_pattern (4, 10)                            float64
bracketing_pattern (10, 25, 40, 55, 70)               float64
bracketing_pattern (5, 10, 25)                        float64
bracketing_pattern (1, 2, 3, 4, 5)                    float64
bracketing_pattern (1, 20, 50)                        float64
bracketing_pattern (1, 2, 3, 5, 10, 20)               float64
bracketing_pattern (3, 5, 10)                         float64
bracketing_pattern (1, 2, 4, 8, 16)                   float64
bracketing_pattern (2, 3, 4, 5)                       float64
bracketing_pattern (1, 5, 10, 15, 20)                 float64
bracketing_pattern (1, 2, 5)                          float64
bracketing_pattern (8, 16, 24, 32)                    float64
bracketing_pattern (2, 4, 6, 8)                       float64
bracketing_pattern (1, 2, 5, 10, 25, 50)              float64
bracketing_pattern (1, 2, 3, 4)                       float64
bracketing_pattern (5, 10, 15, 20)                    float64
bracketing_pattern (4, 15)                            float64
bracketing_pattern (4, 6, 8, 12)                      float64
bracketing_pattern (2, 4, 6, 8, 10)                   float64
bracketing_pattern (1, 3, 5, 10)                      float64
bracketing_pattern (8, 16, 24, 32, 48)                float64
bracketing_pattern (1, 5)                             float64
bracketing_pattern (1, 3, 5, 10, 20, 30)              float64
bracketing_pattern (1, 2, 5, 10, 25)                  float64
bracketing_pattern (1, 30)                            float64
bracketing_pattern (1, 2, 5, 10, 25, 50, 100)         float64
bracketing_pattern (1, 6, 20)                         float64
bracketing_pattern (3, 6, 9, 12)                      float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50, 100)      float64
bracketing_pattern (3, 5, 10, 20)                     float64
bracketing_pattern (10, 20, 30)                       float64
bracketing_pattern (1, 2, 5, 25, 35)                  float64
bracketing_pattern (1, 2)                             float64
bracketing_pattern (5, 10, 20)                        float64
bracketing_pattern (4, 8, 12, 16)                     float64
bracketing_pattern (10, 15, 25)                       float64
bracketing_pattern (2, 3, 5, 10, 20, 50, 100)         float64
dtypes: bool(6), float64(425), int64(8), object(9)
memory usage: 89.1+ MB

In [57]:
# Value distribution of one generated component column.
X_train_feats.loc[:, 'components C-1312'].value_counts()


Out[57]:
0    23201
2     3024
1       86
dtype: int64

In [34]:
# Verify that the 'id' column in test_set.csv is equal to the row number (1-based).
from soln.dataset import load_raw_data
raw = load_raw_data()
id_col = raw['test_set']['id'].values
# Derive the expected ids from the column's own length instead of hard-coding the row
# count, and compare with array_equal so a length mismatch yields False rather than a
# shape-mismatched elementwise comparison.
np.array_equal(id_col, np.arange(1, len(id_col) + 1))


Out[34]:
True

In [102]:
# Sanity-check the 'ends' feature: view it next to end_a / end_x for the
# last ten rows of the training set.
df = aug_train_set[[
    'tube_assembly_id',
    'end_a',
    'end_x',
    'ends',
]]
df.tail(10)


Out[102]:
tube_assembly_id end_a end_x ends
30203 TA-21189 EF-018 NONE (EF-018, NONE)
30204 TA-21189 EF-018 NONE (EF-018, NONE)
30205 TA-21189 EF-018 NONE (EF-018, NONE)
30206 TA-21189 EF-018 NONE (EF-018, NONE)
30207 TA-21189 EF-018 NONE (EF-018, NONE)
30208 TA-21190 EF-003 EF-018 (EF-003, EF-018)
30209 TA-21191 EF-003 EF-018 (EF-003, EF-018)
30210 TA-21195 EF-003 EF-009 (EF-003, EF-009)
30211 TA-21196 EF-003 EF-017 (EF-003, EF-017)
30212 TA-21197 EF-003 EF-017 (EF-003, EF-017)

In [103]:
# Distribution of the end_1x_count and end_2x_count features over the
# training set (one value_counts table per feature).
print(aug_train_set['end_1x_count'].value_counts())
print(aug_train_set['end_2x_count'].value_counts())


0    29162
1      787
2      264
dtype: int64
0    26513
1     2786
2      914
dtype: int64

In [104]:
# Sanity-check the forming features: show the per-end forming flags and the
# end_forming_count column alongside the end types for ten sample rows.
df = aug_train_set[[
    'tube_assembly_id',
    'end_a',
    'end_x',
    'end_a_forming',
    'end_x_forming',
    'end_forming_count',
]]
df.iloc[30183:30193]


Out[104]:
tube_assembly_id end_a end_x end_a_forming end_x_forming end_forming_count
30183 TA-21164 EF-022 EF-023 True False 1
30184 TA-21165 EF-023 EF-023 False False 0
30185 TA-21166 EF-022 EF-023 True False 1
30186 TA-21167 EF-023 EF-023 False False 0
30187 TA-21171 EF-009 EF-009 True True 2
30188 TA-21175 EF-009 NONE True False 1
30189 TA-21176 EF-018 EF-018 True True 2
30190 TA-21177 EF-018 EF-018 True True 2
30191 TA-21179 EF-009 EF-009 True True 2
30192 TA-21180 EF-018 EF-003 True False 1

In [107]:
# Sanity-check component_groups and component_types: show them beside the
# raw components list for rows 10-19.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'component_groups',
    'component_types',
]]
df.iloc[10:20]


Out[107]:
tube_assembly_id components component_groups component_types
10 TA-00004 [C-1312, C-1312] [adaptor, adaptor] [CP-028, CP-028]
11 TA-00004 [C-1312, C-1312] [adaptor, adaptor] [CP-028, CP-028]
12 TA-00004 [C-1312, C-1312] [adaptor, adaptor] [CP-028, CP-028]
13 TA-00004 [C-1312, C-1312] [adaptor, adaptor] [CP-028, CP-028]
14 TA-00004 [C-1312, C-1312] [adaptor, adaptor] [CP-028, CP-028]
15 TA-00004 [C-1312, C-1312] [adaptor, adaptor] [CP-028, CP-028]
16 TA-00005 [C-1624, C-1631, C-1641] [nut, sleeve, threaded] [CP-025, CP-024, CP-014]
17 TA-00005 [C-1624, C-1631, C-1641] [nut, sleeve, threaded] [CP-025, CP-024, CP-014]
18 TA-00005 [C-1624, C-1631, C-1641] [nut, sleeve, threaded] [CP-025, CP-024, CP-014]
19 TA-00005 [C-1624, C-1631, C-1641] [nut, sleeve, threaded] [CP-025, CP-024, CP-014]

In [111]:
# Check the unique_feature_count, orientation_count and groove_count
# features by printing how often each distinct value occurs.
df = aug_train_set
print(df['unique_feature_count'].value_counts())
print(df['orientation_count'].value_counts())
print(df['groove_count'].value_counts())


0    14724
2     9185
1     6187
3       91
4       26
dtype: int64
0    27500
1     2128
2      493
3       61
4       24
5        7
dtype: int64
0    29356
1      790
2       67
dtype: int64

In [112]:
# Sanity-check total_component_weight: show it beside the components list
# for rows 10-19.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'total_component_weight',
]]
df.iloc[10:20]


Out[112]:
tube_assembly_id components total_component_weight
10 TA-00004 [C-1312, C-1312] 0.018
11 TA-00004 [C-1312, C-1312] 0.018
12 TA-00004 [C-1312, C-1312] 0.018
13 TA-00004 [C-1312, C-1312] 0.018
14 TA-00004 [C-1312, C-1312] 0.018
15 TA-00004 [C-1312, C-1312] 0.018
16 TA-00005 [C-1624, C-1631, C-1641] 0.210
17 TA-00005 [C-1624, C-1631, C-1641] 0.210
18 TA-00005 [C-1624, C-1631, C-1641] 0.210
19 TA-00005 [C-1624, C-1631, C-1641] 0.210

In [113]:
# Sanity-check component_end_forms: show it beside the components list
# for rows 10-19.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'component_end_forms',
]]
df.iloc[10:20]


Out[113]:
tube_assembly_id components component_end_forms
10 TA-00004 [C-1312, C-1312] [A-002, 9999, A-002, 9999]
11 TA-00004 [C-1312, C-1312] [A-002, 9999, A-002, 9999]
12 TA-00004 [C-1312, C-1312] [A-002, 9999, A-002, 9999]
13 TA-00004 [C-1312, C-1312] [A-002, 9999, A-002, 9999]
14 TA-00004 [C-1312, C-1312] [A-002, 9999, A-002, 9999]
15 TA-00004 [C-1312, C-1312] [A-002, 9999, A-002, 9999]
16 TA-00005 [C-1624, C-1631, C-1641] [A-001, A-004]
17 TA-00005 [C-1624, C-1631, C-1641] [A-001, A-004]
18 TA-00005 [C-1624, C-1631, C-1641] [A-001, A-004]
19 TA-00005 [C-1624, C-1631, C-1641] [A-001, A-004]

In [118]:
# Sanity-check component_part_names: show it beside the components list
# for ten rows near the end of the training set.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'component_part_names',
]]
df.iloc[30180:30190]


Out[118]:
tube_assembly_id components component_part_names
30180 TA-21163 [C-1420, C-1420, C-1421, C-1421] ()
30181 TA-21163 [C-1420, C-1420, C-1421, C-1421] ()
30182 TA-21163 [C-1420, C-1420, C-1421, C-1421] ()
30183 TA-21164 [C-2044] ()
30184 TA-21165 [C-1917, C-1917, C-2045, C-2046] (BLOCK, BLOCK, ADAPTER-INLET, SLEEVE)
30185 TA-21166 [C-2044] ()
30186 TA-21167 [C-1534, C-2046] (SLEEVE,)
30187 TA-21171 [] ()
30188 TA-21175 [] ()
30189 TA-21176 [C-1845, C-1845, C-1846, C-1846] ()

In [123]:
# Sanity-check component_clusters: show it beside the components list
# for ten rows near the end of the training set.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'component_clusters',
]]
df.iloc[30180:30190]


Out[123]:
tube_assembly_id components component_clusters
30180 TA-21163 [C-1420, C-1420, C-1421, C-1421] ()
30181 TA-21163 [C-1420, C-1420, C-1421, C-1421] ()
30182 TA-21163 [C-1420, C-1420, C-1421, C-1421] ()
30183 TA-21164 [C-2044] (C-1545,)
30184 TA-21165 [C-1917, C-1917, C-2045, C-2046] ()
30185 TA-21166 [C-2044] (C-1545,)
30186 TA-21167 [C-1534, C-2046] (C-1534,)
30187 TA-21171 [] ()
30188 TA-21175 [] ()
30189 TA-21176 [C-1845, C-1845, C-1846, C-1846] ()

In [132]:
# Histogram (100 bins) of log(1 + material_volume) over the training set.
np.log1p(
    aug_train_set['material_volume']
).hist(bins=100)


Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x1064a590>

In [ ]: