In [124]:
%pylab inline
import pandas as pd
from soln.dataset import get_augmented_train_and_test_set
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
print aug_train_set.shape
print aug_test_set.shape
aug_train_set.info()
Populating the interactive namespace from numpy and matplotlib
CPU times: user 14.1 s, sys: 332 ms, total: 14.4 s
Wall time: 15.3 s
(30213, 55)
(30235, 53)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30213 entries, 0 to 30212
Data columns (total 55 columns):
tube_assembly_id 30213 non-null object
supplier 30213 non-null object
quote_date 30213 non-null object
annual_usage 30213 non-null int64
min_order_quantity 30213 non-null int64
bracket_pricing 30213 non-null bool
quantity 30213 non-null int64
log_cost 30213 non-null float64
material_id 29984 non-null object
diameter 30213 non-null float64
wall_thickness 30213 non-null float64
length 30213 non-null float64
num_bends 30213 non-null int64
bend_radius 30213 non-null float64
end_a_1x 30213 non-null bool
end_a_2x 30213 non-null bool
end_x_1x 30213 non-null bool
end_x_2x 30213 non-null bool
end_a 30213 non-null object
end_x 30213 non-null object
num_boss 30213 non-null int64
num_bracket 30213 non-null int64
num_other 30213 non-null int64
specs 30213 non-null object
components 30213 non-null object
quote_age 30213 non-null float64
adj_quantity 30213 non-null int64
adj_bracketing 30213 non-null bool
bracketing_pattern 30213 non-null object
physical_volume 30213 non-null float64
inner_radius 30213 non-null float64
material_volume 30213 non-null float64
ends 30213 non-null object
end_a_forming 30213 non-null bool
end_1x_count 30213 non-null int64
end_x_forming 30213 non-null bool
end_2x_count 30213 non-null int64
end_forming_count 30213 non-null int64
component_groups 30213 non-null object
component_types 30213 non-null object
unique_feature_count 30213 non-null float64
orientation_count 30213 non-null float64
groove_count 30213 non-null float64
total_component_weight 30213 non-null float64
component_end_forms 30213 non-null object
component_connection_types 30213 non-null object
component_max_length 30213 non-null float64
component_max_overall_length 30213 non-null float64
component_max_bolt_pattern_wide 30213 non-null float64
component_max_bolt_pattern_long 30213 non-null float64
component_max_thickness 30213 non-null float64
component_min_thread_pitch 30213 non-null float64
component_min_thread_size 30213 non-null float64
component_part_names 30213 non-null object
dev_fold 30213 non-null int64
dtypes: bool(8), float64(20), int64(12), object(15)
memory usage: 11.3+ MB
In [22]:
# Check that dev_fold is properly shuffled: draw the fold id of every row,
# in row order, as individual dots.
plot(aug_train_set['dev_fold'].values, '.')
Out[22]:
[<matplotlib.lines.Line2D at 0xc00c390>]
In [23]:
# Check that the folds have roughly equal sizes (row count per dev_fold value).
aug_train_set['dev_fold'].value_counts()
Out[23]:
1 3141
3 3121
8 3110
5 3013
9 2997
7 2995
2 2983
6 2965
4 2945
0 2943
dtype: int64
In [27]:
# Check that no taid occurs in more than one fold.
# taid_to_fold maps each tube_assembly_id to the first dev_fold seen for it.
taid_to_fold = {}
arr = aug_train_set[['tube_assembly_id', 'dev_fold']].values
assert arr.shape == (len(aug_train_set), 2)
for taid, fold in arr:
    # setdefault stores the fold the first time a taid is seen and returns the
    # stored fold on every later row, so a mismatch means the taid spans folds.
    if taid_to_fold.setdefault(taid, fold) != fold:
        print("found taid {} with multiple folds".format(taid))
print("done")
done
In [28]:
# Show rows 500-519 of the id, adj_quantity and bracketing_pattern columns.
df = aug_train_set[[
    'tube_assembly_id',
    'adj_quantity',
    'bracketing_pattern',
]]
df.iloc[500:520]
Out[28]:
tube_assembly_id
adj_quantity
bracketing_pattern
500
TA-00222
5
(1, 2, 5, 10, 25, 50, 100, 250)
501
TA-00222
10
(1, 2, 5, 10, 25, 50, 100, 250)
502
TA-00222
25
(1, 2, 5, 10, 25, 50, 100, 250)
503
TA-00222
50
(1, 2, 5, 10, 25, 50, 100, 250)
504
TA-00222
100
(1, 2, 5, 10, 25, 50, 100, 250)
505
TA-00222
250
(1, 2, 5, 10, 25, 50, 100, 250)
506
TA-00228
1
(1, 10)
507
TA-00228
10
(1, 10)
508
TA-00229
5
()
509
TA-00230
1
()
510
TA-00239
1
(1, 12)
511
TA-00239
12
(1, 12)
512
TA-00242
1
(1, 6, 20)
513
TA-00242
6
(1, 6, 20)
514
TA-00242
20
(1, 6, 20)
515
TA-00243
1
()
516
TA-00248
1
()
517
TA-00249
1
(1, 2, 5, 10, 25, 50, 100, 250)
518
TA-00249
2
(1, 2, 5, 10, 25, 50, 100, 250)
519
TA-00249
5
(1, 2, 5, 10, 25, 50, 100, 250)
In [37]:
# Row count per dev_fold value.
# NOTE(review): same check as the earlier fold-size cell, but the saved output
# here is far less balanced (1589-4695 rows per fold versus 2943-3141 there);
# presumably the two outputs come from different fold assignments -- confirm.
aug_train_set['dev_fold'].value_counts()
Out[37]:
9 4695
0 3902
1 3819
2 3769
3 2959
8 2925
4 2781
5 1918
6 1856
7 1589
dtype: int64
In [41]:
# Plot dev_fold against the row index using the pandas Series plotter.
aug_train_set['dev_fold'].plot()
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0xb638190>
In [44]:
from soln.dataset import generate_xv_splits
# Take only the first split yielded by generate_xv_splits and unpack its four parts.
split = next(generate_xv_splits(aug_train_set))
X_train, y_train, X_test, y_test = split
# Single-argument print(...) matches the Python 2 print statement's output and
# is also valid on Python 3.
# NOTE(review): the saved output reports 30 columns for aug_train_set, not the
# 55 shown by the first cell, so it presumably predates later feature additions.
print(aug_train_set.shape)
print([thing.shape for thing in split])
(30213, 30)
[(26311, 28), (26311,), (3902, 28), (3902,)]
In [56]:
from soln.dataset import AllCategoricalsFeaturizer
# Fit the featurizer on the training split only, then transform that same split.
# keep_orig_feats=True presumably retains the input columns: the saved info()
# output lists the original columns followed by float64 columns named
# '<feature> <value>' (e.g. 'supplier S-0042') -- confirm against soln.dataset.
featurizer = AllCategoricalsFeaturizer(keep_orig_feats=True)
# %time reports CPU and wall time for each step separately.
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
# verbose=True forces every column to be listed, even for a very wide frame.
X_train_feats.info(verbose=True)
CPU times: user 452 ms, sys: 0 ns, total: 452 ms
Wall time: 470 ms
CPU times: user 644 ms, sys: 200 ms, total: 844 ms
Wall time: 864 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26311 entries, 0 to 26310
Data columns (total 448 columns):
tube_assembly_id object
supplier object
quote_date object
annual_usage int64
min_order_quantity int64
bracket_pricing bool
quantity int64
material_id object
diameter float64
wall_thickness float64
length float64
num_bends int64
bend_radius float64
end_a_1x bool
end_a_2x bool
end_x_1x bool
end_x_2x bool
end_a object
end_x object
num_boss int64
num_bracket int64
num_other int64
specs object
components object
quote_age float64
adj_quantity int64
adj_bracketing bool
bracketing_pattern object
supplier other float64
supplier S-0042 float64
supplier S-0005 float64
supplier S-0026 float64
supplier S-0027 float64
supplier S-0072 float64
supplier S-0062 float64
supplier S-0064 float64
supplier S-0043 float64
supplier S-0066 float64
supplier S-0041 float64
supplier S-0105 float64
supplier S-0080 float64
supplier S-0081 float64
supplier S-0104 float64
supplier S-0013 float64
supplier S-0014 float64
supplier S-0070 float64
supplier S-0031 float64
supplier S-0030 float64
supplier S-0058 float64
supplier S-0054 float64
supplier S-0092 float64
material_id other float64
material_id nan float64
material_id SP-0046 float64
material_id SP-0041 float64
material_id SP-0033 float64
material_id SP-0048 float64
material_id SP-0034 float64
material_id SP-0035 float64
material_id SP-0036 float64
material_id SP-0037 float64
material_id SP-0030 float64
material_id SP-0019 float64
material_id SP-0008 float64
material_id SP-0038 float64
material_id SP-0039 float64
material_id SP-0029 float64
material_id SP-0028 float64
end_a other float64
end_a EF-005 float64
end_a NONE float64
end_a EF-002 float64
end_a EF-003 float64
end_a EF-008 float64
end_a EF-009 float64
end_a EF-023 float64
end_a EF-021 float64
end_a EF-012 float64
end_a EF-017 float64
end_a EF-016 float64
end_a EF-015 float64
end_a EF-019 float64
end_a EF-018 float64
end_x other float64
end_x EF-005 float64
end_x NONE float64
end_x EF-002 float64
end_x EF-003 float64
end_x EF-008 float64
end_x EF-009 float64
end_x EF-023 float64
end_x EF-021 float64
end_x EF-006 float64
end_x EF-012 float64
end_x EF-010 float64
end_x EF-017 float64
end_x EF-015 float64
end_x EF-019 float64
end_x EF-018 float64
specs other float64
specs SP-0065 float64
specs SP-0050 float64
specs SP-0057 float64
specs SP-0009 float64
specs SP-0058 float64
specs SP-0025 float64
specs SP-0079 float64
specs SP-0002 float64
specs SP-0068 float64
specs SP-0070 float64
specs SP-0017 float64
specs SP-0072 float64
specs SP-0012 float64
specs SP-0013 float64
specs SP-0076 float64
specs SP-0022 float64
specs SP-0021 float64
specs SP-0016 float64
specs SP-0071 float64
specs SP-0088 float64
specs SP-0080 float64
specs SP-0082 float64
specs SP-0010 float64
specs SP-0075 float64
specs SP-0026 float64
specs SP-0069 float64
specs SP-0024 float64
specs SP-0005 float64
specs SP-0004 float64
specs SP-0007 float64
specs SP-0063 float64
specs SP-0062 float64
specs SP-0061 float64
specs SP-0067 float64
specs SP-0029 float64
components other float64
components C-1653 float64
components C-0218 float64
components C-0217 float64
components C-0215 float64
components C-0214 float64
components C-0211 float64
components C-0210 float64
components C-1867 float64
components C-1866 float64
components C-1860 float64
components C-1554 float64
components C-1869 float64
components C-1229 float64
components C-1898 float64
components C-0102 float64
components C-1359 float64
components C-1355 float64
components C-1354 float64
components C-1352 float64
components C-1577 float64
components C-1677 float64
components C-0122 float64
components C-0071 float64
components C-1017 float64
components C-1578 float64
components C-0250 float64
components C-0318 float64
components C-1779 float64
components C-0422 float64
components C-1821 float64
components C-0855 float64
components C-0227 float64
components C-0550 float64
components C-0494 float64
components C-0228 float64
components C-1910 float64
components C-1850 float64
components C-1313 float64
components C-1312 float64
components C-1619 float64
components C-1538 float64
components C-0577 float64
components C-1533 float64
components C-1536 float64
components C-0133 float64
components C-0134 float64
components C-1405 float64
components C-1625 float64
components C-1624 float64
components C-1627 float64
components C-1626 float64
components C-1621 float64
components C-1620 float64
components C-1623 float64
components C-1622 float64
components C-1743 float64
components C-1629 float64
components C-1628 float64
components C-1745 float64
components C-2030 float64
components C-1547 float64
components C-0048 float64
components C-1660 float64
components C-1663 float64
components C-0599 float64
components C-0616 float64
components C-1817 float64
components C-0448 float64
components C-1954 float64
components C-1218 float64
components C-1889 float64
components C-1445 float64
components C-1881 float64
components C-1880 float64
components C-1885 float64
components C-0199 float64
components C-1369 float64
components C-1059 float64
components C-1661 float64
components C-0095 float64
components C-0002 float64
components C-0003 float64
components C-0001 float64
components C-0007 float64
components C-0004 float64
components C-0165 float64
components C-1714 float64
components C-1716 float64
components C-1711 float64
components C-1718 float64
components C-1781 float64
components C-1715 float64
components C-0434 float64
components C-0539 float64
components C-0544 float64
components C-0548 float64
components C-1963 float64
components C-0679 float64
components C-1848 float64
components C-1845 float64
components C-1846 float64
components C-1244 float64
components C-1243 float64
components C-1242 float64
components C-0579 float64
components C-1902 float64
components C-0120 float64
components C-1183 float64
components C-1439 float64
components C-1430 float64
components C-1435 float64
components C-1758 float64
components C-1614 float64
components C-1615 float64
components C-1750 float64
components C-2008 float64
components C-2004 float64
components C-2005 float64
components C-2006 float64
components C-2001 float64
components C-2002 float64
components C-2003 float64
components C-0699 float64
components C-0751 float64
components C-2032 float64
components C-0058 float64
components C-1502 float64
components C-0051 float64
components C-1058 float64
components C-0052 float64
components C-0674 float64
components C-0826 float64
components C-0823 float64
components C-0580 float64
components C-0275 float64
components C-0401 float64
components C-1808 float64
components C-0409 float64
components C-1209 float64
components C-1203 float64
components C-1200 float64
components C-1206 float64
components C-1205 float64
components C-0345 float64
components C-1386 float64
components C-1385 float64
components C-1936 float64
components C-1375 float64
components C-1374 float64
components C-1373 float64
components C-1475 float64
components C-1476 float64
components C-1555 float64
components C-0389 float64
components C-0388 float64
components C-1988 float64
components C-1723 float64
components C-1727 float64
components C-1728 float64
components C-0333 float64
components C-1654 float64
components C-1655 float64
components C-1650 float64
components C-1651 float64
components C-1652 float64
components C-1658 float64
components C-0703 float64
components C-0449 float64
components C-0520 float64
components C-0444 float64
components C-0445 float64
components C-0208 float64
components C-0209 float64
components C-1970 float64
components C-1976 float64
components C-1873 float64
components C-1233 float64
components C-1230 float64
components C-1235 float64
components C-1332 float64
components C-1425 float64
components C-1421 float64
components C-1420 float64
components C-1428 float64
components C-1348 float64
components C-1349 float64
components C-1344 float64
components C-1345 float64
components C-1343 float64
components C-1565 float64
components C-2019 float64
components C-1630 float64
components C-0562 float64
components C-0244 float64
components C-1761 float64
components C-0369 float64
components C-1768 float64
components C-0844 float64
components C-1841 float64
components C-1398 float64
components C-1908 float64
components C-1909 float64
components C-1906 float64
components C-1907 float64
components C-1998 float64
components C-1995 float64
components C-1994 float64
components C-1411 float64
components C-1637 float64
components C-1635 float64
components C-1632 float64
components C-1633 float64
components C-1739 float64
components C-1631 float64
components C-1638 float64
components C-1639 float64
components C-2026 float64
components C-2027 float64
components C-2028 float64
components C-2029 float64
components C-1643 float64
components C-1642 float64
components C-1641 float64
components C-1640 float64
components C-1647 float64
components C-1646 float64
components C-1645 float64
components C-1644 float64
components C-1649 float64
components C-1648 float64
components C-1672 float64
bracketing_pattern other float64
bracketing_pattern (10, 15, 20, 25, 30) float64
bracketing_pattern (5, 10, 20, 50, 100) float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50) float64
bracketing_pattern (1, 3, 5, 7, 10) float64
bracketing_pattern (1, 2, 3, 4, 6) float64
bracketing_pattern (5, 19, 20) float64
bracketing_pattern (1, 3, 5, 7, 9) float64
bracketing_pattern (10, 15, 20, 30) float64
bracketing_pattern (1, 3, 5, 10, 25) float64
bracketing_pattern (5, 10) float64
bracketing_pattern (1, 2, 4, 9, 19) float64
bracketing_pattern (5, 20) float64
bracketing_pattern (25, 50, 75, 100) float64
bracketing_pattern (1, 3, 5, 10, 20) float64
bracketing_pattern (2, 3, 4, 6) float64
bracketing_pattern (15, 25, 35) float64
bracketing_pattern (1, 5, 10, 20) float64
bracketing_pattern (1, 3, 5, 10, 15, 25) float64
bracketing_pattern (17, 30) float64
bracketing_pattern (1, 2, 5, 10, 25, 50, 100, 250) float64
bracketing_pattern (1, 2, 4) float64
bracketing_pattern (1, 6) float64
bracketing_pattern (1, 2, 3, 5, 10) float64
bracketing_pattern (1, 3, 5) float64
bracketing_pattern (3, 4, 5, 6) float64
bracketing_pattern (1, 10) float64
bracketing_pattern (1, 2, 3, 5, 7) float64
bracketing_pattern (1, 3) float64
bracketing_pattern (5, 10, 15) float64
bracketing_pattern (6, 12, 18, 24) float64
bracketing_pattern (3, 5, 7, 9) float64
bracketing_pattern (10, 15, 20) float64
bracketing_pattern (1, 4) float64
bracketing_pattern (1, 15) float64
bracketing_pattern (1, 2, 3, 5) float64
bracketing_pattern (1, 60, 125, 150, 200) float64
bracketing_pattern () float64
bracketing_pattern (1, 3, 5, 10, 15) float64
bracketing_pattern (1, 8) float64
bracketing_pattern (10, 20, 30, 40) float64
bracketing_pattern (5, 10, 15, 20, 25) float64
bracketing_pattern (50, 50) float64
bracketing_pattern (20, 40, 60, 80) float64
bracketing_pattern (4, 10) float64
bracketing_pattern (10, 25, 40, 55, 70) float64
bracketing_pattern (5, 10, 25) float64
bracketing_pattern (1, 2, 3, 4, 5) float64
bracketing_pattern (1, 20, 50) float64
bracketing_pattern (1, 2, 3, 5, 10, 20) float64
bracketing_pattern (3, 5, 10) float64
bracketing_pattern (1, 2, 4, 8, 16) float64
bracketing_pattern (2, 3, 4, 5) float64
bracketing_pattern (1, 5, 10, 15, 20) float64
bracketing_pattern (1, 2, 5) float64
bracketing_pattern (8, 16, 24, 32) float64
bracketing_pattern (2, 4, 6, 8) float64
bracketing_pattern (1, 2, 5, 10, 25, 50) float64
bracketing_pattern (1, 2, 3, 4) float64
bracketing_pattern (5, 10, 15, 20) float64
bracketing_pattern (4, 15) float64
bracketing_pattern (4, 6, 8, 12) float64
bracketing_pattern (2, 4, 6, 8, 10) float64
bracketing_pattern (1, 3, 5, 10) float64
bracketing_pattern (8, 16, 24, 32, 48) float64
bracketing_pattern (1, 5) float64
bracketing_pattern (1, 3, 5, 10, 20, 30) float64
bracketing_pattern (1, 2, 5, 10, 25) float64
bracketing_pattern (1, 30) float64
bracketing_pattern (1, 2, 5, 10, 25, 50, 100) float64
bracketing_pattern (1, 6, 20) float64
bracketing_pattern (3, 6, 9, 12) float64
bracketing_pattern (1, 2, 3, 5, 10, 20, 50, 100) float64
bracketing_pattern (3, 5, 10, 20) float64
bracketing_pattern (10, 20, 30) float64
bracketing_pattern (1, 2, 5, 25, 35) float64
bracketing_pattern (1, 2) float64
bracketing_pattern (5, 10, 20) float64
bracketing_pattern (4, 8, 12, 16) float64
bracketing_pattern (10, 15, 25) float64
bracketing_pattern (2, 3, 5, 10, 20, 50, 100) float64
dtypes: bool(6), float64(425), int64(8), object(9)
memory usage: 89.1+ MB
In [57]:
# Value distribution of one generated component column. The saved output shows
# the values 0, 1 and 2, so the column is not restricted to a 0/1 flag.
X_train_feats.loc[:, 'components C-1312'].value_counts()
Out[57]:
0 23201
2 3024
1 86
dtype: int64
In [34]:
# Verify that the 'id' column in test_set.csv is equal to the row number (1-based).
from soln.dataset import load_raw_data
raw = load_raw_data()
id_col = raw['test_set']['id'].values
# Build the expected ids from the column's own length instead of the
# hard-coded 30235, so the check does not depend on one specific test-set size.
np.array_equal(id_col, np.arange(1, len(id_col) + 1))
Out[34]:
True
In [102]:
# Check 'ends' feature: show it next to end_a and end_x for the last ten rows.
ends_columns = ['tube_assembly_id', 'end_a', 'end_x', 'ends']
df = aug_train_set[ends_columns]
df.tail(10)
Out[102]:
tube_assembly_id
end_a
end_x
ends
30203
TA-21189
EF-018
NONE
(EF-018, NONE)
30204
TA-21189
EF-018
NONE
(EF-018, NONE)
30205
TA-21189
EF-018
NONE
(EF-018, NONE)
30206
TA-21189
EF-018
NONE
(EF-018, NONE)
30207
TA-21189
EF-018
NONE
(EF-018, NONE)
30208
TA-21190
EF-003
EF-018
(EF-003, EF-018)
30209
TA-21191
EF-003
EF-018
(EF-003, EF-018)
30210
TA-21195
EF-003
EF-009
(EF-003, EF-009)
30211
TA-21196
EF-003
EF-017
(EF-003, EF-017)
30212
TA-21197
EF-003
EF-017
(EF-003, EF-017)
In [103]:
# Value distributions of the end_1x_count and end_2x_count columns.
# Single-argument print(...) matches the Python 2 print statement's output and
# is also valid on Python 3.
print(aug_train_set.end_1x_count.value_counts())
print(aug_train_set.end_2x_count.value_counts())
0 29162
1 787
2 264
dtype: int64
0 26513
1 2786
2 914
dtype: int64
In [104]:
# Check forming features: show the per-end forming flags and their count
# beside the raw end_a / end_x codes for rows 30183-30192.
df = aug_train_set[[
    'tube_assembly_id',
    'end_a',
    'end_x',
    'end_a_forming',
    'end_x_forming',
    'end_forming_count',
]]
df.iloc[30183:30193]
Out[104]:
tube_assembly_id
end_a
end_x
end_a_forming
end_x_forming
end_forming_count
30183
TA-21164
EF-022
EF-023
True
False
1
30184
TA-21165
EF-023
EF-023
False
False
0
30185
TA-21166
EF-022
EF-023
True
False
1
30186
TA-21167
EF-023
EF-023
False
False
0
30187
TA-21171
EF-009
EF-009
True
True
2
30188
TA-21175
EF-009
NONE
True
False
1
30189
TA-21176
EF-018
EF-018
True
True
2
30190
TA-21177
EF-018
EF-018
True
True
2
30191
TA-21179
EF-009
EF-009
True
True
2
30192
TA-21180
EF-018
EF-003
True
False
1
In [107]:
# Check component_groups and component_types features by listing them next to
# the raw components column for rows 10-19.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'component_groups',
    'component_types',
]]
df.iloc[10:20]
Out[107]:
tube_assembly_id
components
component_groups
component_types
10
TA-00004
[C-1312, C-1312]
[adaptor, adaptor]
[CP-028, CP-028]
11
TA-00004
[C-1312, C-1312]
[adaptor, adaptor]
[CP-028, CP-028]
12
TA-00004
[C-1312, C-1312]
[adaptor, adaptor]
[CP-028, CP-028]
13
TA-00004
[C-1312, C-1312]
[adaptor, adaptor]
[CP-028, CP-028]
14
TA-00004
[C-1312, C-1312]
[adaptor, adaptor]
[CP-028, CP-028]
15
TA-00004
[C-1312, C-1312]
[adaptor, adaptor]
[CP-028, CP-028]
16
TA-00005
[C-1624, C-1631, C-1641]
[nut, sleeve, threaded]
[CP-025, CP-024, CP-014]
17
TA-00005
[C-1624, C-1631, C-1641]
[nut, sleeve, threaded]
[CP-025, CP-024, CP-014]
18
TA-00005
[C-1624, C-1631, C-1641]
[nut, sleeve, threaded]
[CP-025, CP-024, CP-014]
19
TA-00005
[C-1624, C-1631, C-1641]
[nut, sleeve, threaded]
[CP-025, CP-024, CP-014]
In [111]:
# Check unique_feature_count, orientation_count and groove_count features.
df = aug_train_set
# Print one value_counts() table per column, in the order listed. Using
# single-argument print(...) keeps the output identical on Python 2 and makes
# the cell valid on Python 3.
for col in ('unique_feature_count', 'orientation_count', 'groove_count'):
    print(df[col].value_counts())
0 14724
2 9185
1 6187
3 91
4 26
dtype: int64
0 27500
1 2128
2 493
3 61
4 24
5 7
dtype: int64
0 29356
1 790
2 67
dtype: int64
In [112]:
# Check total_component_weight feature: show it beside the components list
# for rows 10-19.
weight_columns = ['tube_assembly_id', 'components', 'total_component_weight']
df = aug_train_set[weight_columns]
df.iloc[10:20]
Out[112]:
tube_assembly_id
components
total_component_weight
10
TA-00004
[C-1312, C-1312]
0.018
11
TA-00004
[C-1312, C-1312]
0.018
12
TA-00004
[C-1312, C-1312]
0.018
13
TA-00004
[C-1312, C-1312]
0.018
14
TA-00004
[C-1312, C-1312]
0.018
15
TA-00004
[C-1312, C-1312]
0.018
16
TA-00005
[C-1624, C-1631, C-1641]
0.210
17
TA-00005
[C-1624, C-1631, C-1641]
0.210
18
TA-00005
[C-1624, C-1631, C-1641]
0.210
19
TA-00005
[C-1624, C-1631, C-1641]
0.210
In [113]:
# Check component_end_forms feature: show it beside the components list
# for rows 10-19.
df = aug_train_set[[
    'tube_assembly_id',
    'components',
    'component_end_forms',
]]
df.iloc[10:20]
Out[113]:
tube_assembly_id
components
component_end_forms
10
TA-00004
[C-1312, C-1312]
[A-002, 9999, A-002, 9999]
11
TA-00004
[C-1312, C-1312]
[A-002, 9999, A-002, 9999]
12
TA-00004
[C-1312, C-1312]
[A-002, 9999, A-002, 9999]
13
TA-00004
[C-1312, C-1312]
[A-002, 9999, A-002, 9999]
14
TA-00004
[C-1312, C-1312]
[A-002, 9999, A-002, 9999]
15
TA-00004
[C-1312, C-1312]
[A-002, 9999, A-002, 9999]
16
TA-00005
[C-1624, C-1631, C-1641]
[A-001, A-004]
17
TA-00005
[C-1624, C-1631, C-1641]
[A-001, A-004]
18
TA-00005
[C-1624, C-1631, C-1641]
[A-001, A-004]
19
TA-00005
[C-1624, C-1631, C-1641]
[A-001, A-004]
In [118]:
# Check component_part_names feature: show it beside the components list
# for rows 30180-30189.
part_name_columns = ['tube_assembly_id', 'components', 'component_part_names']
df = aug_train_set[part_name_columns]
df.iloc[30180:30190]
Out[118]:
tube_assembly_id
components
component_part_names
30180
TA-21163
[C-1420, C-1420, C-1421, C-1421]
()
30181
TA-21163
[C-1420, C-1420, C-1421, C-1421]
()
30182
TA-21163
[C-1420, C-1420, C-1421, C-1421]
()
30183
TA-21164
[C-2044]
()
30184
TA-21165
[C-1917, C-1917, C-2045, C-2046]
(BLOCK, BLOCK, ADAPTER-INLET, SLEEVE)
30185
TA-21166
[C-2044]
()
30186
TA-21167
[C-1534, C-2046]
(SLEEVE,)
30187
TA-21171
[]
()
30188
TA-21175
[]
()
30189
TA-21176
[C-1845, C-1845, C-1846, C-1846]
()
In [123]:
# Check component_clusters feature.
# NOTE(review): 'component_clusters' is not among the 55 columns listed by
# aug_train_set.info() in the first cell, and this cell's execution count (123)
# precedes that cell's (124) -- presumably it ran against a different build of
# the dataset; confirm the column still exists before re-running.
df = aug_train_set[['tube_assembly_id', 'components', 'component_clusters']]
# Show rows 30180-30189.
df[30180:30190]
Out[123]:
tube_assembly_id
components
component_clusters
30180
TA-21163
[C-1420, C-1420, C-1421, C-1421]
()
30181
TA-21163
[C-1420, C-1420, C-1421, C-1421]
()
30182
TA-21163
[C-1420, C-1420, C-1421, C-1421]
()
30183
TA-21164
[C-2044]
(C-1545,)
30184
TA-21165
[C-1917, C-1917, C-2045, C-2046]
()
30185
TA-21166
[C-2044]
(C-1545,)
30186
TA-21167
[C-1534, C-2046]
(C-1534,)
30187
TA-21171
[]
()
30188
TA-21175
[]
()
30189
TA-21176
[C-1845, C-1845, C-1846, C-1846]
()
In [132]:
# Histogram (100 bins) of log(1 + material_volume).
np.log1p(aug_train_set['material_volume']).hist(bins=100)
Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x1064a590>
In [ ]:
Content source: arorahardeep/kaggle-caterpillar
Similar notebooks: