In [256]:
%pylab inline
from collections import defaultdict
import pandas as pd
from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components
pd.set_option('display.max_columns', None)
Populating the interactive namespace from numpy and matplotlib
In [257]:
comp_types, group_dfs, cluster_dfs = load_raw_components()
group_dfs['other'].columns
Out[257]:
Index([u'component_id', u'part_name', u'weight'], dtype='object')
In [192]:
# Look for columns that occur in multiple groups:
col_to_groups = defaultdict(list)
for group_name, df in group_dfs.iteritems():
for col in df.columns:
col_to_groups[col].append(group_name)
col_groups = col_to_groups.items()
col_groups.sort(key=lambda (col, groups): len(groups), reverse=True)
print len(group_dfs)
for col, groups in col_groups:
print col, len(groups), groups
11
weight 11 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'other', 'hfl', 'elbow', 'straight']
component_id 11 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'other', 'hfl', 'elbow', 'straight']
orientation 10 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'hfl', 'elbow', 'straight']
component_type_id 10 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'hfl', 'elbow', 'straight']
unique_feature 7 ['threaded', 'sleeve', 'adaptor', 'tee', 'boss', 'elbow', 'straight']
bolt_pattern_wide 5 ['float', 'tee', 'boss', 'elbow', 'straight']
overall_length 5 ['threaded', 'adaptor', 'tee', 'elbow', 'straight']
bolt_pattern_long 5 ['float', 'tee', 'boss', 'elbow', 'straight']
thickness 4 ['float', 'tee', 'elbow', 'straight']
groove 4 ['tee', 'boss', 'elbow', 'straight']
mj_class_code 3 ['tee', 'elbow', 'straight']
nominal_size_2 2 ['threaded', 'adaptor']
nominal_size_1 2 ['threaded', 'adaptor']
adaptor_angle 2 ['threaded', 'adaptor']
connection_type_id 2 ['sleeve', 'boss']
length 2 ['sleeve', 'nut']
mj_plug_class_code 2 ['tee', 'elbow']
hex_size 2 ['threaded', 'adaptor']
connection_type_id_1 2 ['threaded', 'adaptor']
connection_type_id_2 2 ['threaded', 'adaptor']
thread_size_1 2 ['threaded', 'adaptor']
thread_size_2 2 ['threaded', 'adaptor']
plating 2 ['sleeve', 'hfl']
end_form_id_1 2 ['threaded', 'adaptor']
end_form_id_2 2 ['threaded', 'adaptor']
extension_length 2 ['tee', 'elbow']
drop_length 2 ['tee', 'elbow']
length_1 2 ['threaded', 'adaptor']
length_2 2 ['threaded', 'adaptor']
thread_pitch_1 2 ['threaded', 'adaptor']
thread_pitch_2 2 ['threaded', 'adaptor']
diameter 1 ['nut']
nominal_size_3 1 ['threaded']
nominal_size_4 1 ['threaded']
head_diameter 1 ['straight']
type 1 ['boss']
corresponding_shell 1 ['hfl']
outside_shape 1 ['boss']
base_diameter 1 ['boss']
elbow_angle 1 ['elbow']
base_type 1 ['boss']
seat_angle 1 ['nut']
thread_size_4 1 ['threaded']
connection_type_id_3 1 ['threaded']
connection_type_id_4 1 ['threaded']
thread_size_3 1 ['threaded']
material 1 ['hfl']
coupling_class 1 ['hfl']
thread_size 1 ['nut']
intended_nut_pitch 1 ['sleeve']
plug_diameter 1 ['elbow']
end_form_id_3 1 ['threaded']
hex_nut_size 1 ['nut']
end_form_id_4 1 ['threaded']
hose_diameter 1 ['hfl']
thread_pitch 1 ['nut']
shoulder_diameter 1 ['boss']
part_name 1 ['other']
intended_nut_thread 1 ['sleeve']
blind_hole 1 ['nut']
height_over_tube 1 ['boss']
length_3 1 ['threaded']
length_4 1 ['threaded']
thread_pitch_4 1 ['threaded']
thread_pitch_3 1 ['threaded']
In [258]:
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)
cinfo_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2047 entries, 0 to 2046
Data columns (total 48 columns):
adaptor_angle 55 non-null float64
base_diameter 57 non-null float64
base_type 124 non-null object
blind_hole 23 non-null object
bolt_pattern_long 2047 non-null float64
bolt_pattern_wide 2047 non-null float64
component_group_id 2047 non-null object
component_id 2047 non-null object
component_type_id 2047 non-null object
corresponding_shell 6 non-null object
coupling_class 6 non-null object
diameter 23 non-null float64
drop_length 175 non-null float64
elbow_angle 130 non-null float64
extension_length 174 non-null float64
groove 2047 non-null bool
head_diameter 70 non-null float64
height_over_tube 147 non-null float64
hex_nut_size 42 non-null float64
hex_size 129 non-null float64
hose_diameter 6 non-null float64
intended_nut_pitch 50 non-null float64
intended_nut_thread 50 non-null float64
material 6 non-null object
mj_class_code 165 non-null object
mj_plug_class_code 44 non-null object
orientation 2047 non-null bool
outside_shape 124 non-null object
overall_length 2047 non-null float64
part_name 1001 non-null object
plating 56 non-null object
plug_diameter 7 non-null float64
seat_angle 15 non-null float64
shoulder_diameter 30 non-null float64
thickness 2047 non-null float64
type 124 non-null object
unique_feature 2047 non-null bool
weight 2047 non-null float64
cluster 361 non-null object
nominal_sizes 2047 non-null object
lengths 2047 non-null object
thread_sizes 2047 non-null object
connection_types 2047 non-null object
end_forms 2047 non-null object
thread_pitches 2047 non-null object
max_length 2047 non-null float64
min_thread_pitch 2047 non-null float64
min_thread_size 2047 non-null float64
dtypes: bool(3), float64(24), object(21)
memory usage: 741.6+ KB
In [195]:
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
CPU times: user 13.6 s, sys: 72 ms, total: 13.7 s
Wall time: 13.9 s
In [211]:
from soln.utils import count_components
train_counts = count_components(aug_train_set, cinfo_df)
train_counts.rename(columns={'count': 'train_count'}, inplace=True)
test_counts = count_components(aug_test_set, cinfo_df)
test_counts.rename(columns={'count': 'test_count'}, inplace=True)
all_counts = cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
all_counts = all_counts.merge(train_counts, on='component_id')
all_counts = all_counts.merge(test_counts, on='component_id')
In [212]:
all_counts
Out[212]:
component_id
component_type_id
component_group_id
train_count
test_count
0
C-0007
CP-014
threaded
13
7
1
C-0030
CP-015
threaded
0
0
2
C-0041
CP-014
threaded
0
0
3
C-0043
CP-014
threaded
0
0
4
C-0044
CP-014
threaded
0
0
5
C-0069
CP-015
threaded
0
0
6
C-0070
CP-015
threaded
0
0
7
C-0072
CP-015
threaded
0
0
8
C-0073
CP-015
threaded
0
0
9
C-0074
CP-014
threaded
0
0
10
C-0077
CP-014
threaded
0
1
11
C-0078
CP-014
threaded
0
1
12
C-0119
CP-015
threaded
0
0
13
C-0133
CP-014
threaded
10
12
14
C-0159
CP-016
threaded
2
0
15
C-0173
CP-014
threaded
1
0
16
C-0178
CP-014
threaded
1
0
17
C-0179
CP-014
threaded
0
0
18
C-0180
CP-014
threaded
0
0
19
C-0181
CP-014
threaded
0
1
20
C-0182
CP-014
threaded
0
0
21
C-0189
CP-015
threaded
1
0
22
C-0210
CP-015
threaded
25
23
23
C-0211
CP-015
threaded
25
22
24
C-0212
CP-015
threaded
2
3
25
C-0216
CP-014
threaded
2
0
26
C-0217
CP-014
threaded
2
2
27
C-0218
CP-014
threaded
4
1
28
C-0223
CP-016
threaded
1
1
29
C-0240
CP-014
threaded
0
0
...
...
...
...
...
...
2017
C-1895
CP-002
straight
0
0
2018
C-1897
CP-006
straight
1
0
2019
C-1900
CP-007
straight
1
0
2020
C-1901
CP-002
straight
5
1
2021
C-1904
CP-002
straight
0
3
2022
C-1943
CP-004
straight
4
2
2023
C-1946
CP-003
straight
1
1
2024
C-1947
CP-003
straight
5
6
2025
C-1953
CP-004
straight
0
1
2026
C-1956
CP-004
straight
3
2
2027
C-1958
CP-003
straight
0
1
2028
C-1983
CP-004
straight
1
0
2029
C-1986
CP-003
straight
1
1
2030
C-1996
CP-004
straight
0
1
2031
C-1997
CP-002
straight
2
0
2032
C-1999
CP-007
straight
1
2
2033
C-2000
CP-002
straight
2
0
2034
C-2009
CP-002
straight
0
1
2035
C-2015
CP-002
straight
2
4
2036
C-2016
CP-002
straight
1
0
2037
C-2017
CP-002
straight
3
1
2038
C-2018
CP-002
straight
0
1
2039
C-2019
CP-002
straight
6
2
2040
C-2020
CP-002
straight
1
2
2041
C-2021
CP-002
straight
3
6
2042
C-2022
CP-002
straight
6
1
2043
C-2023
CP-002
straight
3
4
2044
C-2034
CP-007
straight
0
0
2045
C-2043
CP-004
straight
5
7
2046
C-2044
CP-006
straight
2
1
2047 rows × 5 columns
In [214]:
print len(all_counts)
print len(all_counts[(all_counts.train_count > 0) & (all_counts.test_count == 0)])
print len(all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)])
2047
520
482
In [223]:
bins = [(0, 0), (1, 1), (2, 5), (5, 10), (10, 20), (20, 50), (50, 100), (100, np.inf)]
all_counts['train_bin'] = -1
for i, (cmin, cmax) in enumerate(bins):
all_counts.train_bin.loc[(all_counts.train_count >= cmin) & (all_counts.train_count <= cmax)] = i
all_counts['test_bin'] = -1
for i, (cmin, cmax) in enumerate(bins):
all_counts.test_bin.loc[(all_counts.test_count >= cmin) & (all_counts.test_count <= cmax)] = i
In [240]:
grouped = all_counts.groupby(['train_bin', 'test_bin'])
df = grouped.size().unstack()
df
Out[240]:
test_bin
0
1
2
3
4
5
6
7
train_bin
0
346
399
81
2
NaN
NaN
NaN
NaN
1
407
140
83
9
NaN
NaN
NaN
NaN
2
111
94
112
38
2
NaN
NaN
NaN
3
2
5
43
37
8
NaN
NaN
NaN
4
NaN
NaN
NaN
13
41
2
NaN
NaN
5
NaN
NaN
NaN
NaN
9
23
1
NaN
6
NaN
NaN
NaN
NaN
NaN
3
6
NaN
7
NaN
NaN
NaN
NaN
NaN
NaN
NaN
30
In [242]:
in_train_not_test = all_counts[(all_counts.train_count > 0) & (all_counts.test_count == 0)]
print in_train_not_test.component_group_id.value_counts()
in_test_not_train = all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)]
print in_test_not_train.component_group_id.value_counts()
other 313
straight 70
elbow 36
boss 36
threaded 31
adaptor 13
nut 9
float 7
hfl 3
sleeve 1
tee 1
dtype: int64
other 301
straight 77
elbow 35
boss 27
threaded 20
nut 8
sleeve 4
adaptor 4
hfl 2
tee 2
float 2
dtype: int64
In [243]:
df_other = group_dfs['other']
df_other.part_name.value_counts()
Out[243]:
FLANGE 158
PLATE 94
TUBE 91
ADAPTER 53
BOSS 41
ELBOW 34
BLOCK 27
BRACKET 27
TUBE AS 27
FITTING 12
CLIP 10
ORIFICE 10
CONNECTOR-WELD 9
NUT-WELD 8
TUBE AS. 8
WASHER 8
HEAD-FLANGED 8
NUT-A/C 7
CONNECTOR 7
ADAPTER-OIL LIN 6
NUT 6
LUG 6
SPACER 6
COUPLING AS 6
ELBOW-HYDRAULIC 6
SEAL-O-RING 6
PIPE 5
RING 5
COUPLING 5
NUT-FUEL INJ 5
...
PLUG-PIPE 1
TUBE AS-RH 1
TUBE AS-O SUPPL 1
ADAPTER RING 1
SHEET 1
COUPLING-PIPE 1
SLEEVE-REDUCING 1
HASP AS 1
BLOCK CONNECTION 1
ELBOW-AIR (CRS) 1
CAP-ORFS 1
TEE-ORFS BRAZE 1
FILLER AS 1
PLUG-DETENT 1
SCREEN AS 1
TEE-FLARED 1
ADAPTER-OIL (GA 1
ADAPTER - GAGE 1
PLUG-CORE 1
ELBOW-OIL INLET 1
FLANGE-ONE PIECE 1
CAP AS-O FILLER 1
ANGLE AS 1
ELBOW-CAST 1
PIN-SPRING 1
WASHER-HARD 1
RIVET-POP 1
SHROUD 1
COUPLING AS-PER 1
ADAPTER-STR 1
dtype: int64
In [282]:
plot(df_other.part_name.value_counts().values)
Out[282]:
[<matplotlib.lines.Line2D at 0x95d1110>]
In [280]:
flared = ['flared' in part_name.lower() for part_name in df_other.part_name.values]
df_other[flared].sort('component_id')
Out[280]:
component_id
part_name
weight
component_group_id
130
C-0225
NUT-FLARED
0.070
other
477
C-0901
NUT-37° FLARED
0.015
other
834
C-1237
TEE-FLARED
0.409
other
333
C-1380
NUT-FLARED
0.290
other
387
C-1381
SLEEVE-FLARED
0.033
other
176
C-1382
NUT-FLARED
0.025
other
293
C-1383
SLEEVE-FLARED
0.009
other
0
C-1385
NUT-FLARED
0.014
other
1
C-1386
SLEEVE-FLARED
0.005
other
In [285]:
in_test_not_train[in_test_not_train.component_group_id == 'other'].sort('test_count', ascending=False)
Out[285]:
component_id
component_type_id
component_group_id
train_count
test_count
train_bin
test_bin
575
C-1402
other
other
0
7
0
3
634
C-0085
other
other
0
4
0
2
637
C-1978
other
other
0
4
0
2
642
C-0121
other
other
0
4
0
2
682
C-0749
other
other
0
3
0
2
722
C-1184
other
other
0
3
0
2
716
C-1083
other
other
0
3
0
2
709
C-1032
other
other
0
3
0
2
688
C-1490
other
other
0
3
0
2
677
C-1382
other
other
0
3
0
2
670
C-1311
other
other
0
3
0
2
872
C-1160
other
other
0
2
0
2
836
C-0639
other
other
0
2
0
2
834
C-1380
other
other
0
2
0
2
838
C-0609
other
other
0
2
0
2
857
C-0412
other
other
0
2
0
2
859
C-1068
other
other
0
2
0
2
868
C-1130
other
other
0
2
0
2
871
C-1157
other
other
0
2
0
2
895
C-1309
other
other
0
2
0
2
879
C-1283
other
other
0
2
0
2
881
C-1298
other
other
0
2
0
2
884
C-1831
other
other
0
2
0
2
888
C-1381
other
other
0
2
0
2
894
C-1940
other
other
0
2
0
2
819
C-0145
other
other
0
2
0
2
908
C-1007
other
other
0
2
0
2
909
C-1169
other
other
0
2
0
2
821
C-0201
other
other
0
2
0
2
853
C-1022
other
other
0
2
0
2
...
...
...
...
...
...
...
...
1085
C-0375
other
other
0
1
0
1
1053
C-0906
other
other
0
1
0
1
1054
C-0089
other
other
0
1
0
1
1057
C-0233
other
other
0
1
0
1
1059
C-0725
other
other
0
1
0
1
1063
C-0525
other
other
0
1
0
1
1064
C-0505
other
other
0
1
0
1
1065
C-0645
other
other
0
1
0
1
1069
C-1674
other
other
0
1
0
1
1070
C-1800
other
other
0
1
0
1
1075
C-0902
other
other
0
1
0
1
1077
C-0167
other
other
0
1
0
1
1081
C-0788
other
other
0
1
0
1
1083
C-1971
other
other
0
1
0
1
1087
C-1933
other
other
0
1
0
1
1119
C-0396
other
other
0
1
0
1
1088
C-0024
other
other
0
1
0
1
1089
C-0032
other
other
0
1
0
1
1092
C-0109
other
other
0
1
0
1
1093
C-0114
other
other
0
1
0
1
1096
C-0136
other
other
0
1
0
1
1100
C-1504
other
other
0
1
0
1
1102
C-1573
other
other
0
1
0
1
1104
C-1367
other
other
0
1
0
1
1106
C-0280
other
other
0
1
0
1
1111
C-0326
other
other
0
1
0
1
1112
C-0088
other
other
0
1
0
1
1113
C-0310
other
other
0
1
0
1
1118
C-0346
other
other
0
1
0
1
1501
C-1049
other
other
0
1
0
1
301 rows × 7 columns
In [276]:
df = all_counts[all_counts.component_group_id == 'elbow'].copy()
df.sort('train_count', ascending=False, inplace=True)
df[:10]
Out[276]:
component_id
component_type_id
component_group_id
train_count
test_count
train_bin
test_bin
1625
C-1565
CP-010
elbow
10
11
4
4
1590
C-1208
CP-008
elbow
10
6
4
3
1615
C-1428
CP-008
elbow
7
8
3
3
1603
C-1345
CP-010
elbow
7
5
3
3
1594
C-1317
CP-008
elbow
6
3
3
2
1553
C-0401
CP-008
elbow
6
3
3
2
1631
C-1587
CP-012
elbow
5
6
3
3
1678
C-1935
CP-008
elbow
5
3
3
2
1611
C-1404
CP-008
elbow
5
1
3
1
1605
C-1349
CP-008
elbow
4
3
2
2
In [277]:
plot(df.train_count.values)
Out[277]:
[<matplotlib.lines.Line2D at 0x9824290>]
In [278]:
freq_cids = df.component_id[df.train_count >= 3].values
print len(freq_cids)
freq_cids
22
Out[278]:
array(['C-1565', 'C-1208', 'C-1428', 'C-1345', 'C-1317', 'C-0401',
'C-1587', 'C-1935', 'C-1404', 'C-1349', 'C-1585', 'C-1527',
'C-1586', 'C-1783', 'C-1598', 'C-0063', 'C-1405', 'C-0598',
'C-0616', 'C-1432', 'C-1500', 'C-1006'], dtype=object)
In [ ]:
straight = all_counts[all_counts.component_group_id == 'straight'].copy()
straight.sort('train_count', ascending=False, inplace=True)
straight[:10]
In [295]:
known_cids = set(all_counts.component_id[all_counts.train_count > 0].values)
print len(all_counts), len(known_cids)
known_cids
2047 1219
Out[295]:
{'C-0459',
'C-0513',
'C-0516',
'C-0517',
'C-0518',
'C-0453',
'C-0455',
'C-0457',
'C-0456',
'C-0294',
'C-0047',
'C-0965',
'C-1698',
'C-0967',
'C-0966',
'C-1044',
'C-0802',
'C-0803',
'C-0800',
'C-0801',
'C-0808',
'C-1168',
'C-0804',
'C-1129',
'C-0356',
'C-0605',
'C-0354',
'C-0355',
'C-0352',
'C-0353',
'C-0350',
'C-0218',
'C-0217',
'C-0216',
'C-0215',
'C-0214',
'C-0213',
'C-0212',
'C-0211',
'C-0210',
'C-1867',
'C-1866',
'C-1863',
'C-1862',
'C-1861',
'C-1860',
'C-1944',
'C-1945',
'C-1946',
'C-1514',
'C-1941',
'C-1869',
'C-1868',
'C-1690',
'C-0880',
'C-1983',
'C-0061',
'C-1790',
'C-0062',
'C-1223',
'C-1222',
'C-0017',
'C-1227',
'C-1229',
'C-0064',
'C-1154',
'C-1555',
'C-1164',
'C-0598',
'C-0560',
'C-1152',
'C-1677',
'C-0063',
'C-1452',
'C-0554',
'C-1898',
'C-1598',
'C-1456',
'C-1457',
'C-1455',
'C-0102',
'C-1458',
'C-0100',
'C-1597',
'C-1596',
'C-1595',
'C-1294',
'C-0648',
'C-1359',
'C-1358',
'C-1292',
'C-1293',
'C-1355',
'C-0222',
'C-1357',
'C-1356',
'C-1351',
'C-1353',
'C-0498',
'C-1577',
'C-1679',
'C-1571',
'C-1570',
'C-0964',
'C-0082',
'C-1673',
'C-0884',
'C-1671',
'C-1676',
'C-1578',
'C-0122',
'C-0711',
'C-0125',
'C-0152',
'C-1026',
'C-1687',
'C-0076',
'C-1683',
'C-1682',
'C-0071',
'C-0081',
'C-0774',
'C-0775',
'C-0776',
'C-0777',
'C-0770',
'C-0771',
'C-0772',
'C-1688',
'C-1179',
'C-0504',
'C-1072',
'C-1074',
'C-1075',
'C-0178',
'C-1077',
'C-1078',
'C-0177',
'C-1173',
'C-1172',
'C-0172',
'C-0173',
'C-0170',
'C-0171',
'C-0796',
'C-1707',
'C-0794',
'C-0795',
'C-0792',
'C-0793',
'C-0790',
'C-0791',
'C-0656',
'C-0655',
'C-0654',
'C-0652',
'C-0651',
'C-0650',
'C-1982',
'C-0929',
'C-0252',
'C-0251',
'C-0250',
'C-0257',
'C-0357',
'C-0921',
'C-0923',
'C-0258',
'C-0924',
'C-0927',
'C-0318',
'C-1770',
'C-1777',
'C-0312',
'C-0313',
'C-1779',
'C-1921',
'C-0317',
'C-0314',
'C-0315',
'C-0420',
'C-1889',
'C-0422',
'C-1828',
'C-0424',
'C-0425',
'C-0426',
'C-0427',
'C-0428',
'C-0429',
'C-1821',
'C-0351',
'C-0968',
'C-0247',
'C-0342',
'C-0636',
'C-1444',
'C-0855',
'C-0857',
'C-0856',
'C-0850',
'C-0853',
'C-0852',
'C-1434',
'C-1686',
'C-0859',
'C-0858',
'C-0226',
'C-0227',
'C-0556',
'C-0225',
'C-0550',
'C-0223',
'C-0552',
'C-0553',
'C-0495',
'C-0494',
'C-0497',
'C-0496',
'C-0558',
'C-0559',
'C-0228',
'C-0229',
'C-1917',
'C-1916',
'C-1914',
'C-1913',
'C-1911',
'C-1910',
'C-1919',
'C-1081',
'C-1080',
'C-1082',
'C-1085',
'C-1084',
'C-1087',
'C-1859',
'C-1089',
'C-1088',
'C-1852',
'C-1850',
'C-1949',
'C-1310',
'C-1313',
'C-1312',
'C-1314',
'C-1317',
'C-1316',
'C-1526',
'C-1259',
'C-0079',
'C-1750',
'C-1253',
'C-1122',
'C-1256',
'C-1257',
'C-1538',
'C-1947',
'C-1531',
'C-1530',
'C-1533',
'C-1535',
'C-1534',
'C-1536',
'C-1943',
'C-1362',
'C-0132',
'C-1035',
'C-0130',
'C-1037',
'C-1030',
'C-1031',
'C-0134',
'C-0135',
'C-1768',
'C-0031',
'C-0139',
'C-0037',
'C-1079',
'C-0035',
'C-0362',
'C-2031',
'C-1405',
'C-1404',
'C-1400',
'C-1403',
'C-1625',
'C-1624',
'C-1627',
'C-1626',
'C-1621',
'C-1620',
'C-1623',
'C-1622',
'C-1742',
'C-1743',
'C-1740',
'C-1629',
'C-1628',
'C-1744',
'C-1745',
'C-0297',
'C-2030',
'C-0295',
'C-2032',
'C-0293',
'C-0292',
'C-2037',
'C-1180',
'C-1838',
'C-0299',
'C-1839',
'C-0727',
'C-1126',
'C-0720',
'C-0684',
'C-0685',
'C-0680',
'C-1656',
'C-0682',
'C-0683',
'C-1548',
'C-1102',
'C-1103',
'C-1104',
'C-1105',
'C-0048',
'C-1107',
'C-0046',
'C-1109',
'C-0447',
'C-0045',
'C-0093',
'C-1651',
'C-0664',
'C-0665',
'C-0662',
'C-1120',
'C-0660',
'C-0508',
'C-0507',
'C-1121',
'C-0503',
'C-0502',
'C-0501',
'C-0500',
'C-1795',
'C-1182',
'C-0704',
'C-1615',
'C-1797',
'C-0819',
'C-1658',
'C-0813',
'C-0812',
'C-0815',
'C-0814',
'C-0817',
'C-0816',
'C-0345',
'C-0599',
'C-0347',
'C-0269',
'C-0617',
'C-0616',
'C-0615',
'C-1228',
'C-0263',
'C-0260',
'C-0261',
'C-0349',
'C-0348',
'C-0596',
'C-0597',
'C-1812',
'C-1813',
'C-1811',
'C-1817',
'C-1814',
'C-1815',
'C-1952',
'C-1951',
'C-1950',
'C-1957',
'C-0527',
'C-1954',
'C-1459',
'C-1218',
'C-1219',
'C-1057',
'C-1214',
'C-1215',
'C-1216',
'C-1210',
'C-0196',
'C-1213',
'C-0466',
'C-0197',
'C-0460',
'C-0462',
'C-0190',
'C-0469',
'C-1144',
'C-1365',
'C-0676',
'C-1119',
'C-1785',
'C-0446',
'C-1051',
'C-1043',
'C-1116',
'C-1926',
'C-1140',
'C-1924',
'C-1925',
'C-1922',
'C-1115',
'C-1920',
'C-1141',
'C-1441',
'C-1888',
'C-1445',
'C-1142',
'C-1449',
'C-0198',
'C-1883',
'C-1882',
'C-1885',
'C-1884',
'C-0199',
'C-1368',
'C-1369',
'C-1280',
'C-1058',
'C-1285',
'C-1360',
'C-1289',
'C-1059',
'C-1364',
'C-0761',
'C-1366',
'C-1669',
'C-1668',
'C-1166',
'C-2038',
'C-1545',
'C-1547',
'C-1661',
'C-1660',
'C-1663',
'C-0095',
'C-1664',
'C-1667',
'C-1666',
'C-0390',
'C-0391',
'C-0146',
'C-0395',
'C-0491',
'C-0398',
'C-0002',
'C-0003',
'C-1146',
'C-0001',
'C-0006',
'C-0007',
'C-0004',
'C-1143',
'C-0762',
'C-0008',
'C-0766',
'C-0765',
'C-0764',
'C-0169',
'C-0168',
'C-1069',
'C-0490',
'C-0165',
'C-0164',
'C-0166',
'C-0161',
'C-1062',
'C-1061',
'C-0162',
'C-1715',
'C-1714',
'C-0628',
'C-1716',
'C-0781',
'C-0780',
'C-0783',
'C-0133',
'C-0622',
'C-0623',
'C-0620',
'C-0621',
'C-0626',
'C-1718',
'C-0624',
'C-0577',
'C-1067',
'C-0917',
'C-0911',
'C-0912',
'C-0918',
'C-0604',
'C-2040',
'C-2042',
'C-2043',
'C-2044',
'C-2045',
'C-2046',
'C-2047',
'C-0309',
'C-0308',
'C-0492',
'C-0719',
'C-1786',
'C-0717',
'C-0302',
'C-0712',
'C-0713',
'C-0307',
'C-1781',
'C-0785',
'C-0534',
'C-0532',
'C-0533',
'C-0439',
'C-0438',
'C-0437',
'C-0435',
'C-0434',
'C-0433',
'C-0432',
'C-0539',
'C-0629',
'C-1711',
'C-0979',
'C-0860',
'C-0862',
'C-0863',
'C-0864',
'C-0865',
'C-0867',
'C-0868',
'C-0869',
'C-0977',
'C-0237',
'C-0123',
'C-0547',
'C-0230',
'C-0544',
'C-0549',
'C-0548',
'C-0239',
'C-0784',
'C-1962',
'C-1963',
'C-1960',
'C-1961',
'C-1966',
'C-1967',
'C-1965',
'C-1719',
'C-1969',
'C-1599',
'C-0627',
'C-0679',
'C-1451',
'C-1849',
'C-1848',
'C-1345',
'C-1845',
'C-1844',
'C-1847',
'C-1846',
'C-1841',
'C-1843',
'C-1842',
'C-1247',
'C-1246',
'C-1245',
'C-1244',
'C-1243',
'C-1242',
'C-0915',
'C-1327',
'C-1321',
'C-1322',
'C-0022',
'C-1893',
'C-1093',
'C-0224',
'C-0148',
'C-1591',
'C-1288',
'C-1891',
'C-0175',
'C-1748',
'C-0618',
'C-1897',
'C-1749',
'C-1894',
'C-1098',
'C-0120',
'C-1021',
'C-1020',
'C-1027',
'C-0124',
'C-1025',
'C-0126',
'C-0129',
'C-0128',
'C-1029',
'C-1183',
'C-1187',
'C-1439',
'C-0703',
'C-1430',
'C-1431',
'C-1432',
'C-1433',
'C-0739',
'C-1435',
'C-1436',
'C-1437',
'C-1759',
'C-1758',
'C-1612',
'C-1613',
'C-1614',
'C-0894',
'C-1616',
'C-1751',
'C-1619',
'C-1752',
'C-1300',
'C-1757',
'C-1756',
'C-2008',
'C-1017',
'C-2004',
'C-2005',
'C-2006',
'C-2007',
'C-2000',
'C-2001',
'C-2002',
'C-2003',
'C-0752',
'C-0753',
'C-0699',
'C-0751',
'C-0756',
'C-0757',
'C-0754',
'C-0755',
'C-0690',
'C-0697',
'C-0696',
'C-0194',
'C-1505',
'C-1506',
'C-1507',
'C-1500',
'C-0058',
'C-1502',
'C-1503',
'C-1117',
'C-0054',
'C-0057',
'C-1114',
'C-0051',
'C-0050',
'C-0053',
'C-0052',
'C-0675',
'C-0674',
'C-0578',
'C-0579',
'C-0671',
'C-0670',
'C-0673',
'C-0672',
'C-0572',
'C-0573',
'C-0576',
'C-0678',
'C-0575',
'C-0825',
'C-0826',
'C-0827',
'C-0821',
'C-0823',
'C-1820',
'C-1590',
'C-0828',
'C-0829',
'C-0585',
'C-0583',
'C-0581',
'C-0580',
'C-1806',
'C-0306',
'C-0278',
'C-0372',
'C-0376',
'C-0377',
'C-0378',
'C-1354',
'C-0298',
'C-0275',
'C-0274',
'C-1801',
'C-1291',
'C-0404',
'C-0405',
'C-0402',
'C-0400',
'C-0401',
'C-1809',
'C-1808',
'C-0734',
'C-0409',
'C-1161',
'C-0300',
'C-1618',
'C-1118',
'C-1209',
'C-1208',
'C-0370',
'C-1203',
'C-1782',
'C-1201',
'C-1200',
'C-1207',
'C-1206',
'C-1205',
'C-1783',
'C-0473',
'C-0472',
'C-1499',
'C-0470',
'C-0710',
'C-0475',
'C-0474',
'C-0479',
'C-0478',
'C-1331',
'C-1239',
'C-1330',
'C-0787',
'C-1386',
'C-1387',
'C-1384',
'C-1385',
'C-0988',
'C-1938',
'C-1935',
'C-1934',
'C-1937',
'C-1936',
'C-1930',
'C-1540',
'C-1377',
'C-0219',
'C-1375',
'C-1374',
'C-1478',
'C-1479',
'C-1101',
'C-1474',
'C-1475',
'C-1476',
'C-1477',
'C-1470',
'C-1379',
'C-1473',
'C-1278',
'C-1275',
'C-1541',
'C-1050',
'C-1270',
'C-1271',
'C-1496',
'C-1497',
'C-1494',
'C-0980',
'C-0998',
'C-1106',
'C-1559',
'C-1491',
'C-1557',
'C-0987',
'C-0996',
'C-1554',
'C-1986',
'C-1551',
'C-0993',
'C-0380',
'C-0383',
'C-0385',
'C-0786',
'C-0387',
'C-0386',
'C-0389',
'C-0388',
'C-1948',
'C-1717',
'C-1980',
'C-1981',
'C-1856',
'C-0012',
'C-1985',
'C-1155',
'C-1987',
'C-1988',
'C-1159',
'C-1158',
'C-1286',
'C-1113',
'C-0159',
'C-1018',
'C-1012',
'C-1010',
'C-0153',
'C-0155',
'C-1015',
'C-0638',
'C-1722',
'C-1723',
'C-1724',
'C-1725',
'C-1726',
'C-1727',
'C-1728',
'C-1729',
'C-0633',
'C-0635',
'C-0634',
'C-1112',
'C-0903',
'C-0900',
'C-1689',
'C-0738',
'C-0904',
'C-0011',
'C-0663',
'C-0334',
'C-1070',
'C-0336',
'C-0337',
'C-0330',
'C-0332',
'C-1178',
'C-0661',
'C-1111',
'C-1654',
'C-1655',
'C-1124',
'C-1125',
'C-1650',
'C-1123',
'C-1652',
'C-1653',
'C-0705',
'C-1794',
'C-0707',
'C-0706',
'C-0701',
'C-1659',
'C-1793',
'C-0476',
'C-0471',
'C-1680',
'C-0448',
'C-0449',
'C-0521',
'C-0520',
'C-0890',
'C-0522',
'C-0442',
'C-0443',
'C-0440',
'C-0441',
'C-0529',
'C-0528',
'C-0444',
'C-0445',
'C-1734',
'C-1170',
'C-0143',
'C-0266',
'C-0974',
'C-0975',
'C-1999',
'C-0877',
'C-0876',
'C-0873',
'C-0872',
'C-0871',
'C-1174',
'C-0200',
'C-0202',
'C-0203',
'C-1558',
'C-0205',
'C-0206',
'C-0208',
'C-0209',
'C-1352',
'C-1970',
'C-1973',
'C-1972',
'C-1975',
'C-1977',
'C-1976',
'C-1979',
'C-0021',
'C-1878',
'C-1879',
'C-1389',
'C-1991',
'C-1870',
'C-1871',
'C-1872',
'C-1873',
'C-1874',
'C-1877',
'C-1232',
'C-1233',
'C-1230',
'C-1231',
'C-0659',
'C-1235',
'C-1333',
'C-1332',
'C-1238',
'C-0658',
'C-1549',
'C-1336',
'C-0540',
'C-0265',
'C-1189',
'C-1518',
'C-0333',
'C-1196',
'C-1195',
'C-1194',
'C-1708',
'C-1000',
'C-0799',
'C-1003',
'C-1198',
'C-1427',
'C-1426',
'C-1425',
'C-1415',
'C-1588',
'C-1589',
'C-1421',
'C-1420',
'C-1584',
'C-1585',
'C-1586',
'C-1587',
'C-1580',
'C-1581',
'C-1582',
'C-1428',
'C-0625',
'C-0728',
'C-1348',
'C-1349',
'C-1344',
'C-1006',
'C-1343',
'C-1340',
'C-1341',
'C-1567',
'C-1564',
'C-1565',
'C-1562',
'C-1563',
'C-1560',
'C-1607',
'C-1603',
'C-1881',
'C-1600',
'C-2019',
'C-1429',
'C-1544',
'C-2017',
'C-2016',
'C-2015',
'C-2014',
'C-0740',
'C-0742',
'C-0745',
...}
In [297]:
has_unk = []
for cids in aug_test_set.components:
has_unk.append(any([cid not in known_cids for cid in cids]))
print len(aug_test_set), len(has_unk)
30235 30235
In [301]:
df = aug_test_set[['tube_assembly_id', 'components']].copy()
df['has_unk'] = has_unk
print df.has_unk.value_counts()
print df.has_unk.value_counts(normalize=True)
False 29213
True 1022
dtype: int64
False 0.966198
True 0.033802
dtype: float64
In [312]:
df2 = df[['tube_assembly_id', 'has_unk']].drop_duplicates()
print len(df), len(df2)
print df2.has_unk.value_counts()
print df2.has_unk.value_counts(normalize=True)
30235 8856
False 8409
True 447
dtype: int64
False 0.949526
True 0.050474
dtype: float64
In [ ]:
Content source: arorahardeep/kaggle-caterpillar
Similar notebooks: