In [256]:
%pylab inline

from collections import defaultdict
import pandas as pd

from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components

pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [257]:
comp_types, group_dfs, cluster_dfs = load_raw_components()
group_dfs['other'].columns


Out[257]:
Index([u'component_id', u'part_name', u'weight'], dtype='object')

In [192]:
# Look for columns that occur in multiple groups:

col_to_groups = defaultdict(list)
for group_name, df in group_dfs.iteritems():
    for col in df.columns:
        col_to_groups[col].append(group_name)

col_groups = col_to_groups.items()
col_groups.sort(key=lambda (col, groups): len(groups), reverse=True)
print len(group_dfs)
for col, groups in col_groups:
    print col, len(groups), groups


11
weight 11 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'other', 'hfl', 'elbow', 'straight']
component_id 11 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'other', 'hfl', 'elbow', 'straight']
orientation 10 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'hfl', 'elbow', 'straight']
component_type_id 10 ['threaded', 'sleeve', 'adaptor', 'nut', 'float', 'tee', 'boss', 'hfl', 'elbow', 'straight']
unique_feature 7 ['threaded', 'sleeve', 'adaptor', 'tee', 'boss', 'elbow', 'straight']
bolt_pattern_wide 5 ['float', 'tee', 'boss', 'elbow', 'straight']
overall_length 5 ['threaded', 'adaptor', 'tee', 'elbow', 'straight']
bolt_pattern_long 5 ['float', 'tee', 'boss', 'elbow', 'straight']
thickness 4 ['float', 'tee', 'elbow', 'straight']
groove 4 ['tee', 'boss', 'elbow', 'straight']
mj_class_code 3 ['tee', 'elbow', 'straight']
nominal_size_2 2 ['threaded', 'adaptor']
nominal_size_1 2 ['threaded', 'adaptor']
adaptor_angle 2 ['threaded', 'adaptor']
connection_type_id 2 ['sleeve', 'boss']
length 2 ['sleeve', 'nut']
mj_plug_class_code 2 ['tee', 'elbow']
hex_size 2 ['threaded', 'adaptor']
connection_type_id_1 2 ['threaded', 'adaptor']
connection_type_id_2 2 ['threaded', 'adaptor']
thread_size_1 2 ['threaded', 'adaptor']
thread_size_2 2 ['threaded', 'adaptor']
plating 2 ['sleeve', 'hfl']
end_form_id_1 2 ['threaded', 'adaptor']
end_form_id_2 2 ['threaded', 'adaptor']
extension_length 2 ['tee', 'elbow']
drop_length 2 ['tee', 'elbow']
length_1 2 ['threaded', 'adaptor']
length_2 2 ['threaded', 'adaptor']
thread_pitch_1 2 ['threaded', 'adaptor']
thread_pitch_2 2 ['threaded', 'adaptor']
diameter 1 ['nut']
nominal_size_3 1 ['threaded']
nominal_size_4 1 ['threaded']
head_diameter 1 ['straight']
type 1 ['boss']
corresponding_shell 1 ['hfl']
outside_shape 1 ['boss']
base_diameter 1 ['boss']
elbow_angle 1 ['elbow']
base_type 1 ['boss']
seat_angle 1 ['nut']
thread_size_4 1 ['threaded']
connection_type_id_3 1 ['threaded']
connection_type_id_4 1 ['threaded']
thread_size_3 1 ['threaded']
material 1 ['hfl']
coupling_class 1 ['hfl']
thread_size 1 ['nut']
intended_nut_pitch 1 ['sleeve']
plug_diameter 1 ['elbow']
end_form_id_3 1 ['threaded']
hex_nut_size 1 ['nut']
end_form_id_4 1 ['threaded']
hose_diameter 1 ['hfl']
thread_pitch 1 ['nut']
shoulder_diameter 1 ['boss']
part_name 1 ['other']
intended_nut_thread 1 ['sleeve']
blind_hole 1 ['nut']
height_over_tube 1 ['boss']
length_3 1 ['threaded']
length_4 1 ['threaded']
thread_pitch_4 1 ['threaded']
thread_pitch_3 1 ['threaded']

In [258]:
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)
cinfo_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2047 entries, 0 to 2046
Data columns (total 48 columns):
adaptor_angle          55 non-null float64
base_diameter          57 non-null float64
base_type              124 non-null object
blind_hole             23 non-null object
bolt_pattern_long      2047 non-null float64
bolt_pattern_wide      2047 non-null float64
component_group_id     2047 non-null object
component_id           2047 non-null object
component_type_id      2047 non-null object
corresponding_shell    6 non-null object
coupling_class         6 non-null object
diameter               23 non-null float64
drop_length            175 non-null float64
elbow_angle            130 non-null float64
extension_length       174 non-null float64
groove                 2047 non-null bool
head_diameter          70 non-null float64
height_over_tube       147 non-null float64
hex_nut_size           42 non-null float64
hex_size               129 non-null float64
hose_diameter          6 non-null float64
intended_nut_pitch     50 non-null float64
intended_nut_thread    50 non-null float64
material               6 non-null object
mj_class_code          165 non-null object
mj_plug_class_code     44 non-null object
orientation            2047 non-null bool
outside_shape          124 non-null object
overall_length         2047 non-null float64
part_name              1001 non-null object
plating                56 non-null object
plug_diameter          7 non-null float64
seat_angle             15 non-null float64
shoulder_diameter      30 non-null float64
thickness              2047 non-null float64
type                   124 non-null object
unique_feature         2047 non-null bool
weight                 2047 non-null float64
cluster                361 non-null object
nominal_sizes          2047 non-null object
lengths                2047 non-null object
thread_sizes           2047 non-null object
connection_types       2047 non-null object
end_forms              2047 non-null object
thread_pitches         2047 non-null object
max_length             2047 non-null float64
min_thread_pitch       2047 non-null float64
min_thread_size        2047 non-null float64
dtypes: bool(3), float64(24), object(21)
memory usage: 741.6+ KB

In [195]:
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.6 s, sys: 72 ms, total: 13.7 s
Wall time: 13.9 s

In [211]:
from soln.utils import count_components
train_counts = count_components(aug_train_set, cinfo_df)
train_counts.rename(columns={'count': 'train_count'}, inplace=True)
test_counts = count_components(aug_test_set, cinfo_df)
test_counts.rename(columns={'count': 'test_count'}, inplace=True)
all_counts = cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
all_counts = all_counts.merge(train_counts, on='component_id')
all_counts = all_counts.merge(test_counts, on='component_id')

In [212]:
all_counts


Out[212]:
component_id component_type_id component_group_id train_count test_count
0 C-0007 CP-014 threaded 13 7
1 C-0030 CP-015 threaded 0 0
2 C-0041 CP-014 threaded 0 0
3 C-0043 CP-014 threaded 0 0
4 C-0044 CP-014 threaded 0 0
5 C-0069 CP-015 threaded 0 0
6 C-0070 CP-015 threaded 0 0
7 C-0072 CP-015 threaded 0 0
8 C-0073 CP-015 threaded 0 0
9 C-0074 CP-014 threaded 0 0
10 C-0077 CP-014 threaded 0 1
11 C-0078 CP-014 threaded 0 1
12 C-0119 CP-015 threaded 0 0
13 C-0133 CP-014 threaded 10 12
14 C-0159 CP-016 threaded 2 0
15 C-0173 CP-014 threaded 1 0
16 C-0178 CP-014 threaded 1 0
17 C-0179 CP-014 threaded 0 0
18 C-0180 CP-014 threaded 0 0
19 C-0181 CP-014 threaded 0 1
20 C-0182 CP-014 threaded 0 0
21 C-0189 CP-015 threaded 1 0
22 C-0210 CP-015 threaded 25 23
23 C-0211 CP-015 threaded 25 22
24 C-0212 CP-015 threaded 2 3
25 C-0216 CP-014 threaded 2 0
26 C-0217 CP-014 threaded 2 2
27 C-0218 CP-014 threaded 4 1
28 C-0223 CP-016 threaded 1 1
29 C-0240 CP-014 threaded 0 0
... ... ... ... ... ...
2017 C-1895 CP-002 straight 0 0
2018 C-1897 CP-006 straight 1 0
2019 C-1900 CP-007 straight 1 0
2020 C-1901 CP-002 straight 5 1
2021 C-1904 CP-002 straight 0 3
2022 C-1943 CP-004 straight 4 2
2023 C-1946 CP-003 straight 1 1
2024 C-1947 CP-003 straight 5 6
2025 C-1953 CP-004 straight 0 1
2026 C-1956 CP-004 straight 3 2
2027 C-1958 CP-003 straight 0 1
2028 C-1983 CP-004 straight 1 0
2029 C-1986 CP-003 straight 1 1
2030 C-1996 CP-004 straight 0 1
2031 C-1997 CP-002 straight 2 0
2032 C-1999 CP-007 straight 1 2
2033 C-2000 CP-002 straight 2 0
2034 C-2009 CP-002 straight 0 1
2035 C-2015 CP-002 straight 2 4
2036 C-2016 CP-002 straight 1 0
2037 C-2017 CP-002 straight 3 1
2038 C-2018 CP-002 straight 0 1
2039 C-2019 CP-002 straight 6 2
2040 C-2020 CP-002 straight 1 2
2041 C-2021 CP-002 straight 3 6
2042 C-2022 CP-002 straight 6 1
2043 C-2023 CP-002 straight 3 4
2044 C-2034 CP-007 straight 0 0
2045 C-2043 CP-004 straight 5 7
2046 C-2044 CP-006 straight 2 1

2047 rows × 5 columns


In [214]:
print len(all_counts)
print len(all_counts[(all_counts.train_count > 0) & (all_counts.test_count == 0)])
print len(all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)])


2047
520
482

In [223]:
bins = [(0, 0), (1, 1), (2, 5), (5, 10), (10, 20), (20, 50), (50, 100), (100, np.inf)]
all_counts['train_bin'] = -1
for i, (cmin, cmax) in enumerate(bins):
    all_counts.train_bin.loc[(all_counts.train_count >= cmin) & (all_counts.train_count <= cmax)] = i
all_counts['test_bin'] = -1
for i, (cmin, cmax) in enumerate(bins):
    all_counts.test_bin.loc[(all_counts.test_count >= cmin) & (all_counts.test_count <= cmax)] = i

In [240]:
grouped = all_counts.groupby(['train_bin', 'test_bin'])
df = grouped.size().unstack()
df


Out[240]:
test_bin 0 1 2 3 4 5 6 7
train_bin
0 346 399 81 2 NaN NaN NaN NaN
1 407 140 83 9 NaN NaN NaN NaN
2 111 94 112 38 2 NaN NaN NaN
3 2 5 43 37 8 NaN NaN NaN
4 NaN NaN NaN 13 41 2 NaN NaN
5 NaN NaN NaN NaN 9 23 1 NaN
6 NaN NaN NaN NaN NaN 3 6 NaN
7 NaN NaN NaN NaN NaN NaN NaN 30

In [242]:
in_train_not_test = all_counts[(all_counts.train_count > 0) & (all_counts.test_count == 0)]
print in_train_not_test.component_group_id.value_counts()
in_test_not_train = all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)]
print in_test_not_train.component_group_id.value_counts()


other       313
straight     70
elbow        36
boss         36
threaded     31
adaptor      13
nut           9
float         7
hfl           3
sleeve        1
tee           1
dtype: int64
other       301
straight     77
elbow        35
boss         27
threaded     20
nut           8
sleeve        4
adaptor       4
hfl           2
tee           2
float         2
dtype: int64

In [243]:
df_other = group_dfs['other']
df_other.part_name.value_counts()


Out[243]:
FLANGE              158
PLATE                94
TUBE                 91
ADAPTER              53
BOSS                 41
ELBOW                34
BLOCK                27
BRACKET              27
TUBE AS              27
FITTING              12
CLIP                 10
ORIFICE              10
CONNECTOR-WELD        9
NUT-WELD              8
TUBE AS.              8
WASHER                8
HEAD-FLANGED          8
NUT-A/C               7
CONNECTOR             7
ADAPTER-OIL LIN       6
NUT                   6
LUG                   6
SPACER                6
COUPLING AS           6
ELBOW-HYDRAULIC       6
SEAL-O-RING           6
PIPE                  5
RING                  5
COUPLING              5
NUT-FUEL INJ          5
                   ... 
PLUG-PIPE             1
TUBE AS-RH            1
TUBE AS-O SUPPL       1
ADAPTER RING          1
SHEET                 1
COUPLING-PIPE         1
SLEEVE-REDUCING       1
HASP AS               1
BLOCK CONNECTION      1
ELBOW-AIR (CRS)       1
CAP-ORFS              1
TEE-ORFS BRAZE        1
FILLER AS             1
PLUG-DETENT           1
SCREEN AS             1
TEE-FLARED            1
ADAPTER-OIL (GA       1
ADAPTER - GAGE        1
PLUG-CORE             1
ELBOW-OIL INLET       1
FLANGE-ONE PIECE      1
CAP AS-O FILLER       1
ANGLE AS              1
ELBOW-CAST            1
PIN-SPRING            1
WASHER-HARD           1
RIVET-POP             1
SHROUD                1
COUPLING AS-PER       1
ADAPTER-STR           1
dtype: int64

In [282]:
plot(df_other.part_name.value_counts().values)


Out[282]:
[<matplotlib.lines.Line2D at 0x95d1110>]

In [280]:
flared = ['flared' in part_name.lower() for part_name in df_other.part_name.values]
df_other[flared].sort('component_id')


Out[280]:
component_id part_name weight component_group_id
130 C-0225 NUT-FLARED 0.070 other
477 C-0901 NUT-37° FLARED 0.015 other
834 C-1237 TEE-FLARED 0.409 other
333 C-1380 NUT-FLARED 0.290 other
387 C-1381 SLEEVE-FLARED 0.033 other
176 C-1382 NUT-FLARED 0.025 other
293 C-1383 SLEEVE-FLARED 0.009 other
0 C-1385 NUT-FLARED 0.014 other
1 C-1386 SLEEVE-FLARED 0.005 other

In [285]:
in_test_not_train[in_test_not_train.component_group_id == 'other'].sort('test_count', ascending=False)


Out[285]:
component_id component_type_id component_group_id train_count test_count train_bin test_bin
575 C-1402 other other 0 7 0 3
634 C-0085 other other 0 4 0 2
637 C-1978 other other 0 4 0 2
642 C-0121 other other 0 4 0 2
682 C-0749 other other 0 3 0 2
722 C-1184 other other 0 3 0 2
716 C-1083 other other 0 3 0 2
709 C-1032 other other 0 3 0 2
688 C-1490 other other 0 3 0 2
677 C-1382 other other 0 3 0 2
670 C-1311 other other 0 3 0 2
872 C-1160 other other 0 2 0 2
836 C-0639 other other 0 2 0 2
834 C-1380 other other 0 2 0 2
838 C-0609 other other 0 2 0 2
857 C-0412 other other 0 2 0 2
859 C-1068 other other 0 2 0 2
868 C-1130 other other 0 2 0 2
871 C-1157 other other 0 2 0 2
895 C-1309 other other 0 2 0 2
879 C-1283 other other 0 2 0 2
881 C-1298 other other 0 2 0 2
884 C-1831 other other 0 2 0 2
888 C-1381 other other 0 2 0 2
894 C-1940 other other 0 2 0 2
819 C-0145 other other 0 2 0 2
908 C-1007 other other 0 2 0 2
909 C-1169 other other 0 2 0 2
821 C-0201 other other 0 2 0 2
853 C-1022 other other 0 2 0 2
... ... ... ... ... ... ... ...
1085 C-0375 other other 0 1 0 1
1053 C-0906 other other 0 1 0 1
1054 C-0089 other other 0 1 0 1
1057 C-0233 other other 0 1 0 1
1059 C-0725 other other 0 1 0 1
1063 C-0525 other other 0 1 0 1
1064 C-0505 other other 0 1 0 1
1065 C-0645 other other 0 1 0 1
1069 C-1674 other other 0 1 0 1
1070 C-1800 other other 0 1 0 1
1075 C-0902 other other 0 1 0 1
1077 C-0167 other other 0 1 0 1
1081 C-0788 other other 0 1 0 1
1083 C-1971 other other 0 1 0 1
1087 C-1933 other other 0 1 0 1
1119 C-0396 other other 0 1 0 1
1088 C-0024 other other 0 1 0 1
1089 C-0032 other other 0 1 0 1
1092 C-0109 other other 0 1 0 1
1093 C-0114 other other 0 1 0 1
1096 C-0136 other other 0 1 0 1
1100 C-1504 other other 0 1 0 1
1102 C-1573 other other 0 1 0 1
1104 C-1367 other other 0 1 0 1
1106 C-0280 other other 0 1 0 1
1111 C-0326 other other 0 1 0 1
1112 C-0088 other other 0 1 0 1
1113 C-0310 other other 0 1 0 1
1118 C-0346 other other 0 1 0 1
1501 C-1049 other other 0 1 0 1

301 rows × 7 columns


In [276]:
df = all_counts[all_counts.component_group_id == 'elbow'].copy()
df.sort('train_count', ascending=False, inplace=True)
df[:10]


Out[276]:
component_id component_type_id component_group_id train_count test_count train_bin test_bin
1625 C-1565 CP-010 elbow 10 11 4 4
1590 C-1208 CP-008 elbow 10 6 4 3
1615 C-1428 CP-008 elbow 7 8 3 3
1603 C-1345 CP-010 elbow 7 5 3 3
1594 C-1317 CP-008 elbow 6 3 3 2
1553 C-0401 CP-008 elbow 6 3 3 2
1631 C-1587 CP-012 elbow 5 6 3 3
1678 C-1935 CP-008 elbow 5 3 3 2
1611 C-1404 CP-008 elbow 5 1 3 1
1605 C-1349 CP-008 elbow 4 3 2 2

In [277]:
plot(df.train_count.values)


Out[277]:
[<matplotlib.lines.Line2D at 0x9824290>]

In [278]:
freq_cids = df.component_id[df.train_count >= 3].values
print len(freq_cids)
freq_cids


22
Out[278]:
array(['C-1565', 'C-1208', 'C-1428', 'C-1345', 'C-1317', 'C-0401',
       'C-1587', 'C-1935', 'C-1404', 'C-1349', 'C-1585', 'C-1527',
       'C-1586', 'C-1783', 'C-1598', 'C-0063', 'C-1405', 'C-0598',
       'C-0616', 'C-1432', 'C-1500', 'C-1006'], dtype=object)

In [ ]:
straight = all_counts[all_counts.component_group_id == 'straight'].copy()
straight.sort('train_count', ascending=False, inplace=True)
straight[:10]

In [295]:
known_cids = set(all_counts.component_id[all_counts.train_count > 0].values)
print len(all_counts), len(known_cids)
known_cids


2047 1219
Out[295]:
{'C-0459',
 'C-0513',
 'C-0516',
 'C-0517',
 'C-0518',
 'C-0453',
 'C-0455',
 'C-0457',
 'C-0456',
 'C-0294',
 'C-0047',
 'C-0965',
 'C-1698',
 'C-0967',
 'C-0966',
 'C-1044',
 'C-0802',
 'C-0803',
 'C-0800',
 'C-0801',
 'C-0808',
 'C-1168',
 'C-0804',
 'C-1129',
 'C-0356',
 'C-0605',
 'C-0354',
 'C-0355',
 'C-0352',
 'C-0353',
 'C-0350',
 'C-0218',
 'C-0217',
 'C-0216',
 'C-0215',
 'C-0214',
 'C-0213',
 'C-0212',
 'C-0211',
 'C-0210',
 'C-1867',
 'C-1866',
 'C-1863',
 'C-1862',
 'C-1861',
 'C-1860',
 'C-1944',
 'C-1945',
 'C-1946',
 'C-1514',
 'C-1941',
 'C-1869',
 'C-1868',
 'C-1690',
 'C-0880',
 'C-1983',
 'C-0061',
 'C-1790',
 'C-0062',
 'C-1223',
 'C-1222',
 'C-0017',
 'C-1227',
 'C-1229',
 'C-0064',
 'C-1154',
 'C-1555',
 'C-1164',
 'C-0598',
 'C-0560',
 'C-1152',
 'C-1677',
 'C-0063',
 'C-1452',
 'C-0554',
 'C-1898',
 'C-1598',
 'C-1456',
 'C-1457',
 'C-1455',
 'C-0102',
 'C-1458',
 'C-0100',
 'C-1597',
 'C-1596',
 'C-1595',
 'C-1294',
 'C-0648',
 'C-1359',
 'C-1358',
 'C-1292',
 'C-1293',
 'C-1355',
 'C-0222',
 'C-1357',
 'C-1356',
 'C-1351',
 'C-1353',
 'C-0498',
 'C-1577',
 'C-1679',
 'C-1571',
 'C-1570',
 'C-0964',
 'C-0082',
 'C-1673',
 'C-0884',
 'C-1671',
 'C-1676',
 'C-1578',
 'C-0122',
 'C-0711',
 'C-0125',
 'C-0152',
 'C-1026',
 'C-1687',
 'C-0076',
 'C-1683',
 'C-1682',
 'C-0071',
 'C-0081',
 'C-0774',
 'C-0775',
 'C-0776',
 'C-0777',
 'C-0770',
 'C-0771',
 'C-0772',
 'C-1688',
 'C-1179',
 'C-0504',
 'C-1072',
 'C-1074',
 'C-1075',
 'C-0178',
 'C-1077',
 'C-1078',
 'C-0177',
 'C-1173',
 'C-1172',
 'C-0172',
 'C-0173',
 'C-0170',
 'C-0171',
 'C-0796',
 'C-1707',
 'C-0794',
 'C-0795',
 'C-0792',
 'C-0793',
 'C-0790',
 'C-0791',
 'C-0656',
 'C-0655',
 'C-0654',
 'C-0652',
 'C-0651',
 'C-0650',
 'C-1982',
 'C-0929',
 'C-0252',
 'C-0251',
 'C-0250',
 'C-0257',
 'C-0357',
 'C-0921',
 'C-0923',
 'C-0258',
 'C-0924',
 'C-0927',
 'C-0318',
 'C-1770',
 'C-1777',
 'C-0312',
 'C-0313',
 'C-1779',
 'C-1921',
 'C-0317',
 'C-0314',
 'C-0315',
 'C-0420',
 'C-1889',
 'C-0422',
 'C-1828',
 'C-0424',
 'C-0425',
 'C-0426',
 'C-0427',
 'C-0428',
 'C-0429',
 'C-1821',
 'C-0351',
 'C-0968',
 'C-0247',
 'C-0342',
 'C-0636',
 'C-1444',
 'C-0855',
 'C-0857',
 'C-0856',
 'C-0850',
 'C-0853',
 'C-0852',
 'C-1434',
 'C-1686',
 'C-0859',
 'C-0858',
 'C-0226',
 'C-0227',
 'C-0556',
 'C-0225',
 'C-0550',
 'C-0223',
 'C-0552',
 'C-0553',
 'C-0495',
 'C-0494',
 'C-0497',
 'C-0496',
 'C-0558',
 'C-0559',
 'C-0228',
 'C-0229',
 'C-1917',
 'C-1916',
 'C-1914',
 'C-1913',
 'C-1911',
 'C-1910',
 'C-1919',
 'C-1081',
 'C-1080',
 'C-1082',
 'C-1085',
 'C-1084',
 'C-1087',
 'C-1859',
 'C-1089',
 'C-1088',
 'C-1852',
 'C-1850',
 'C-1949',
 'C-1310',
 'C-1313',
 'C-1312',
 'C-1314',
 'C-1317',
 'C-1316',
 'C-1526',
 'C-1259',
 'C-0079',
 'C-1750',
 'C-1253',
 'C-1122',
 'C-1256',
 'C-1257',
 'C-1538',
 'C-1947',
 'C-1531',
 'C-1530',
 'C-1533',
 'C-1535',
 'C-1534',
 'C-1536',
 'C-1943',
 'C-1362',
 'C-0132',
 'C-1035',
 'C-0130',
 'C-1037',
 'C-1030',
 'C-1031',
 'C-0134',
 'C-0135',
 'C-1768',
 'C-0031',
 'C-0139',
 'C-0037',
 'C-1079',
 'C-0035',
 'C-0362',
 'C-2031',
 'C-1405',
 'C-1404',
 'C-1400',
 'C-1403',
 'C-1625',
 'C-1624',
 'C-1627',
 'C-1626',
 'C-1621',
 'C-1620',
 'C-1623',
 'C-1622',
 'C-1742',
 'C-1743',
 'C-1740',
 'C-1629',
 'C-1628',
 'C-1744',
 'C-1745',
 'C-0297',
 'C-2030',
 'C-0295',
 'C-2032',
 'C-0293',
 'C-0292',
 'C-2037',
 'C-1180',
 'C-1838',
 'C-0299',
 'C-1839',
 'C-0727',
 'C-1126',
 'C-0720',
 'C-0684',
 'C-0685',
 'C-0680',
 'C-1656',
 'C-0682',
 'C-0683',
 'C-1548',
 'C-1102',
 'C-1103',
 'C-1104',
 'C-1105',
 'C-0048',
 'C-1107',
 'C-0046',
 'C-1109',
 'C-0447',
 'C-0045',
 'C-0093',
 'C-1651',
 'C-0664',
 'C-0665',
 'C-0662',
 'C-1120',
 'C-0660',
 'C-0508',
 'C-0507',
 'C-1121',
 'C-0503',
 'C-0502',
 'C-0501',
 'C-0500',
 'C-1795',
 'C-1182',
 'C-0704',
 'C-1615',
 'C-1797',
 'C-0819',
 'C-1658',
 'C-0813',
 'C-0812',
 'C-0815',
 'C-0814',
 'C-0817',
 'C-0816',
 'C-0345',
 'C-0599',
 'C-0347',
 'C-0269',
 'C-0617',
 'C-0616',
 'C-0615',
 'C-1228',
 'C-0263',
 'C-0260',
 'C-0261',
 'C-0349',
 'C-0348',
 'C-0596',
 'C-0597',
 'C-1812',
 'C-1813',
 'C-1811',
 'C-1817',
 'C-1814',
 'C-1815',
 'C-1952',
 'C-1951',
 'C-1950',
 'C-1957',
 'C-0527',
 'C-1954',
 'C-1459',
 'C-1218',
 'C-1219',
 'C-1057',
 'C-1214',
 'C-1215',
 'C-1216',
 'C-1210',
 'C-0196',
 'C-1213',
 'C-0466',
 'C-0197',
 'C-0460',
 'C-0462',
 'C-0190',
 'C-0469',
 'C-1144',
 'C-1365',
 'C-0676',
 'C-1119',
 'C-1785',
 'C-0446',
 'C-1051',
 'C-1043',
 'C-1116',
 'C-1926',
 'C-1140',
 'C-1924',
 'C-1925',
 'C-1922',
 'C-1115',
 'C-1920',
 'C-1141',
 'C-1441',
 'C-1888',
 'C-1445',
 'C-1142',
 'C-1449',
 'C-0198',
 'C-1883',
 'C-1882',
 'C-1885',
 'C-1884',
 'C-0199',
 'C-1368',
 'C-1369',
 'C-1280',
 'C-1058',
 'C-1285',
 'C-1360',
 'C-1289',
 'C-1059',
 'C-1364',
 'C-0761',
 'C-1366',
 'C-1669',
 'C-1668',
 'C-1166',
 'C-2038',
 'C-1545',
 'C-1547',
 'C-1661',
 'C-1660',
 'C-1663',
 'C-0095',
 'C-1664',
 'C-1667',
 'C-1666',
 'C-0390',
 'C-0391',
 'C-0146',
 'C-0395',
 'C-0491',
 'C-0398',
 'C-0002',
 'C-0003',
 'C-1146',
 'C-0001',
 'C-0006',
 'C-0007',
 'C-0004',
 'C-1143',
 'C-0762',
 'C-0008',
 'C-0766',
 'C-0765',
 'C-0764',
 'C-0169',
 'C-0168',
 'C-1069',
 'C-0490',
 'C-0165',
 'C-0164',
 'C-0166',
 'C-0161',
 'C-1062',
 'C-1061',
 'C-0162',
 'C-1715',
 'C-1714',
 'C-0628',
 'C-1716',
 'C-0781',
 'C-0780',
 'C-0783',
 'C-0133',
 'C-0622',
 'C-0623',
 'C-0620',
 'C-0621',
 'C-0626',
 'C-1718',
 'C-0624',
 'C-0577',
 'C-1067',
 'C-0917',
 'C-0911',
 'C-0912',
 'C-0918',
 'C-0604',
 'C-2040',
 'C-2042',
 'C-2043',
 'C-2044',
 'C-2045',
 'C-2046',
 'C-2047',
 'C-0309',
 'C-0308',
 'C-0492',
 'C-0719',
 'C-1786',
 'C-0717',
 'C-0302',
 'C-0712',
 'C-0713',
 'C-0307',
 'C-1781',
 'C-0785',
 'C-0534',
 'C-0532',
 'C-0533',
 'C-0439',
 'C-0438',
 'C-0437',
 'C-0435',
 'C-0434',
 'C-0433',
 'C-0432',
 'C-0539',
 'C-0629',
 'C-1711',
 'C-0979',
 'C-0860',
 'C-0862',
 'C-0863',
 'C-0864',
 'C-0865',
 'C-0867',
 'C-0868',
 'C-0869',
 'C-0977',
 'C-0237',
 'C-0123',
 'C-0547',
 'C-0230',
 'C-0544',
 'C-0549',
 'C-0548',
 'C-0239',
 'C-0784',
 'C-1962',
 'C-1963',
 'C-1960',
 'C-1961',
 'C-1966',
 'C-1967',
 'C-1965',
 'C-1719',
 'C-1969',
 'C-1599',
 'C-0627',
 'C-0679',
 'C-1451',
 'C-1849',
 'C-1848',
 'C-1345',
 'C-1845',
 'C-1844',
 'C-1847',
 'C-1846',
 'C-1841',
 'C-1843',
 'C-1842',
 'C-1247',
 'C-1246',
 'C-1245',
 'C-1244',
 'C-1243',
 'C-1242',
 'C-0915',
 'C-1327',
 'C-1321',
 'C-1322',
 'C-0022',
 'C-1893',
 'C-1093',
 'C-0224',
 'C-0148',
 'C-1591',
 'C-1288',
 'C-1891',
 'C-0175',
 'C-1748',
 'C-0618',
 'C-1897',
 'C-1749',
 'C-1894',
 'C-1098',
 'C-0120',
 'C-1021',
 'C-1020',
 'C-1027',
 'C-0124',
 'C-1025',
 'C-0126',
 'C-0129',
 'C-0128',
 'C-1029',
 'C-1183',
 'C-1187',
 'C-1439',
 'C-0703',
 'C-1430',
 'C-1431',
 'C-1432',
 'C-1433',
 'C-0739',
 'C-1435',
 'C-1436',
 'C-1437',
 'C-1759',
 'C-1758',
 'C-1612',
 'C-1613',
 'C-1614',
 'C-0894',
 'C-1616',
 'C-1751',
 'C-1619',
 'C-1752',
 'C-1300',
 'C-1757',
 'C-1756',
 'C-2008',
 'C-1017',
 'C-2004',
 'C-2005',
 'C-2006',
 'C-2007',
 'C-2000',
 'C-2001',
 'C-2002',
 'C-2003',
 'C-0752',
 'C-0753',
 'C-0699',
 'C-0751',
 'C-0756',
 'C-0757',
 'C-0754',
 'C-0755',
 'C-0690',
 'C-0697',
 'C-0696',
 'C-0194',
 'C-1505',
 'C-1506',
 'C-1507',
 'C-1500',
 'C-0058',
 'C-1502',
 'C-1503',
 'C-1117',
 'C-0054',
 'C-0057',
 'C-1114',
 'C-0051',
 'C-0050',
 'C-0053',
 'C-0052',
 'C-0675',
 'C-0674',
 'C-0578',
 'C-0579',
 'C-0671',
 'C-0670',
 'C-0673',
 'C-0672',
 'C-0572',
 'C-0573',
 'C-0576',
 'C-0678',
 'C-0575',
 'C-0825',
 'C-0826',
 'C-0827',
 'C-0821',
 'C-0823',
 'C-1820',
 'C-1590',
 'C-0828',
 'C-0829',
 'C-0585',
 'C-0583',
 'C-0581',
 'C-0580',
 'C-1806',
 'C-0306',
 'C-0278',
 'C-0372',
 'C-0376',
 'C-0377',
 'C-0378',
 'C-1354',
 'C-0298',
 'C-0275',
 'C-0274',
 'C-1801',
 'C-1291',
 'C-0404',
 'C-0405',
 'C-0402',
 'C-0400',
 'C-0401',
 'C-1809',
 'C-1808',
 'C-0734',
 'C-0409',
 'C-1161',
 'C-0300',
 'C-1618',
 'C-1118',
 'C-1209',
 'C-1208',
 'C-0370',
 'C-1203',
 'C-1782',
 'C-1201',
 'C-1200',
 'C-1207',
 'C-1206',
 'C-1205',
 'C-1783',
 'C-0473',
 'C-0472',
 'C-1499',
 'C-0470',
 'C-0710',
 'C-0475',
 'C-0474',
 'C-0479',
 'C-0478',
 'C-1331',
 'C-1239',
 'C-1330',
 'C-0787',
 'C-1386',
 'C-1387',
 'C-1384',
 'C-1385',
 'C-0988',
 'C-1938',
 'C-1935',
 'C-1934',
 'C-1937',
 'C-1936',
 'C-1930',
 'C-1540',
 'C-1377',
 'C-0219',
 'C-1375',
 'C-1374',
 'C-1478',
 'C-1479',
 'C-1101',
 'C-1474',
 'C-1475',
 'C-1476',
 'C-1477',
 'C-1470',
 'C-1379',
 'C-1473',
 'C-1278',
 'C-1275',
 'C-1541',
 'C-1050',
 'C-1270',
 'C-1271',
 'C-1496',
 'C-1497',
 'C-1494',
 'C-0980',
 'C-0998',
 'C-1106',
 'C-1559',
 'C-1491',
 'C-1557',
 'C-0987',
 'C-0996',
 'C-1554',
 'C-1986',
 'C-1551',
 'C-0993',
 'C-0380',
 'C-0383',
 'C-0385',
 'C-0786',
 'C-0387',
 'C-0386',
 'C-0389',
 'C-0388',
 'C-1948',
 'C-1717',
 'C-1980',
 'C-1981',
 'C-1856',
 'C-0012',
 'C-1985',
 'C-1155',
 'C-1987',
 'C-1988',
 'C-1159',
 'C-1158',
 'C-1286',
 'C-1113',
 'C-0159',
 'C-1018',
 'C-1012',
 'C-1010',
 'C-0153',
 'C-0155',
 'C-1015',
 'C-0638',
 'C-1722',
 'C-1723',
 'C-1724',
 'C-1725',
 'C-1726',
 'C-1727',
 'C-1728',
 'C-1729',
 'C-0633',
 'C-0635',
 'C-0634',
 'C-1112',
 'C-0903',
 'C-0900',
 'C-1689',
 'C-0738',
 'C-0904',
 'C-0011',
 'C-0663',
 'C-0334',
 'C-1070',
 'C-0336',
 'C-0337',
 'C-0330',
 'C-0332',
 'C-1178',
 'C-0661',
 'C-1111',
 'C-1654',
 'C-1655',
 'C-1124',
 'C-1125',
 'C-1650',
 'C-1123',
 'C-1652',
 'C-1653',
 'C-0705',
 'C-1794',
 'C-0707',
 'C-0706',
 'C-0701',
 'C-1659',
 'C-1793',
 'C-0476',
 'C-0471',
 'C-1680',
 'C-0448',
 'C-0449',
 'C-0521',
 'C-0520',
 'C-0890',
 'C-0522',
 'C-0442',
 'C-0443',
 'C-0440',
 'C-0441',
 'C-0529',
 'C-0528',
 'C-0444',
 'C-0445',
 'C-1734',
 'C-1170',
 'C-0143',
 'C-0266',
 'C-0974',
 'C-0975',
 'C-1999',
 'C-0877',
 'C-0876',
 'C-0873',
 'C-0872',
 'C-0871',
 'C-1174',
 'C-0200',
 'C-0202',
 'C-0203',
 'C-1558',
 'C-0205',
 'C-0206',
 'C-0208',
 'C-0209',
 'C-1352',
 'C-1970',
 'C-1973',
 'C-1972',
 'C-1975',
 'C-1977',
 'C-1976',
 'C-1979',
 'C-0021',
 'C-1878',
 'C-1879',
 'C-1389',
 'C-1991',
 'C-1870',
 'C-1871',
 'C-1872',
 'C-1873',
 'C-1874',
 'C-1877',
 'C-1232',
 'C-1233',
 'C-1230',
 'C-1231',
 'C-0659',
 'C-1235',
 'C-1333',
 'C-1332',
 'C-1238',
 'C-0658',
 'C-1549',
 'C-1336',
 'C-0540',
 'C-0265',
 'C-1189',
 'C-1518',
 'C-0333',
 'C-1196',
 'C-1195',
 'C-1194',
 'C-1708',
 'C-1000',
 'C-0799',
 'C-1003',
 'C-1198',
 'C-1427',
 'C-1426',
 'C-1425',
 'C-1415',
 'C-1588',
 'C-1589',
 'C-1421',
 'C-1420',
 'C-1584',
 'C-1585',
 'C-1586',
 'C-1587',
 'C-1580',
 'C-1581',
 'C-1582',
 'C-1428',
 'C-0625',
 'C-0728',
 'C-1348',
 'C-1349',
 'C-1344',
 'C-1006',
 'C-1343',
 'C-1340',
 'C-1341',
 'C-1567',
 'C-1564',
 'C-1565',
 'C-1562',
 'C-1563',
 'C-1560',
 'C-1607',
 'C-1603',
 'C-1881',
 'C-1600',
 'C-2019',
 'C-1429',
 'C-1544',
 'C-2017',
 'C-2016',
 'C-2015',
 'C-2014',
 'C-0740',
 'C-0742',
 'C-0745',
 ...}

In [297]:
has_unk = []
for cids in aug_test_set.components:
    has_unk.append(any([cid not in known_cids for cid in cids]))
print len(aug_test_set), len(has_unk)


30235 30235

In [301]:
df = aug_test_set[['tube_assembly_id', 'components']].copy()
df['has_unk'] = has_unk
print df.has_unk.value_counts()
print df.has_unk.value_counts(normalize=True)


False    29213
True      1022
dtype: int64
False    0.966198
True     0.033802
dtype: float64

In [312]:
df2 = df[['tube_assembly_id', 'has_unk']].drop_duplicates()
print len(df), len(df2)
print df2.has_unk.value_counts()
print df2.has_unk.value_counts(normalize=True)


30235 8856
False    8409
True      447
dtype: int64
False    0.949526
True     0.050474
dtype: float64

In [ ]: