In [14]:
# Pull numpy and matplotlib names into the interactive namespace; later cells
# rely on the bare names this provides (np, figsize, scatter, hist).
%pylab inline

from collections import defaultdict
from sklearn.preprocessing import LabelBinarizer
import pandas as pd

from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components

# Remove the cap on how many columns are shown when a frame is displayed.
pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [15]:
# Load the raw component tables and pick out the 'straight' group.
comp_types, group_dfs, cluster_dfs = load_raw_components()
straight = group_dfs['straight']

# Recode the flag columns as booleans: True where the raw value is 'Yes',
# False for anything else.
for flag_col in ['groove', 'unique_feature', 'orientation']:
    straight[flag_col] = straight[flag_col].eq('Yes')

straight.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 361 entries, 0 to 360
Data columns (total 12 columns):
component_id         361 non-null object
component_type_id    361 non-null object
bolt_pattern_long    291 non-null float64
bolt_pattern_wide    204 non-null float64
head_diameter        70 non-null float64
overall_length       41 non-null float64
thickness            361 non-null float64
mj_class_code        120 non-null object
groove               361 non-null bool
unique_feature       361 non-null bool
orientation          361 non-null bool
weight               354 non-null float64
dtypes: bool(3), float64(6), object(3)
memory usage: 29.3+ KB

In [16]:
# Convert mj_class_code to binary (one-hot) features.
# Missing codes get their own 'MJ-other' category. The filled values are kept
# in a separate Series instead of calling fillna(inplace=True) on an
# attribute-accessed column, because newer pandas does not guarantee that such
# a chained in-place call writes back to the parent frame.
mj_codes = straight.mj_class_code.fillna('MJ-other')
print(mj_codes.value_counts())
lb = LabelBinarizer()
bin_feats = lb.fit_transform(mj_codes.values)
print(lb.classes_)
# Give the indicator frame the same index as `straight` so the join matches
# rows by label rather than relying on `straight` having a default 0..n-1 index.
tmp_df = pd.DataFrame(bin_feats, columns=lb.classes_, index=straight.index)
# Drop the original categorical column and append the indicator columns.
straight = straight.drop('mj_class_code', axis=1).join(tmp_df)
straight.info()


MJ-other    241
MJ-003       71
MJ-001       46
MJ-002        2
MJ-007        1
dtype: int64
['MJ-001' 'MJ-002' 'MJ-003' 'MJ-007' 'MJ-other']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 361 entries, 0 to 360
Data columns (total 16 columns):
component_id         361 non-null object
component_type_id    361 non-null object
bolt_pattern_long    291 non-null float64
bolt_pattern_wide    204 non-null float64
head_diameter        70 non-null float64
overall_length       41 non-null float64
thickness            361 non-null float64
groove               361 non-null bool
unique_feature       361 non-null bool
orientation          361 non-null bool
weight               354 non-null float64
MJ-001               361 non-null int64
MJ-002               361 non-null int64
MJ-003               361 non-null int64
MJ-007               361 non-null int64
MJ-other             361 non-null int64
dtypes: bool(3), float64(6), int64(5), object(2)
memory usage: 40.5+ KB

In [17]:
# Convert component_type_id to binary (one-hot) features.
print(straight.component_type_id.value_counts())
lb = LabelBinarizer()
bin_feats = lb.fit_transform(straight.component_type_id.values)
print(lb.classes_)
# Give the indicator frame the same index as `straight` so the join matches
# rows by label rather than relying on `straight` having a default 0..n-1 index.
tmp_df = pd.DataFrame(bin_feats, columns=lb.classes_, index=straight.index)
# Drop the original categorical column and append the indicator columns.
straight = straight.drop('component_type_id', axis=1).join(tmp_df)
straight.info()


CP-002    93
CP-004    85
CP-003    83
CP-006    45
CP-001    28
CP-007    25
CP-005     2
dtype: int64
['CP-001' 'CP-002' 'CP-003' 'CP-004' 'CP-005' 'CP-006' 'CP-007']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 361 entries, 0 to 360
Data columns (total 22 columns):
component_id         361 non-null object
bolt_pattern_long    291 non-null float64
bolt_pattern_wide    204 non-null float64
head_diameter        70 non-null float64
overall_length       41 non-null float64
thickness            361 non-null float64
groove               361 non-null bool
unique_feature       361 non-null bool
orientation          361 non-null bool
weight               354 non-null float64
MJ-001               361 non-null int64
MJ-002               361 non-null int64
MJ-003               361 non-null int64
MJ-007               361 non-null int64
MJ-other             361 non-null int64
CP-001               361 non-null int64
CP-002               361 non-null int64
CP-003               361 non-null int64
CP-004               361 non-null int64
CP-005               361 non-null int64
CP-006               361 non-null int64
CP-007               361 non-null int64
dtypes: bool(3), float64(6), int64(12), object(1)
memory usage: 57.5+ KB

In [18]:
# Add an indicator column per feature recording whether the value was missing,
# then fill the gaps with that feature's mean.
for feat in ('bolt_pattern_long', 'bolt_pattern_wide', 'head_diameter', 'overall_length', 'weight'):
    # Record missingness before filling, otherwise the indicator is all False.
    straight[feat + '_missing'] = straight[feat].isnull()
    # Assign the filled column back explicitly: fillna(inplace=True) on
    # straight[feat] is a chained in-place call that newer pandas may apply to
    # a temporary instead of the frame itself.
    straight[feat] = straight[feat].fillna(straight[feat].mean())
straight.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 361 entries, 0 to 360
Data columns (total 27 columns):
component_id                 361 non-null object
bolt_pattern_long            361 non-null float64
bolt_pattern_wide            361 non-null float64
head_diameter                361 non-null float64
overall_length               361 non-null float64
thickness                    361 non-null float64
groove                       361 non-null bool
unique_feature               361 non-null bool
orientation                  361 non-null bool
weight                       361 non-null float64
MJ-001                       361 non-null int64
MJ-002                       361 non-null int64
MJ-003                       361 non-null int64
MJ-007                       361 non-null int64
MJ-other                     361 non-null int64
CP-001                       361 non-null int64
CP-002                       361 non-null int64
CP-003                       361 non-null int64
CP-004                       361 non-null int64
CP-005                       361 non-null int64
CP-006                       361 non-null int64
CP-007                       361 non-null int64
bolt_pattern_long_missing    361 non-null bool
bolt_pattern_wide_missing    361 non-null bool
head_diameter_missing        361 non-null bool
overall_length_missing       361 non-null bool
weight_missing               361 non-null bool
dtypes: bool(8), float64(6), int64(12), object(1)
memory usage: 59.2+ KB

In [19]:
# Get list of vectors to work with.
# Index the rows by component_id so only feature columns remain as data.
straight = straight.set_index('component_id', drop=True)
straight.to_csv('straight_vecs.csv')
# Cast every column (bools and ints included) to float. The builtin `float` is
# used because the `np.float` alias is deprecated and absent from newer NumPy.
straight_np = straight.astype(float).values
straight_np.shape


Out[19]:
(361, 26)

In [20]:
# Approach 1: Cluster agglomeratively, then extract flat clusters.

from scipy.cluster.hierarchy import linkage
from scipy.cluster.vq import whiten
# Rescale each feature column (scipy's whiten divides by the column's standard
# deviation) before computing the linkage.
y = whiten(straight_np)
# Build the hierarchical clustering with Ward linkage; %time reports the cost.
%time Z = linkage(y, method='ward')


CPU times: user 36 ms, sys: 4 ms, total: 40 ms
Wall time: 44 ms

In [21]:
from scipy.cluster.hierarchy import dendrogram
# Enlarge the figure, then draw the dendrogram for the linkage matrix Z.
figsize(14, 8)
%time out = dendrogram(Z)


CPU times: user 1.12 s, sys: 32 ms, total: 1.16 s
Wall time: 1.19 s

In [22]:
from scipy.cluster import hierarchy
fcluster = hierarchy.fcluster(Z, 10.0, criterion='distance')
print fcluster.shape, np.min(fcluster), np.max(fcluster)


(361,) 1 25

In [176]:
# Approach 2: Use K-means

from scipy.cluster import vq

y = vq.whiten(straight_np)
codebook, distortion = vq.kmeans(y, 20)
print len(codebook), distortion
fcluster, _ = vq.vq(y, codebook)


20 1.6973658663

In [23]:
# Pair every component id with its cluster label.
cid_to_clu = pd.Series(fcluster, index=straight.index, name='cluster')
# Order the rows by cluster label. The in-place Series.sort() used previously
# no longer exists in newer pandas; reordering by argsort works on any version.
cid_to_clu = cid_to_clu.iloc[np.argsort(cid_to_clu.values)]
cid_to_clu = cid_to_clu.reset_index()
# Number of components per cluster, largest first.
cid_to_clu.cluster.value_counts()


Out[23]:
16    45
13    36
23    29
15    26
1     26
10    25
21    25
12    25
20    19
6     17
19    13
2      9
17     8
24     8
3      8
25     8
14     7
18     7
11     5
5      5
4      4
9      2
8      2
22     1
7      1
dtype: int64

In [24]:
# Turn each numeric cluster label into a string label, pair it with its
# component id, and write the two-column table to CSV.
fcluster_str = list(map("straight_clu_{}".format, fcluster))
cid_to_clu_str = pd.Series(
    fcluster_str, index=straight.index, name='cluster').reset_index()
cid_to_clu_str.to_csv("clu_straight.csv", index=False)

In [ ]:


In [112]:
# Approach 3: PCA with 2 components, to plot and see if there are any obvious clusters.

from sklearn.decomposition import PCA
# NOTE(review): the fit uses straight_np as-is, whereas Approaches 1 and 2
# whiten the features first -- confirm the unscaled input is intended here.
pca = PCA(n_components=2, whiten=True)
%time pca.fit(straight_np)


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.1 ms
Out[112]:
PCA(copy=True, n_components=2, whiten=True)

In [113]:
# Share of the total variance attributed to each of the two fitted components.
pca.explained_variance_ratio_


Out[113]:
array([ 0.67927172,  0.17201545])

In [116]:
# Project every component onto the two fitted principal components and plot
# the first coordinate against the second.
pts = pca.transform(straight_np)
print(pts.shape)
pc1, pc2 = pts[:, 0], pts[:, 1]
scatter(pc1, pc2)


(361, 2)
Out[116]:
<matplotlib.collections.PathCollection at 0x9ee0f90>

In [ ]:


In [218]:
# Approach 4: map infrequent components to more frequent substitutes.
#
# Every component that is not in `freq_cids` is assigned the frequent component
# whose whitened feature vector is closest in Euclidean distance. Results:
#   cid_to_freq_cid: component id -> chosen frequent component id
#   cid_to_dist:     component id -> distance to that choice (0.0 for itself)

# Imported here so the cell does not depend on another cell having run first.
from scipy.cluster import vq
from scipy.spatial.distance import euclidean

# Components considered frequent enough to stand for themselves. Earlier
# experiments used only the first 9 or the first 26 entries of this list.
freq_cids = ['C-1739', 'C-1203', 'C-0244', 'C-0275', 'C-1555', 'C-1344',
       'C-1615', 'C-1750', 'C-1373', 'C-1670', 'C-0808', 'C-0703',
       'C-0170', 'C-1449', 'C-1352', 'C-1545', 'C-0679', 'C-1430',
       'C-0250', 'C-0618', 'C-1354', 'C-1614', 'C-2022', 'C-0122',
       'C-2019', 'C-0434', 'C-1901', 'C-0846', 'C-1343', 'C-2043',
       'C-1711', 'C-1947', 'C-1881', 'C-0829', 'C-0247', 'C-1768',
       'C-0485', 'C-0423', 'C-1843', 'C-1749', 'C-1740', 'C-1242',
       'C-1719', 'C-0162', 'C-1893', 'C-1943', 'C-1412', 'C-0292',
       'C-1678', 'C-1534', 'C-0568', 'C-1792', 'C-0172', 'C-1956',
       'C-2023', 'C-1811', 'C-2021', 'C-0734', 'C-1235', 'C-1880',
       'C-2017', 'C-1005']

all_cids = list(straight.index)

# First row position of every component id, so lookups do not rescan all_cids.
cid_to_row = {}
for row_idx, cid in enumerate(all_cids):
    cid_to_row.setdefault(cid, row_idx)

# Fail early, and name the offenders, if a frequent id is not in the table.
# (list.index() raises ValueError rather than returning -1, so the previous
# `assert row_idx != -1` could never fire.)
missing_cids = [cid for cid in freq_cids if cid not in cid_to_row]
if missing_cids:
    raise ValueError(
        "freq_cids not present in straight: {}".format(missing_cids))

# Rescale each feature column (scipy's whiten divides by the column's standard
# deviation) before measuring distances.
straight_np_wh = vq.whiten(straight_np)

# Whitened feature vector of each frequent component, in freq_cids order.
freq_cid_vecs = [straight_np_wh[cid_to_row[cid], :] for cid in freq_cids]
freq_cid_set = set(freq_cids)

cid_to_freq_cid = {}
cid_to_dist = {}
for row_idx, cid in enumerate(all_cids):
    if cid in freq_cid_set:
        # A frequent component is its own substitute.
        cid_to_freq_cid[cid] = cid
        cid_to_dist[cid] = 0.0
        print("cid {} is frequent enough".format(cid))
        continue

    # The row is the same for every candidate, so look it up once.
    row = straight_np_wh[row_idx, :]
    best_i = None
    best_dist = np.inf
    for i, freq_vec in enumerate(freq_cid_vecs):
        dist = euclidean(row, freq_vec)
        # Strict '<' keeps the earliest entry of freq_cids on ties.
        if dist < best_dist:
            best_dist = dist
            best_i = i
    cid_to_freq_cid[cid] = freq_cids[best_i]
    cid_to_dist[cid] = best_dist
    print("cid {} mapped to freq_cid {} with dist {}".format(cid, freq_cids[best_i], best_dist))


cid C-0012 mapped to freq_cid C-1768 with dist 0.00572083395993
cid C-0014 mapped to freq_cid C-1768 with dist 2.48103262195
cid C-0015 mapped to freq_cid C-1768 with dist 2.01303649676
cid C-0019 mapped to freq_cid C-2021 with dist 0.165916168176
cid C-0029 mapped to freq_cid C-1792 with dist 3.84755369882
cid C-0036 mapped to freq_cid C-1412 with dist 2.08913613284
cid C-0038 mapped to freq_cid C-1373 with dist 0.34580193603
cid C-0046 mapped to freq_cid C-2017 with dist 13.8679358888
cid C-0062 mapped to freq_cid C-1811 with dist 1.86693052278
cid C-0079 mapped to freq_cid C-0568 with dist 3.28931616597
cid C-0080 mapped to freq_cid C-1430 with dist 0.292100966583
cid C-0081 mapped to freq_cid C-1956 with dist 0.167002277025
cid C-0086 mapped to freq_cid C-1430 with dist 0.830196206164
cid C-0093 mapped to freq_cid C-1430 with dist 0.83015620797
cid C-0110 mapped to freq_cid C-0568 with dist 0.873807514456
cid C-0115 mapped to freq_cid C-1430 with dist 0.524291252267
cid C-0122 is frequent enough
cid C-0131 mapped to freq_cid C-0679 with dist 3.70809247932
cid C-0132 mapped to freq_cid C-1242 with dist 1.43253210862
cid C-0137 mapped to freq_cid C-0679 with dist 3.67080563026
cid C-0138 mapped to freq_cid C-1811 with dist 1.00010346656
cid C-0141 mapped to freq_cid C-1943 with dist 0.689574499523
cid C-0142 mapped to freq_cid C-0703 with dist 1.67193222358
cid C-0146 mapped to freq_cid C-1739 with dist 0.0846597030023
cid C-0147 mapped to freq_cid C-1843 with dist 4.39545103733
cid C-0150 mapped to freq_cid C-0618 with dist 1.4949389403
cid C-0154 mapped to freq_cid C-1893 with dist 0.0423341713034
cid C-0156 mapped to freq_cid C-1235 with dist 4.17452489667
cid C-0157 mapped to freq_cid C-1880 with dist 0.18395722623
cid C-0158 mapped to freq_cid C-1739 with dist 0.0823075331793
cid C-0162 is frequent enough
cid C-0166 mapped to freq_cid C-1956 with dist 0.58670910095
cid C-0170 is frequent enough
cid C-0172 is frequent enough
cid C-0174 mapped to freq_cid C-2021 with dist 0.886898382658
cid C-0190 mapped to freq_cid C-1880 with dist 4.36972688677
cid C-0191 mapped to freq_cid C-1943 with dist 0.850382617178
cid C-0193 mapped to freq_cid C-1811 with dist 0.967636217779
cid C-0202 mapped to freq_cid C-1943 with dist 0.776283967038
cid C-0205 mapped to freq_cid C-1881 with dist 4.27310411982
cid C-0219 mapped to freq_cid C-1792 with dist 5.42166036068
cid C-0220 mapped to freq_cid C-1792 with dist 4.9976817437
cid C-0221 mapped to freq_cid C-1792 with dist 4.94606605171
cid C-0229 mapped to freq_cid C-0172 with dist 0.0128033688365
cid C-0231 mapped to freq_cid C-1354 with dist 0.318941670344
cid C-0237 mapped to freq_cid C-0703 with dist 1.33554365977
cid C-0244 is frequent enough
cid C-0246 mapped to freq_cid C-1956 with dist 0.244228394836
cid C-0247 is frequent enough
cid C-0250 is frequent enough
cid C-0268 mapped to freq_cid C-0275 with dist 0.968311505444
cid C-0275 is frequent enough
cid C-0292 is frequent enough
cid C-0295 mapped to freq_cid C-1354 with dist 2.23130582179
cid C-0306 mapped to freq_cid C-1811 with dist 1.23653002604
cid C-0325 mapped to freq_cid C-1534 with dist 0.939299456248
cid C-0332 mapped to freq_cid C-0122 with dist 2.11290758453
cid C-0333 mapped to freq_cid C-0703 with dist 1.66903055104
cid C-0334 mapped to freq_cid C-0703 with dist 0.911480367602
cid C-0335 mapped to freq_cid C-0703 with dist 0.672441050993
cid C-0342 mapped to freq_cid C-1005 with dist 1.346888064
cid C-0343 mapped to freq_cid C-0679 with dist 0.51635148131
cid C-0344 mapped to freq_cid C-1412 with dist 0.0408646990732
cid C-0359 mapped to freq_cid C-1881 with dist 0.080091675439
cid C-0360 mapped to freq_cid C-1843 with dist 4.40248547509
cid C-0362 mapped to freq_cid C-1719 with dist 0.71138229099
cid C-0370 mapped to freq_cid C-1545 with dist 6.0829173488
cid C-0380 mapped to freq_cid C-1843 with dist 4.05070752747
cid C-0385 mapped to freq_cid C-1678 with dist 2.80156297108
cid C-0392 mapped to freq_cid C-0485 with dist 2.12406748658
cid C-0398 mapped to freq_cid C-1555 with dist 0.086308763586
cid C-0400 mapped to freq_cid C-1811 with dist 1.02381358899
cid C-0403 mapped to freq_cid C-1811 with dist 2.53887656558
cid C-0406 mapped to freq_cid C-0247 with dist 0.0798690354515
cid C-0423 is frequent enough
cid C-0434 is frequent enough
cid C-0450 mapped to freq_cid C-1235 with dist 0.0755314621064
cid C-0451 mapped to freq_cid C-1235 with dist 0.435511607422
cid C-0452 mapped to freq_cid C-1235 with dist 1.14550020129
cid C-0453 mapped to freq_cid C-0568 with dist 0.875484611497
cid C-0456 mapped to freq_cid C-0829 with dist 4.60263253121
cid C-0457 mapped to freq_cid C-0829 with dist 5.45462707475
cid C-0458 mapped to freq_cid C-0829 with dist 4.1399020039
cid C-0459 mapped to freq_cid C-0829 with dist 5.20248542758
cid C-0460 mapped to freq_cid C-0829 with dist 5.05490242339
cid C-0461 mapped to freq_cid C-1449 with dist 5.01893257292
cid C-0462 mapped to freq_cid C-1449 with dist 5.02145044209
cid C-0463 mapped to freq_cid C-0829 with dist 2.9295129816
cid C-0465 mapped to freq_cid C-0808 with dist 1.1259991377
cid C-0466 mapped to freq_cid C-1235 with dist 2.0151056396
cid C-0468 mapped to freq_cid C-1750 with dist 0.0683269316686
cid C-0471 mapped to freq_cid C-1943 with dist 7.27736215451
cid C-0479 mapped to freq_cid C-0434 with dist 0.273863282064
cid C-0480 mapped to freq_cid C-1749 with dist 7.30220285437
cid C-0485 is frequent enough
cid C-0492 mapped to freq_cid C-0703 with dist 2.49436320517
cid C-0498 mapped to freq_cid C-0703 with dist 1.33554365977
cid C-0500 mapped to freq_cid C-0485 with dist 3.46123897666
cid C-0509 mapped to freq_cid C-1943 with dist 0.305965477718
cid C-0510 mapped to freq_cid C-1678 with dist 2.34676311932
cid C-0532 mapped to freq_cid C-0485 with dist 3.46223714737
cid C-0538 mapped to freq_cid C-1678 with dist 3.16281126114
cid C-0543 mapped to freq_cid C-0485 with dist 3.46218417122
cid C-0546 mapped to freq_cid C-1792 with dist 5.3403994844
cid C-0551 mapped to freq_cid C-1843 with dist 0.393617231664
cid C-0554 mapped to freq_cid C-0244 with dist 0.286041697996
cid C-0568 is frequent enough
cid C-0573 mapped to freq_cid C-0703 with dist 3.22343755695
cid C-0591 mapped to freq_cid C-0703 with dist 4.85110423976
cid C-0599 mapped to freq_cid C-1843 with dist 0.295717951438
cid C-0601 mapped to freq_cid C-1430 with dist 0.65379263372
cid C-0602 mapped to freq_cid C-1880 with dist 0.0537989421637
cid C-0606 mapped to freq_cid C-0172 with dist 0.262124713809
cid C-0612 mapped to freq_cid C-0275 with dist 3.90793672197
cid C-0618 is frequent enough
cid C-0619 mapped to freq_cid C-1893 with dist 0.643830021151
cid C-0621 mapped to freq_cid C-0829 with dist 4.42335172005
cid C-0622 mapped to freq_cid C-1545 with dist 0.673704857892
cid C-0628 mapped to freq_cid C-1344 with dist 2.06576848862
cid C-0632 mapped to freq_cid C-0679 with dist 1.52371636395
cid C-0652 mapped to freq_cid C-0618 with dist 0.0
cid C-0653 mapped to freq_cid C-0122 with dist 2.18517867188
cid C-0679 is frequent enough
cid C-0698 mapped to freq_cid C-1943 with dist 1.36997349018
cid C-0703 is frequent enough
cid C-0709 mapped to freq_cid C-1343 with dist 0.0324145982073
cid C-0722 mapped to freq_cid C-1811 with dist 1.66506000206
cid C-0732 mapped to freq_cid C-1943 with dist 0.433944103504
cid C-0734 is frequent enough
cid C-0737 mapped to freq_cid C-1615 with dist 1.53282903444
cid C-0739 mapped to freq_cid C-1678 with dist 2.23808048098
cid C-0741 mapped to freq_cid C-0703 with dist 1.32737122577
cid C-0747 mapped to freq_cid C-0485 with dist 0.981684535421
cid C-0748 mapped to freq_cid C-0485 with dist 1.72741248951
cid C-0750 mapped to freq_cid C-0172 with dist 0.00235011024647
cid C-0761 mapped to freq_cid C-0244 with dist 2.52544238323
cid C-0778 mapped to freq_cid C-1943 with dist 0.62800180157
cid C-0782 mapped to freq_cid C-1792 with dist 3.32659270935
cid C-0787 mapped to freq_cid C-1719 with dist 0.252144252501
cid C-0792 mapped to freq_cid C-0247 with dist 0.308972882473
cid C-0799 mapped to freq_cid C-1947 with dist 0.0828248643761
cid C-0808 is frequent enough
cid C-0819 mapped to freq_cid C-1354 with dist 0.935057773262
cid C-0829 is frequent enough
cid C-0836 mapped to freq_cid C-1943 with dist 7.28298555827
cid C-0840 mapped to freq_cid C-0244 with dist 2.51434782754
cid C-0846 is frequent enough
cid C-0850 mapped to freq_cid C-0846 with dist 0.383228422836
cid C-0875 mapped to freq_cid C-1354 with dist 2.39927262174
cid C-0880 mapped to freq_cid C-2022 with dist 14.0045347034
cid C-0881 mapped to freq_cid C-2022 with dist 13.8440325724
cid C-0894 mapped to freq_cid C-1354 with dist 0.191274354606
cid C-0917 mapped to freq_cid C-1242 with dist 3.98760720174
cid C-0927 mapped to freq_cid C-1792 with dist 4.90521557476
cid C-0940 mapped to freq_cid C-1843 with dist 1.17011185889
cid C-0941 mapped to freq_cid C-1792 with dist 3.90779499488
cid C-0942 mapped to freq_cid C-1843 with dist 0.59674490912
cid C-0944 mapped to freq_cid C-1792 with dist 3.83876192881
cid C-0946 mapped to freq_cid C-1792 with dist 3.50106062178
cid C-0959 mapped to freq_cid C-1792 with dist 3.36010673362
cid C-0960 mapped to freq_cid C-0679 with dist 0.767507391156
cid C-0961 mapped to freq_cid C-1430 with dist 0.348142050164
cid C-0966 mapped to freq_cid C-1678 with dist 5.55962932813
cid C-0970 mapped to freq_cid C-1768 with dist 4.45296981529
cid C-0973 mapped to freq_cid C-0122 with dist 1.52772950974
cid C-0976 mapped to freq_cid C-1792 with dist 5.38685238317
cid C-0977 mapped to freq_cid C-1943 with dist 0.422305422404
cid C-0978 mapped to freq_cid C-1811 with dist 1.89731194378
cid C-0994 mapped to freq_cid C-0679 with dist 3.67200390196
cid C-1005 is frequent enough
cid C-1014 mapped to freq_cid C-1678 with dist 4.42625978668
cid C-1066 mapped to freq_cid C-1352 with dist 8.39401573049
cid C-1071 mapped to freq_cid C-1943 with dist 0.612852126498
cid C-1113 mapped to freq_cid C-0485 with dist 3.4624537721
cid C-1115 mapped to freq_cid C-1534 with dist 1.682830572
cid C-1145 mapped to freq_cid C-0703 with dist 1.33843999013
cid C-1203 is frequent enough
cid C-1207 mapped to freq_cid C-0829 with dist 4.10536229992
cid C-1210 mapped to freq_cid C-1893 with dist 0.0377955133626
cid C-1212 mapped to freq_cid C-2021 with dist 0.156763533542
cid C-1224 mapped to freq_cid C-0679 with dist 0.827978423417
cid C-1234 mapped to freq_cid C-1943 with dist 0.956148084111
cid C-1235 is frequent enough
cid C-1236 mapped to freq_cid C-1235 with dist 0.473696274232
cid C-1242 is frequent enough
cid C-1245 mapped to freq_cid C-1843 with dist 0.754432610883
cid C-1314 mapped to freq_cid C-1354 with dist 0.879696671679
cid C-1315 mapped to freq_cid C-1843 with dist 7.28146076305
cid C-1326 mapped to freq_cid C-1792 with dist 2.0137774412
cid C-1328 mapped to freq_cid C-1792 with dist 0.412551570614
cid C-1329 mapped to freq_cid C-1792 with dist 2.05501785503
cid C-1342 mapped to freq_cid C-0703 with dist 1.33663125369
cid C-1343 is frequent enough
cid C-1344 is frequent enough
cid C-1346 mapped to freq_cid C-0703 with dist 1.3354833753
cid C-1347 mapped to freq_cid C-0703 with dist 0.837203272902
cid C-1351 mapped to freq_cid C-1719 with dist 0.310174879753
cid C-1352 is frequent enough
cid C-1354 is frequent enough
cid C-1357 mapped to freq_cid C-1534 with dist 0.876554650991
cid C-1365 mapped to freq_cid C-1678 with dist 0.67073956572
cid C-1368 mapped to freq_cid C-1373 with dist 0.382955188318
cid C-1370 mapped to freq_cid C-1943 with dist 0.934854191869
cid C-1373 is frequent enough
cid C-1376 mapped to freq_cid C-1373 with dist 0.352688688474
cid C-1390 mapped to freq_cid C-1843 with dist 0.433686212799
cid C-1392 mapped to freq_cid C-1749 with dist 0.151312155521
cid C-1394 mapped to freq_cid C-0703 with dist 1.33701170255
cid C-1407 mapped to freq_cid C-0679 with dist 7.28192868865
cid C-1408 mapped to freq_cid C-0679 with dist 0.561818220356
cid C-1412 is frequent enough
cid C-1413 mapped to freq_cid C-1412 with dist 0.888474627159
cid C-1415 mapped to freq_cid C-1373 with dist 0.340571839707
cid C-1418 mapped to freq_cid C-1739 with dist 0.0811567751465
cid C-1427 mapped to freq_cid C-1843 with dist 1.1256003245
cid C-1430 is frequent enough
cid C-1433 mapped to freq_cid C-1235 with dist 2.0139041076
cid C-1434 mapped to freq_cid C-1430 with dist 0.221006653681
cid C-1440 mapped to freq_cid C-0808 with dist 0.874924440081
cid C-1444 mapped to freq_cid C-0568 with dist 1.29984410203
cid C-1446 mapped to freq_cid C-1893 with dist 0.495331115191
cid C-1447 mapped to freq_cid C-1235 with dist 0.389558295539
cid C-1448 mapped to freq_cid C-1235 with dist 2.04772705017
cid C-1449 is frequent enough
cid C-1450 mapped to freq_cid C-0568 with dist 0.819279452378
cid C-1451 mapped to freq_cid C-1449 with dist 0.0800962081531
cid C-1454 mapped to freq_cid C-1893 with dist 0.800171283461
cid C-1455 mapped to freq_cid C-2021 with dist 0.641401308293
cid C-1456 mapped to freq_cid C-2021 with dist 3.93269837378
cid C-1457 mapped to freq_cid C-1005 with dist 0.303226146112
cid C-1458 mapped to freq_cid C-2021 with dist 0.581315312953
cid C-1459 mapped to freq_cid C-1893 with dist 0.7637312177
cid C-1460 mapped to freq_cid C-1893 with dist 0.701003751695
cid C-1461 mapped to freq_cid C-1235 with dist 2.08052341688
cid C-1462 mapped to freq_cid C-2023 with dist 1.21495492651
cid C-1463 mapped to freq_cid C-2023 with dist 0.772358758748
cid C-1464 mapped to freq_cid C-1893 with dist 0.571031218887
cid C-1465 mapped to freq_cid C-2021 with dist 0.152187247786
cid C-1467 mapped to freq_cid C-2017 with dist 0.590605258312
cid C-1471 mapped to freq_cid C-2017 with dist 0.41389610434
cid C-1482 mapped to freq_cid C-0829 with dist 4.14087521306
cid C-1484 mapped to freq_cid C-2017 with dist 0.076659175063
cid C-1487 mapped to freq_cid C-2021 with dist 0.170480852006
cid C-1492 mapped to freq_cid C-1750 with dist 0.0610581656045
cid C-1493 mapped to freq_cid C-0122 with dist 0.824716763138
cid C-1494 mapped to freq_cid C-0829 with dist 4.42158625947
cid C-1495 mapped to freq_cid C-0829 with dist 4.42238484753
cid C-1501 mapped to freq_cid C-0734 with dist 0.0686500075191
cid C-1513 mapped to freq_cid C-1354 with dist 7.63792867362
cid C-1518 mapped to freq_cid C-0434 with dist 0.336600345695
cid C-1520 mapped to freq_cid C-1811 with dist 1.27644607262
cid C-1534 is frequent enough
cid C-1543 mapped to freq_cid C-1373 with dist 0.191722090148
cid C-1544 mapped to freq_cid C-0829 with dist 5.23372667652
cid C-1545 is frequent enough
cid C-1549 mapped to freq_cid C-1740 with dist 0.0863014648659
cid C-1551 mapped to freq_cid C-0703 with dist 2.50792674512
cid C-1552 mapped to freq_cid C-1430 with dist 0.364971679708
cid C-1553 mapped to freq_cid C-0703 with dist 2.51182688341
cid C-1555 is frequent enough
cid C-1568 mapped to freq_cid C-1534 with dist 0.431005526755
cid C-1590 mapped to freq_cid C-2023 with dist 1.17148159946
cid C-1591 mapped to freq_cid C-1235 with dist 2.04590014389
cid C-1592 mapped to freq_cid C-2017 with dist 0.42763627738
cid C-1594 mapped to freq_cid C-1235 with dist 0.374060479182
cid C-1614 is frequent enough
cid C-1615 is frequent enough
cid C-1616 mapped to freq_cid C-2017 with dist 0.411588652674
cid C-1617 mapped to freq_cid C-2023 with dist 0.806914353583
cid C-1665 mapped to freq_cid C-1843 with dist 1.16146757723
cid C-1670 is frequent enough
cid C-1673 mapped to freq_cid C-0172 with dist 0.0234576746309
cid C-1678 is frequent enough
cid C-1680 mapped to freq_cid C-0423 with dist 0.197484551544
cid C-1682 mapped to freq_cid C-1843 with dist 1.93013268892
cid C-1683 mapped to freq_cid C-1843 with dist 0.539358313452
cid C-1684 mapped to freq_cid C-1843 with dist 7.27813916886
cid C-1685 mapped to freq_cid C-1811 with dist 0.934856980549
cid C-1687 mapped to freq_cid C-1412 with dist 0.296085448576
cid C-1705 mapped to freq_cid C-1792 with dist 5.58490690407
cid C-1706 mapped to freq_cid C-0568 with dist 0.765822452452
cid C-1711 is frequent enough
cid C-1712 mapped to freq_cid C-1344 with dist 1.75689499938
cid C-1713 mapped to freq_cid C-0829 with dist 19.6423176173
cid C-1717 mapped to freq_cid C-0829 with dist 0.0881008429828
cid C-1719 is frequent enough
cid C-1720 mapped to freq_cid C-0829 with dist 4.10732628996
cid C-1721 mapped to freq_cid C-0829 with dist 3.91455184006
cid C-1734 mapped to freq_cid C-1719 with dist 0.35725497302
cid C-1736 mapped to freq_cid C-1749 with dist 0.060077222859
cid C-1739 is frequent enough
cid C-1740 is frequent enough
cid C-1746 mapped to freq_cid C-0808 with dist 0.0457666716794
cid C-1747 mapped to freq_cid C-0568 with dist 0.0514875056393
cid C-1749 is frequent enough
cid C-1750 is frequent enough
cid C-1762 mapped to freq_cid C-2023 with dist 0.704806743863
cid C-1763 mapped to freq_cid C-1881 with dist 0.0446546711356
cid C-1768 is frequent enough
cid C-1770 mapped to freq_cid C-1768 with dist 2.01435648443
cid C-1771 mapped to freq_cid C-1843 with dist 1.66127072604
cid C-1772 mapped to freq_cid C-1768 with dist 2.74093347654
cid C-1773 mapped to freq_cid C-1768 with dist 1.89054160014
cid C-1775 mapped to freq_cid C-1792 with dist 3.82204532797
cid C-1784 mapped to freq_cid C-1792 with dist 3.79792157627
cid C-1785 mapped to freq_cid C-1792 with dist 2.10667024635
cid C-1792 is frequent enough
cid C-1795 mapped to freq_cid C-2017 with dist 0.463102298993
cid C-1796 mapped to freq_cid C-1893 with dist 0.826813326764
cid C-1798 mapped to freq_cid C-1901 with dist 0.786200098033
cid C-1801 mapped to freq_cid C-1901 with dist 0.424823593483
cid C-1804 mapped to freq_cid C-1893 with dist 13.8579932551
cid C-1805 mapped to freq_cid C-1880 with dist 0.0812511375719
cid C-1806 mapped to freq_cid C-1901 with dist 0.391572754458
cid C-1807 mapped to freq_cid C-1235 with dist 4.08187855076
cid C-1811 is frequent enough
cid C-1834 mapped to freq_cid C-1354 with dist 2.40511574177
cid C-1843 is frequent enough
cid C-1849 mapped to freq_cid C-0434 with dist 0.0334818316045
cid C-1854 mapped to freq_cid C-2043 with dist 0.220410095731
cid C-1855 mapped to freq_cid C-1678 with dist 0.400656946053
cid C-1880 is frequent enough
cid C-1881 is frequent enough
cid C-1882 mapped to freq_cid C-2019 with dist 0.00572083395993
cid C-1883 mapped to freq_cid C-2019 with dist 0.141876682206
cid C-1884 mapped to freq_cid C-1880 with dist 0.0949789280315
cid C-1886 mapped to freq_cid C-1881 with dist 0.113285187817
cid C-1887 mapped to freq_cid C-1881 with dist 0.0594966731832
cid C-1890 mapped to freq_cid C-2019 with dist 0.0538317834515
cid C-1893 is frequent enough
cid C-1894 mapped to freq_cid C-1449 with dist 5.1636173867
cid C-1895 mapped to freq_cid C-1901 with dist 0.40534051986
cid C-1897 mapped to freq_cid C-1344 with dist 0.887588960897
cid C-1900 mapped to freq_cid C-0829 with dist 4.21185164783
cid C-1901 is frequent enough
cid C-1904 mapped to freq_cid C-2021 with dist 0.00478893633914
cid C-1943 is frequent enough
cid C-1946 mapped to freq_cid C-0618 with dist 1.23645232089
cid C-1947 is frequent enough
cid C-1953 mapped to freq_cid C-1670 with dist 0.388660430831
cid C-1956 is frequent enough
cid C-1958 mapped to freq_cid C-1843 with dist 1.78967136185
cid C-1983 mapped to freq_cid C-1943 with dist 1.18800424116
cid C-1986 mapped to freq_cid C-0618 with dist 0.319021784987
cid C-1996 mapped to freq_cid C-1943 with dist 0.45206746931
cid C-1997 mapped to freq_cid C-1893 with dist 0.0972541773187
cid C-1999 mapped to freq_cid C-0829 with dist 5.84079748338
cid C-2000 mapped to freq_cid C-0568 with dist 1.36733291976
cid C-2009 mapped to freq_cid C-1880 with dist 0.102975011279
cid C-2015 mapped to freq_cid C-2021 with dist 0.767808491658
cid C-2016 mapped to freq_cid C-2021 with dist 0.226553800637
cid C-2017 is frequent enough
cid C-2018 mapped to freq_cid C-2019 with dist 0.129290847494
cid C-2019 is frequent enough
cid C-2020 mapped to freq_cid C-2023 with dist 0.766056791286
cid C-2021 is frequent enough
cid C-2022 is frequent enough
cid C-2023 is frequent enough
cid C-2034 mapped to freq_cid C-0829 with dist 4.41818350159
cid C-2043 is frequent enough
cid C-2044 mapped to freq_cid C-1545 with dist 5.84409192483

In [193]:
# A good substitute: show the feature rows of this pair side by side.
cids = ['C-0122', 'C-1344']
straight.loc[straight.index.isin(cids)]


Out[193]:
bolt_pattern_long bolt_pattern_wide head_diameter overall_length thickness groove unique_feature orientation weight MJ-001 MJ-002 MJ-003 MJ-007 MJ-other CP-001 CP-002 CP-003 CP-004 CP-005 CP-006 CP-007 bolt_pattern_long_missing bolt_pattern_wide_missing head_diameter_missing overall_length_missing weight_missing
component_id
C-0122 71.77567 40.841225 44.45 19.05 8 True False False 0.080 0 0 0 0 1 0 0 0 0 0 1 0 True True False False False
C-1344 71.77567 40.841225 44.45 19.05 8 True False False 0.136 0 0 0 0 1 0 0 0 0 0 1 0 True True False False False

In [205]:
# A crappy substitute: show the feature rows of this pair side by side.
cids = ['C-2023', 'C-2019']
straight.loc[straight.index.isin(cids)]


Out[205]:
bolt_pattern_long bolt_pattern_wide head_diameter overall_length thickness groove unique_feature orientation weight MJ-001 MJ-002 MJ-003 MJ-007 MJ-other CP-001 CP-002 CP-003 CP-004 CP-005 CP-006 CP-007 bolt_pattern_long_missing bolt_pattern_wide_missing head_diameter_missing overall_length_missing weight_missing
component_id
C-2019 69.9 35.7 58.414 27.906098 36.5 False False True 1.440 0 0 1 0 0 0 1 0 0 0 0 0 False False True True False
C-2023 88.9 50.8 58.414 27.906098 36.5 True False True 1.621 0 0 1 0 0 0 1 0 0 0 0 0 False False True True False

In [204]:
# Feature rows of every component listed in freq_cids.
straight.loc[straight.index.isin(freq_cids)]


Out[204]:
bolt_pattern_long bolt_pattern_wide head_diameter overall_length thickness groove unique_feature orientation weight MJ-001 MJ-002 MJ-003 MJ-007 MJ-other CP-001 CP-002 CP-003 CP-004 CP-005 CP-006 CP-007 bolt_pattern_long_missing bolt_pattern_wide_missing head_diameter_missing overall_length_missing weight_missing
component_id
C-0122 71.77567 40.841225 44.450 19.050000 8.00 True False False 0.080 0 0 0 0 1 0 0 0 0 0 1 0 True True False False False
C-0170 52.38000 40.841225 58.414 27.906098 16.00 False False True 0.090 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-0244 152.40000 92.100000 58.414 27.906098 19.05 True False True 2.127 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-0250 130.20000 77.800000 58.414 27.906098 28.00 False False True 2.460 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-0275 120.70000 69.900000 58.414 27.906098 36.50 False False True 3.268 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-0434 57.15000 40.841225 58.414 27.906098 9.72 False False True 0.191 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-0618 106.40000 61.900000 58.414 27.906098 15.90 True False True 0.955 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-0679 52.37000 26.190000 58.414 27.906098 32.00 False False True 0.474 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-0703 71.77567 40.841225 41.280 27.906098 14.22 True False False 0.150 0 0 0 0 1 0 0 0 0 0 1 0 True True False True False
C-0808 88.90000 50.800000 58.414 27.906098 28.00 False False True 1.545 1 0 0 0 0 0 1 0 0 0 0 0 False False True True False
C-1203 130.20000 77.800000 58.414 27.906098 22.35 True False True 1.835 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-1344 71.77567 40.841225 44.450 19.050000 8.00 True False False 0.136 0 0 0 0 1 0 0 0 0 0 1 0 True True False False False
C-1352 79.38000 40.841225 58.414 27.906098 9.53 False False True 2.560 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-1354 79.38000 40.841225 58.414 27.906098 9.27 False False True 0.200 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-1373 52.37000 40.841225 58.414 27.906098 12.00 True False True 0.118 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-1430 47.60000 22.200000 58.414 27.906098 15.70 False False True 0.340 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-1449 71.77567 40.841225 101.600 27.906098 9.52 True False False 0.420 1 0 0 0 0 0 0 0 0 0 0 1 True True False True False
C-1545 71.77567 40.841225 84.120 17.500000 9.53 True False False 0.244 0 0 0 0 1 0 0 0 0 0 1 0 True True False False False
C-1555 52.37000 40.841225 58.414 27.906098 10.00 False False True 0.090 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-1614 130.20000 77.800000 58.414 27.906098 24.38 True False True 2.350 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-1615 120.70000 69.900000 58.414 27.906098 25.14 True False True 2.183 0 0 0 0 1 0 0 1 0 0 0 0 False False True True False
C-1670 56.00000 40.841225 58.414 27.906098 8.00 False False True 0.079 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-1739 52.30000 40.841225 58.414 27.906098 10.00 False False True 0.107 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-1750 50.80000 40.841225 58.414 27.906098 8.00 False False True 0.067 0 0 0 0 1 0 0 0 1 0 0 0 False True True True False
C-2019 69.90000 35.700000 58.414 27.906098 36.50 False False True 1.440 0 0 1 0 0 0 1 0 0 0 0 0 False False True True False
C-2022 106.38000 61.930000 58.414 27.906098 36.50 False False True 2.748 0 0 1 0 0 0 1 0 0 0 0 0 False False True True False

In [219]:
# Histogram of each component's distance to its assigned frequent substitute.
dists = list(cid_to_dist.values())
hist(dists, bins=20);



In [220]:
# Write the component -> frequent-substitute mapping as a two-column CSV
# (component_id, cluster). This is the same path the cluster-label cell writes
# to, so whichever of the two runs last determines the file's contents.
df = (
    pd.Series(cid_to_freq_cid, name='cluster')
    .reset_index()
    .rename(columns={'index': 'component_id'})
)
df.to_csv("clu_straight.csv", index=False)

In [ ]: