In [15]:
# Populate the interactive namespace from numpy and matplotlib and draw figures inline.
# Later cells rely on this for the bare names `np` and `hist`, which are not imported explicitly.
%pylab inline

from collections import defaultdict
from scipy.spatial.distance import euclidean
from scipy.cluster import vq
from sklearn.preprocessing import LabelBinarizer
import pandas as pd

# Project-local helpers for loading the component tables.
from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components

# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)


Populating the interactive namespace from numpy and matplotlib

In [5]:
# Load the raw component tables and work with the 'elbow' group only.
comp_types, group_dfs, cluster_dfs = load_raw_components()
group_df = group_dfs['elbow']

# Recode the flag columns as booleans: True where the raw value is 'Yes',
# False for anything else.
yes_flag_cols = ('groove', 'unique_feature', 'orientation')
for flag_col in yes_flag_cols:
    is_yes = group_df[flag_col] == 'Yes'
    group_df[flag_col] = is_yes
group_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 16 columns):
component_id          178 non-null object
component_type_id     178 non-null object
bolt_pattern_long     171 non-null float64
bolt_pattern_wide     138 non-null float64
extension_length      170 non-null float64
overall_length        175 non-null float64
thickness             171 non-null float64
drop_length           171 non-null float64
elbow_angle           130 non-null float64
mj_class_code         41 non-null object
mj_plug_class_code    40 non-null object
plug_diameter         7 non-null float64
groove                178 non-null bool
unique_feature        178 non-null bool
orientation           178 non-null bool
weight                176 non-null float64
dtypes: bool(3), float64(9), object(4)
memory usage: 20.0+ KB

In [6]:
# Convert mj_class_code to binary features.
# Missing codes are grouped under the explicit label 'MJ-other' so that they
# end up with an indicator column of their own.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the attribute-accessed column operates on an intermediate Series and is
# not guaranteed to write through to group_df.
group_df['mj_class_code'] = group_df['mj_class_code'].fillna('MJ-other')
print(group_df.mj_class_code.value_counts())
lb = LabelBinarizer()
bin_feats = lb.fit_transform(group_df.mj_class_code.values)
print(lb.classes_)
# Give the indicator frame group_df's own index so that join() pairs rows by
# position; a default 0..n-1 index would only line up while group_df happens
# to be indexed that way.
tmp_df = pd.DataFrame(bin_feats, columns=lb.classes_, index=group_df.index)
# Replace the original string column with its indicator columns.
group_df = group_df.drop('mj_class_code', axis=1).join(tmp_df)
group_df.info()


MJ-other    137
MJ-003       39
MJ-007        1
MJ-004        1
dtype: int64
['MJ-003' 'MJ-004' 'MJ-007' 'MJ-other']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 19 columns):
component_id          178 non-null object
component_type_id     178 non-null object
bolt_pattern_long     171 non-null float64
bolt_pattern_wide     138 non-null float64
extension_length      170 non-null float64
overall_length        175 non-null float64
thickness             171 non-null float64
drop_length           171 non-null float64
elbow_angle           130 non-null float64
mj_plug_class_code    40 non-null object
plug_diameter         7 non-null float64
groove                178 non-null bool
unique_feature        178 non-null bool
orientation           178 non-null bool
weight                176 non-null float64
MJ-003                178 non-null int64
MJ-004                178 non-null int64
MJ-007                178 non-null int64
MJ-other              178 non-null int64
dtypes: bool(3), float64(9), int64(4), object(3)
memory usage: 24.2+ KB

In [8]:
# Convert mj_plug_class_code to binary features.
# Missing codes are grouped under the explicit label 'MJP-other' so that they
# end up with an indicator column of their own.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the attribute-accessed column operates on an intermediate Series and is
# not guaranteed to write through to group_df.
group_df['mj_plug_class_code'] = group_df['mj_plug_class_code'].fillna('MJP-other')
print(group_df.mj_plug_class_code.value_counts())
lb = LabelBinarizer()
bin_feats = lb.fit_transform(group_df.mj_plug_class_code.values)
print(lb.classes_)
# Give the indicator frame group_df's own index so that join() pairs rows by
# position; a default 0..n-1 index would only line up while group_df happens
# to be indexed that way.
tmp_df = pd.DataFrame(bin_feats, columns=lb.classes_, index=group_df.index)
# Replace the original string column with its indicator columns.
group_df = group_df.drop('mj_plug_class_code', axis=1).join(tmp_df)
group_df.info()


MJP-other    138
MJ-005        23
Threaded       9
MJ-006         8
dtype: int64
['MJ-005' 'MJ-006' 'MJP-other' 'Threaded']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 22 columns):
component_id         178 non-null object
component_type_id    178 non-null object
bolt_pattern_long    171 non-null float64
bolt_pattern_wide    138 non-null float64
extension_length     170 non-null float64
overall_length       175 non-null float64
thickness            171 non-null float64
drop_length          171 non-null float64
elbow_angle          130 non-null float64
plug_diameter        7 non-null float64
groove               178 non-null bool
unique_feature       178 non-null bool
orientation          178 non-null bool
weight               176 non-null float64
MJ-003               178 non-null int64
MJ-004               178 non-null int64
MJ-007               178 non-null int64
MJ-other             178 non-null int64
MJ-005               178 non-null int64
MJ-006               178 non-null int64
MJP-other            178 non-null int64
Threaded             178 non-null int64
dtypes: bool(3), float64(9), int64(8), object(2)
memory usage: 28.3+ KB

In [9]:
# Convert component_type_id to binary features.
print(group_df.component_type_id.value_counts())
lb = LabelBinarizer()
bin_feats = lb.fit_transform(group_df.component_type_id.values)
print(lb.classes_)
# Give the indicator frame group_df's own index so that join() pairs rows by
# position; a default 0..n-1 index would only line up while group_df happens
# to be indexed that way.
tmp_df = pd.DataFrame(bin_feats, columns=lb.classes_, index=group_df.index)
# Replace the original string column with its indicator columns.
group_df = group_df.drop('component_type_id', axis=1).join(tmp_df)
group_df.info()


CP-008    70
CP-011    41
CP-010    33
CP-009    27
CP-012     7
dtype: int64
['CP-008' 'CP-009' 'CP-010' 'CP-011' 'CP-012']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 26 columns):
component_id         178 non-null object
bolt_pattern_long    171 non-null float64
bolt_pattern_wide    138 non-null float64
extension_length     170 non-null float64
overall_length       175 non-null float64
thickness            171 non-null float64
drop_length          171 non-null float64
elbow_angle          130 non-null float64
plug_diameter        7 non-null float64
groove               178 non-null bool
unique_feature       178 non-null bool
orientation          178 non-null bool
weight               176 non-null float64
MJ-003               178 non-null int64
MJ-004               178 non-null int64
MJ-007               178 non-null int64
MJ-other             178 non-null int64
MJ-005               178 non-null int64
MJ-006               178 non-null int64
MJP-other            178 non-null int64
Threaded             178 non-null int64
CP-008               178 non-null int64
CP-009               178 non-null int64
CP-010               178 non-null int64
CP-011               178 non-null int64
CP-012               178 non-null int64
dtypes: bool(3), float64(9), int64(13), object(1)
memory usage: 33.9+ KB

In [11]:
# Add features to capture whether a value is missing, and fill missing values with average.
for feat in (
        'bolt_pattern_long', 'bolt_pattern_wide', 'extension_length', 'overall_length',
        'thickness', 'drop_length', 'elbow_angle', 'plug_diameter', 'weight'):
    col = group_df[feat]
    # Record missingness before imputing; afterwards the information is gone.
    group_df[feat + '_missing'] = col.isnull()
    # Assign the filled column back explicitly: fillna(inplace=True) on
    # group_df[feat] acts on an intermediate Series and is not guaranteed to
    # update group_df.  The mean ignores missing entries; a column with no
    # values at all would stay entirely NaN.
    group_df[feat] = col.fillna(col.mean())
group_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 177
Data columns (total 35 columns):
component_id                 178 non-null object
bolt_pattern_long            178 non-null float64
bolt_pattern_wide            178 non-null float64
extension_length             178 non-null float64
overall_length               178 non-null float64
thickness                    178 non-null float64
drop_length                  178 non-null float64
elbow_angle                  178 non-null float64
plug_diameter                178 non-null float64
groove                       178 non-null bool
unique_feature               178 non-null bool
orientation                  178 non-null bool
weight                       178 non-null float64
MJ-003                       178 non-null int64
MJ-004                       178 non-null int64
MJ-007                       178 non-null int64
MJ-other                     178 non-null int64
MJ-005                       178 non-null int64
MJ-006                       178 non-null int64
MJP-other                    178 non-null int64
Threaded                     178 non-null int64
CP-008                       178 non-null int64
CP-009                       178 non-null int64
CP-010                       178 non-null int64
CP-011                       178 non-null int64
CP-012                       178 non-null int64
bolt_pattern_long_missing    178 non-null bool
bolt_pattern_wide_missing    178 non-null bool
extension_length_missing     178 non-null bool
overall_length_missing       178 non-null bool
thickness_missing            178 non-null bool
drop_length_missing          178 non-null bool
elbow_angle_missing          178 non-null bool
plug_diameter_missing        178 non-null bool
weight_missing               178 non-null bool
dtypes: bool(12), float64(9), int64(13), object(1)
memory usage: 35.5+ KB

In [12]:
# Get list of vectors to work with.
# Move component_id into the index so that only feature columns are left.
group_df.set_index('component_id', drop=True, inplace=True)
# Cast every remaining column to float to obtain one numeric matrix.
# The builtin float replaces np.float, which was only an alias for it and has
# been removed from newer NumPy releases.
group_np = group_df.astype(float).values
group_np.shape


Out[12]:
(178, 34)

In [16]:
# Approach 4: map infrequent components to more frequent substitutes.
#
# Every component is assigned to the closest "frequent" component, where
# closeness is the Euclidean distance between whitened feature vectors.
# Frequent components are mapped to themselves with distance 0.0.

# Component ids that are treated as frequent.
# NOTE(review): presumably selected from usage counts computed elsewhere -- confirm.
freq_cids = ['C-1565', 'C-1208', 'C-1428', 'C-1345', 'C-1317', 'C-0401',
       'C-1587', 'C-1935', 'C-1404', 'C-1349', 'C-1585', 'C-1527',
       'C-1586', 'C-1783', 'C-1598', 'C-0063', 'C-1405', 'C-0598',
       'C-0616', 'C-1432', 'C-1500', 'C-1006']
freq_cid_set = set(freq_cids)  # constant-time membership test in the main loop

all_cids = list(group_df.index)

# Row position of each component id; setdefault keeps the first occurrence,
# which is the row list.index() would have returned.
cid_to_row = {}
for row_idx, cid in enumerate(all_cids):
    cid_to_row.setdefault(cid, row_idx)

# Whitening divides each feature column by its standard deviation, so features
# on large scales do not dominate the distance.
group_np_wh = vq.whiten(group_np)

# Whitened feature vector of every frequent component, in freq_cids order.
freq_cid_vecs = []
for cid in freq_cids:
    # list.index() raises instead of returning -1, so the former
    # `assert row_idx != -1` could never fire; check membership explicitly.
    if cid not in cid_to_row:
        raise ValueError("frequent cid {} is not in group_df".format(cid))
    freq_cid_vecs.append(group_np_wh[cid_to_row[cid], :])

cid_to_freq_cid = {}
cid_to_dist = {}
for row_idx, cid in enumerate(all_cids):
    if cid in freq_cid_set:
        cid_to_freq_cid[cid] = cid
        cid_to_dist[cid] = 0.0
        print("cid {} is frequent enough".format(cid))
    else:
        # The component's own vector does not change inside the search loop.
        row = group_np_wh[row_idx, :]
        best_i = None
        best_dist = np.inf
        # Strict '<' keeps the earliest frequent component on ties.
        for i, freq_vec in enumerate(freq_cid_vecs):
            dist = euclidean(row, freq_vec)
            if dist < best_dist:
                best_dist = dist
                best_i = i
        if best_i is None:
            # No distance compared below infinity (for example all NaN).
            raise ValueError("no usable distance found for cid {}".format(cid))
        cid_to_freq_cid[cid] = freq_cids[best_i]
        cid_to_dist[cid] = best_dist
        print("cid {} mapped to freq_cid {} with dist {}".format(cid, freq_cids[best_i], best_dist))


cid C-0013 mapped to freq_cid C-1500 with dist 6.75609910629
cid C-0016 mapped to freq_cid C-1783 with dist 2.07053817998
cid C-0017 mapped to freq_cid C-1783 with dist 0.0931874685552
cid C-0018 mapped to freq_cid C-1783 with dist 1.04356459808
cid C-0021 mapped to freq_cid C-1565 with dist 2.32422795501
cid C-0022 mapped to freq_cid C-0063 with dist 2.05924503405
cid C-0023 mapped to freq_cid C-1349 with dist 5.06544101155
cid C-0028 mapped to freq_cid C-1006 with dist 0.818683276634
cid C-0042 mapped to freq_cid C-1317 with dist 1.18616027948
cid C-0061 mapped to freq_cid C-0063 with dist 1.76151744441
cid C-0063 is frequent enough
cid C-0065 mapped to freq_cid C-1783 with dist 1.24631991514
cid C-0066 mapped to freq_cid C-1783 with dist 1.16068886932
cid C-0075 mapped to freq_cid C-1006 with dist 5.74073844166
cid C-0076 mapped to freq_cid C-1006 with dist 5.57271354837
cid C-0090 mapped to freq_cid C-1598 with dist 5.4514728899
cid C-0094 mapped to freq_cid C-0063 with dist 1.55791227371
cid C-0104 mapped to freq_cid C-1404 with dist 0.488917175285
cid C-0105 mapped to freq_cid C-1500 with dist 4.01184606512
cid C-0106 mapped to freq_cid C-1500 with dist 4.50685856506
cid C-0107 mapped to freq_cid C-1428 with dist 0.601937266602
cid C-0140 mapped to freq_cid C-0063 with dist 2.32049468966
cid C-0143 mapped to freq_cid C-1500 with dist 8.72461461052
cid C-0152 mapped to freq_cid C-1500 with dist 3.90508394773
cid C-0160 mapped to freq_cid C-1783 with dist 0.835932572342
cid C-0163 mapped to freq_cid C-1006 with dist 1.22924339081
cid C-0169 mapped to freq_cid C-1500 with dist 5.5224926295
cid C-0175 mapped to freq_cid C-1598 with dist 5.50062202035
cid C-0187 mapped to freq_cid C-1404 with dist 1.2712579319
cid C-0188 mapped to freq_cid C-1935 with dist 4.52498247961
cid C-0192 mapped to freq_cid C-1405 with dist 0.711465837768
cid C-0207 mapped to freq_cid C-1006 with dist 5.6351153581
cid C-0238 mapped to freq_cid C-1006 with dist 1.83160872325
cid C-0254 mapped to freq_cid C-0063 with dist 2.06812611828
cid C-0259 mapped to freq_cid C-1935 with dist 3.25077130601
cid C-0266 mapped to freq_cid C-1935 with dist 2.7546021717
cid C-0273 mapped to freq_cid C-1404 with dist 0.00999778934329
cid C-0284 mapped to freq_cid C-1006 with dist 0.414429238407
cid C-0285 mapped to freq_cid C-1006 with dist 7.10645103995
cid C-0296 mapped to freq_cid C-0063 with dist 2.83000278537
cid C-0303 mapped to freq_cid C-1432 with dist 2.81686614179
cid C-0320 mapped to freq_cid C-1587 with dist 10.6496633968
cid C-0331 mapped to freq_cid C-1935 with dist 2.60804361193
cid C-0367 mapped to freq_cid C-1783 with dist 0.614477775465
cid C-0394 mapped to freq_cid C-1935 with dist 1.45104111924
cid C-0401 is frequent enough
cid C-0407 mapped to freq_cid C-0401 with dist 2.62470839908
cid C-0447 mapped to freq_cid C-1565 with dist 0.971818969637
cid C-0464 mapped to freq_cid C-1598 with dist 5.68902358466
cid C-0501 mapped to freq_cid C-1935 with dist 1.54103695704
cid C-0516 mapped to freq_cid C-1783 with dist 0.487807349147
cid C-0526 mapped to freq_cid C-0598 with dist 2.62065772822
cid C-0537 mapped to freq_cid C-0063 with dist 17.5082344997
cid C-0565 mapped to freq_cid C-1783 with dist 0.619179253354
cid C-0566 mapped to freq_cid C-1598 with dist 5.4947011932
cid C-0567 mapped to freq_cid C-1598 with dist 0.696920502128
cid C-0585 mapped to freq_cid C-1428 with dist 0.488535278981
cid C-0598 is frequent enough
cid C-0608 mapped to freq_cid C-1405 with dist 1.64670301825
cid C-0615 mapped to freq_cid C-1404 with dist 0.830892482127
cid C-0616 is frequent enough
cid C-0625 mapped to freq_cid C-1405 with dist 0.779513351152
cid C-0646 mapped to freq_cid C-1500 with dist 4.90362764787
cid C-0688 mapped to freq_cid C-1783 with dist 2.49248193104
cid C-0789 mapped to freq_cid C-0598 with dist 5.67945836488
cid C-0802 mapped to freq_cid C-1404 with dist 3.19248912666
cid C-0820 mapped to freq_cid C-1935 with dist 1.04548254342
cid C-0821 mapped to freq_cid C-1405 with dist 1.76756960494
cid C-0822 mapped to freq_cid C-1428 with dist 0.499534588849
cid C-0839 mapped to freq_cid C-1349 with dist 0.679450463921
cid C-0905 mapped to freq_cid C-0616 with dist 0.394134784155
cid C-0910 mapped to freq_cid C-1783 with dist 3.4625002249
cid C-0931 mapped to freq_cid C-1935 with dist 3.71844575048
cid C-0943 mapped to freq_cid C-0616 with dist 1.11002051031
cid C-0956 mapped to freq_cid C-0616 with dist 0.853766938577
cid C-0963 mapped to freq_cid C-0616 with dist 1.1703479839
cid C-0972 mapped to freq_cid C-1783 with dist 2.14787713026
cid C-0999 mapped to freq_cid C-1405 with dist 0.560788428776
cid C-1004 mapped to freq_cid C-1527 with dist 3.25456692933
cid C-1006 is frequent enough
cid C-1024 mapped to freq_cid C-1783 with dist 3.21959168561
cid C-1069 mapped to freq_cid C-1935 with dist 3.15625994827
cid C-1208 is frequent enough
cid C-1239 mapped to freq_cid C-1345 with dist 3.88966270958
cid C-1247 mapped to freq_cid C-0598 with dist 2.89334927889
cid C-1255 mapped to freq_cid C-1405 with dist 1.67712550531
cid C-1317 is frequent enough
cid C-1327 mapped to freq_cid C-1783 with dist 0.0481147691223
cid C-1331 mapped to freq_cid C-1783 with dist 2.30260804682
cid C-1333 mapped to freq_cid C-1783 with dist 2.05817348012
cid C-1334 mapped to freq_cid C-1783 with dist 0.795613876913
cid C-1335 mapped to freq_cid C-1783 with dist 2.61350453707
cid C-1336 mapped to freq_cid C-1783 with dist 2.31339416057
cid C-1337 mapped to freq_cid C-1783 with dist 0.779102228028
cid C-1339 mapped to freq_cid C-1317 with dist 0.553633524147
cid C-1345 is frequent enough
cid C-1348 mapped to freq_cid C-0616 with dist 7.79932509452
cid C-1349 is frequent enough
cid C-1350 mapped to freq_cid C-1428 with dist 0.92749581593
cid C-1362 mapped to freq_cid C-1565 with dist 10.6073825735
cid C-1363 mapped to freq_cid C-1565 with dist 9.84836586321
cid C-1378 mapped to freq_cid C-0063 with dist 2.67739481477
cid C-1389 mapped to freq_cid C-1349 with dist 0.931743272795
cid C-1404 is frequent enough
cid C-1405 is frequent enough
cid C-1409 mapped to freq_cid C-1404 with dist 0.473145969794
cid C-1419 mapped to freq_cid C-1405 with dist 1.49965028964
cid C-1428 is frequent enough
cid C-1429 mapped to freq_cid C-1349 with dist 0.58948078256
cid C-1432 is frequent enough
cid C-1488 mapped to freq_cid C-1598 with dist 5.49841786662
cid C-1500 is frequent enough
cid C-1514 mapped to freq_cid C-0063 with dist 2.66753565223
cid C-1527 is frequent enough
cid C-1538 mapped to freq_cid C-1345 with dist 2.57188523612
cid C-1539 mapped to freq_cid C-1317 with dist 0.752273812014
cid C-1542 mapped to freq_cid C-1428 with dist 0.791710302826
cid C-1565 is frequent enough
cid C-1566 mapped to freq_cid C-1527 with dist 0.80578404565
cid C-1575 mapped to freq_cid C-0598 with dist 0.805822925832
cid C-1576 mapped to freq_cid C-0616 with dist 2.45308703169
cid C-1585 is frequent enough
cid C-1586 is frequent enough
cid C-1587 is frequent enough
cid C-1588 mapped to freq_cid C-1587 with dist 3.3704389291
cid C-1589 mapped to freq_cid C-1587 with dist 6.16728153743
cid C-1598 is frequent enough
cid C-1599 mapped to freq_cid C-1598 with dist 0.239918811626
cid C-1600 mapped to freq_cid C-1598 with dist 0.429773640024
cid C-1601 mapped to freq_cid C-1006 with dist 1.22792362417
cid C-1602 mapped to freq_cid C-1006 with dist 0.812079155889
cid C-1603 mapped to freq_cid C-1598 with dist 0.717130675327
cid C-1604 mapped to freq_cid C-1598 with dist 0.703670502962
cid C-1605 mapped to freq_cid C-1598 with dist 0.589035432652
cid C-1606 mapped to freq_cid C-1598 with dist 1.09807679269
cid C-1607 mapped to freq_cid C-1598 with dist 1.72857359693
cid C-1608 mapped to freq_cid C-1006 with dist 0.136285654732
cid C-1609 mapped to freq_cid C-1598 with dist 1.7864143435
cid C-1610 mapped to freq_cid C-1598 with dist 1.74907754609
cid C-1611 mapped to freq_cid C-1006 with dist 15.0277555755
cid C-1612 mapped to freq_cid C-1006 with dist 6.38905686232
cid C-1613 mapped to freq_cid C-1006 with dist 2.75402958459
cid C-1636 mapped to freq_cid C-1598 with dist 2.50142561527
cid C-1681 mapped to freq_cid C-0063 with dist 1.76217086689
cid C-1686 mapped to freq_cid C-1935 with dist 7.98576878308
cid C-1726 mapped to freq_cid C-0616 with dist 3.35562537173
cid C-1733 mapped to freq_cid C-1345 with dist 1.84698739868
cid C-1744 mapped to freq_cid C-0616 with dist 0.455055821634
cid C-1759 mapped to freq_cid C-1006 with dist 1.48591910298
cid C-1769 mapped to freq_cid C-1783 with dist 0.970097840573
cid C-1774 mapped to freq_cid C-1783 with dist 3.12430921964
cid C-1782 mapped to freq_cid C-1783 with dist 0.791889134554
cid C-1783 is frequent enough
cid C-1786 mapped to freq_cid C-1783 with dist 1.05185075747
cid C-1788 mapped to freq_cid C-1783 with dist 0.623318235075
cid C-1803 mapped to freq_cid C-1006 with dist 6.41432713041
cid C-1816 mapped to freq_cid C-1349 with dist 0.744759379861
cid C-1819 mapped to freq_cid C-0598 with dist 1.71757053129
cid C-1820 mapped to freq_cid C-0616 with dist 0.308937461366
cid C-1821 mapped to freq_cid C-0616 with dist 8.12020328319
cid C-1826 mapped to freq_cid C-0616 with dist 0.553029597794
cid C-1829 mapped to freq_cid C-0401 with dist 14.158279247
cid C-1842 mapped to freq_cid C-1404 with dist 1.06645387479
cid C-1853 mapped to freq_cid C-1565 with dist 1.89210883215
cid C-1863 mapped to freq_cid C-0598 with dist 2.29378124057
cid C-1872 mapped to freq_cid C-1587 with dist 8.96921363592
cid C-1896 mapped to freq_cid C-0063 with dist 2.08277876594
cid C-1899 mapped to freq_cid C-1006 with dist 7.4757819651
cid C-1905 mapped to freq_cid C-1006 with dist 6.67305564556
cid C-1931 mapped to freq_cid C-1432 with dist 2.03139116005
cid C-1935 is frequent enough
cid C-1948 mapped to freq_cid C-0401 with dist 4.86137968317
cid C-1951 mapped to freq_cid C-1345 with dist 2.09350326856
cid C-1989 mapped to freq_cid C-1783 with dist 2.28951614787
cid C-1992 mapped to freq_cid C-0616 with dist 0.690302011105
cid C-2010 mapped to freq_cid C-1598 with dist 2.5749436388
cid C-2024 mapped to freq_cid C-1598 with dist 7.71455872802
cid C-2031 mapped to freq_cid C-1006 with dist 6.78851356227

In [17]:
# Show the feature rows of one mapped pair side by side.
cids = ['C-1992', 'C-0616']
in_pair = group_df.index.isin(cids)
group_df[in_pair]


Out[17]:
bolt_pattern_long bolt_pattern_wide extension_length overall_length thickness drop_length elbow_angle plug_diameter groove unique_feature orientation weight MJ-003 MJ-004 MJ-007 MJ-other MJ-005 MJ-006 MJP-other Threaded CP-008 CP-009 CP-010 CP-011 CP-012 bolt_pattern_long_missing bolt_pattern_wide_missing extension_length_missing overall_length_missing thickness_missing drop_length_missing elbow_angle_missing plug_diameter_missing weight_missing
component_id
C-0616 52.37 26.19 42.10 72.10 38 27 90 50.142857 True False True 0.699 0 0 0 1 0 0 1 0 1 0 0 0 0 False False False False False False False True False
C-1992 47.63 22.23 46.89 71.89 30 23 90 50.142857 True False True 1.100 0 0 0 1 0 0 1 0 1 0 0 0 0 False False False False False False False True False

In [18]:
# Histogram of the distance from every component to the frequent component it
# was assigned to (frequent components contribute 0.0).
# list() materialises the values so hist() receives a plain sequence even when
# dict.values() is a view.
dists = list(cid_to_dist.values())
hist(dists, bins=20)
# xlabel / ylabel / title come from the namespace populated by %pylab.
xlabel('distance to assigned frequent component (whitened features)')
ylabel('number of components')
title('Elbow components: distance to substitute');



In [19]:
# Write the component_id -> cluster (assigned frequent component id) mapping
# to clu_elbow.csv as a two-column table.
cluster_series = pd.Series(cid_to_freq_cid, name='cluster')
cluster_series.index.name = 'component_id'
df = cluster_series.reset_index()
df.to_csv("clu_elbow.csv", index=False)

In [ ]: