In [1]:
%pylab inline
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Load the tube table; the path is relative to the notebook's working directory.
tube = pd.read_csv('./competition_data/tube.csv')

In [3]:
# Inspect column dtypes. The parenthesised form prints identically on
# Python 2 and is also valid on Python 3.
print(tube.dtypes)


tube_assembly_id     object
material_id          object
diameter            float64
wall                float64
length              float64
num_bends             int64
bend_radius         float64
end_a_1x             object
end_a_2x             object
end_x_1x             object
end_x_2x             object
end_a                object
end_x                object
num_boss              int64
num_bracket           int64
other                 int64
dtype: object

In [4]:
# Partition the tube table for material_id imputation:
#   * tube_train -- rows with no missing value in any column (used to fit)
#   * tube_test  -- rows with at least one missing value (to be predicted)
tube_isnull = tube.isnull()
tube_isnull_row = tube_isnull.any(axis=1)  # True where any column is null

# NOTE(review): the mask flags a null in *any* column, not only material_id.
# The split assumes material_id is the only column with nulls -- TODO confirm.

# Numeric columns used as features by the classifier.
tube_feature_columns = ['diameter', 'wall', 'length', 'num_bends',
                        'bend_radius', 'num_boss', 'num_bracket', 'other']

# `~` is the boolean-inversion operator. Unary `-` on boolean data is
# rejected by NumPy and has raised TypeError in some pandas releases.
tube_train = tube[~tube_isnull_row]
tube_train_X = tube_train[tube_feature_columns]
tube_train_y = tube_train['material_id']

tube_test = tube[tube_isnull_row]
tube_test_X = tube_test[tube_feature_columns]
tube_test_y = tube_test['material_id']

In [5]:
# tube_test_y

In [6]:
# Row/column counts of the two partitions. Parenthesised so the cell runs
# on both Python 2 and Python 3 with the same output.
print(tube_train.shape)
print(tube_test.shape)


(20919, 16)
(279, 16)

In [7]:
# Five-nearest-neighbour classifier; n_neighbors is spelled out to match
# the name (5 is also the value the estimator reports by default).
knn5 = KNeighborsClassifier(n_neighbors=5)

In [8]:
# Fit on the complete rows. knn_fit is the fitted estimator used below for
# get_params() and predict(); scikit-learn's fit() returns the estimator
# itself, so this presumably aliases knn5.
knn_fit = knn5.fit(tube_train_X, tube_train_y)

In [9]:
# Display the fitted classifier's hyperparameters.
knn_fit.get_params()


Out[9]:
{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [10]:
# Predict a material_id for every row of the held-out feature matrix.
material_pred = knn_fit.predict(tube_test_X)
# Parenthesised print gives the same output on Python 2 and is valid on
# Python 3.
print(material_pred)


['SP-0029' 'SP-0029' 'SP-0029' 'SP-0019' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0037' 'SP-0035' 'SP-0035' 'SP-0035' 'SP-0035'
 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0028' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0008' 'SP-0029'
 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0035'
 'SP-0029' 'SP-0035' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0035'
 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0008'
 'SP-0029' 'SP-0029' 'SP-0044' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0035' 'SP-0029' 'SP-0035' 'SP-0035'
 'SP-0035' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0019'
 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0035'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0035'
 'SP-0029' 'SP-0035' 'SP-0035' 'SP-0035' 'SP-0035' 'SP-0029' 'SP-0035'
 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0035' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0046' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0046' 'SP-0029' 'SP-0029' 'SP-0048' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0048' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0046' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0048'
 'SP-0029' 'SP-0029' 'SP-0029' 'SP-0037' 'SP-0035' 'SP-0035']

In [11]:
# Write the predictions into material_id for the rows flagged by the mask.
# .loc replaces the deprecated (and since removed) .ix indexer; with a
# boolean row mask and a column label the selection is the same.
# Note: this modifies `tube` in place, so re-running earlier cells that
# derive the mask from `tube` will see the filled values.
tube.loc[tube_isnull_row, 'material_id'] = material_pred

In [12]:
# Save the table with the filled material_id; index=False leaves the row
# index out of the CSV.
tube.to_csv('tube_material_id_imputed.csv', index=False)

In [16]:
# Column dtypes of the table before one-hot encoding.
tube.dtypes


Out[16]:
tube_assembly_id     object
material_id          object
diameter            float64
wall                float64
length              float64
num_bends             int64
bend_radius         float64
end_a_1x             object
end_a_2x             object
end_x_1x             object
end_x_2x             object
end_a                object
end_x                object
num_boss              int64
num_bracket           int64
other                 int64
dtype: object

In [17]:
# One-hot encode the categorical columns; every other column passes
# through unchanged.
tube_material_id_imputed_dummies = pd.get_dummies(
    tube,
    columns=[
        'material_id',
        'end_a_1x', 'end_a_2x',
        'end_x_1x', 'end_x_2x',
        'end_a', 'end_x',
    ]
)

In [22]:
# Map each column name to its dtype to inspect the columns that
# get_dummies produced.
tube_material_id_imputed_dummies.dtypes.to_dict()


Out[22]:
{'bend_radius': dtype('float64'),
 'diameter': dtype('float64'),
 'end_a_1x_N': dtype('float64'),
 'end_a_1x_Y': dtype('float64'),
 'end_a_2x_N': dtype('float64'),
 'end_a_2x_Y': dtype('float64'),
 'end_a_EF-001': dtype('float64'),
 'end_a_EF-002': dtype('float64'),
 'end_a_EF-003': dtype('float64'),
 'end_a_EF-004': dtype('float64'),
 'end_a_EF-005': dtype('float64'),
 'end_a_EF-006': dtype('float64'),
 'end_a_EF-007': dtype('float64'),
 'end_a_EF-008': dtype('float64'),
 'end_a_EF-009': dtype('float64'),
 'end_a_EF-010': dtype('float64'),
 'end_a_EF-011': dtype('float64'),
 'end_a_EF-012': dtype('float64'),
 'end_a_EF-013': dtype('float64'),
 'end_a_EF-014': dtype('float64'),
 'end_a_EF-015': dtype('float64'),
 'end_a_EF-016': dtype('float64'),
 'end_a_EF-017': dtype('float64'),
 'end_a_EF-018': dtype('float64'),
 'end_a_EF-019': dtype('float64'),
 'end_a_EF-020': dtype('float64'),
 'end_a_EF-021': dtype('float64'),
 'end_a_EF-022': dtype('float64'),
 'end_a_EF-023': dtype('float64'),
 'end_a_EF-025': dtype('float64'),
 'end_a_NONE': dtype('float64'),
 'end_x_1x_N': dtype('float64'),
 'end_x_1x_Y': dtype('float64'),
 'end_x_2x_N': dtype('float64'),
 'end_x_2x_Y': dtype('float64'),
 'end_x_9999': dtype('float64'),
 'end_x_EF-001': dtype('float64'),
 'end_x_EF-002': dtype('float64'),
 'end_x_EF-003': dtype('float64'),
 'end_x_EF-004': dtype('float64'),
 'end_x_EF-005': dtype('float64'),
 'end_x_EF-006': dtype('float64'),
 'end_x_EF-007': dtype('float64'),
 'end_x_EF-008': dtype('float64'),
 'end_x_EF-009': dtype('float64'),
 'end_x_EF-010': dtype('float64'),
 'end_x_EF-011': dtype('float64'),
 'end_x_EF-012': dtype('float64'),
 'end_x_EF-013': dtype('float64'),
 'end_x_EF-014': dtype('float64'),
 'end_x_EF-015': dtype('float64'),
 'end_x_EF-016': dtype('float64'),
 'end_x_EF-017': dtype('float64'),
 'end_x_EF-018': dtype('float64'),
 'end_x_EF-019': dtype('float64'),
 'end_x_EF-021': dtype('float64'),
 'end_x_EF-022': dtype('float64'),
 'end_x_EF-023': dtype('float64'),
 'end_x_EF-024': dtype('float64'),
 'end_x_EF-025': dtype('float64'),
 'end_x_EF-026': dtype('float64'),
 'end_x_NONE': dtype('float64'),
 'length': dtype('float64'),
 'material_id_SP-0008': dtype('float64'),
 'material_id_SP-0019': dtype('float64'),
 'material_id_SP-0028': dtype('float64'),
 'material_id_SP-0029': dtype('float64'),
 'material_id_SP-0030': dtype('float64'),
 'material_id_SP-0031': dtype('float64'),
 'material_id_SP-0032': dtype('float64'),
 'material_id_SP-0033': dtype('float64'),
 'material_id_SP-0034': dtype('float64'),
 'material_id_SP-0035': dtype('float64'),
 'material_id_SP-0036': dtype('float64'),
 'material_id_SP-0037': dtype('float64'),
 'material_id_SP-0038': dtype('float64'),
 'material_id_SP-0039': dtype('float64'),
 'material_id_SP-0041': dtype('float64'),
 'material_id_SP-0044': dtype('float64'),
 'material_id_SP-0045': dtype('float64'),
 'material_id_SP-0046': dtype('float64'),
 'material_id_SP-0048': dtype('float64'),
 'num_bends': dtype('int64'),
 'num_boss': dtype('int64'),
 'num_bracket': dtype('int64'),
 'other': dtype('int64'),
 'tube_assembly_id': dtype('O'),
 'wall': dtype('float64')}

In [25]:
# Drop the "_N" indicator of each Y/N pair, keeping only the "_Y" column
# (presumably the two are complementary -- verify no row lacks both).
tube_material_id_imputed_dummies_drop_ns = (
    tube_material_id_imputed_dummies.drop(
        ['end_a_1x_N', 'end_a_2x_N', 'end_x_1x_N', 'end_x_2x_N'],
        axis=1
    )
)

In [27]:
# Write the encoded table to disk without the row index.
tube_material_id_imputed_dummies_drop_ns.to_csv(
    'tube_material_id_imputed_dummies_drop_ns.csv',
    index=False
)

In [ ]: