In [1]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

Train set has dummied suppliers and bracket_pricing. New feature year and quantity_rep.


In [2]:
train_set = pd.read_csv('./train_set_adjusted.csv')

In [6]:
print train_set.shape
# train_set.dtypes


(30213, 66)

In [8]:
# train_set.sum()

In [9]:
tube = pd.read_csv('./tube_material_id_imputed_dummies_drop_ns.csv')

In [19]:
print tube.shape
print tube.columns


(21198, 84)
Index([u'tube_assembly_id', u'diameter', u'wall', u'length', u'num_bends',
       u'bend_radius', u'num_boss', u'num_bracket', u'other',
       u'material_id_SP-0008', u'material_id_SP-0019', u'material_id_SP-0028',
       u'material_id_SP-0029', u'material_id_SP-0030', u'material_id_SP-0031',
       u'material_id_SP-0032', u'material_id_SP-0033', u'material_id_SP-0034',
       u'material_id_SP-0035', u'material_id_SP-0036', u'material_id_SP-0037',
       u'material_id_SP-0038', u'material_id_SP-0039', u'material_id_SP-0041',
       u'material_id_SP-0044', u'material_id_SP-0045', u'material_id_SP-0046',
       u'material_id_SP-0048', u'end_a_1x_Y', u'end_a_2x_Y', u'end_x_1x_Y',
       u'end_x_2x_Y', u'end_a_EF-001', u'end_a_EF-002', u'end_a_EF-003',
       u'end_a_EF-004', u'end_a_EF-005', u'end_a_EF-006', u'end_a_EF-007',
       u'end_a_EF-008', u'end_a_EF-009', u'end_a_EF-010', u'end_a_EF-011',
       u'end_a_EF-012', u'end_a_EF-013', u'end_a_EF-014', u'end_a_EF-015',
       u'end_a_EF-016', u'end_a_EF-017', u'end_a_EF-018', u'end_a_EF-019',
       u'end_a_EF-020', u'end_a_EF-021', u'end_a_EF-022', u'end_a_EF-023',
       u'end_a_EF-025', u'end_a_NONE', u'end_x_9999', u'end_x_EF-001',
       u'end_x_EF-002', u'end_x_EF-003', u'end_x_EF-004', u'end_x_EF-005',
       u'end_x_EF-006', u'end_x_EF-007', u'end_x_EF-008', u'end_x_EF-009',
       u'end_x_EF-010', u'end_x_EF-011', u'end_x_EF-012', u'end_x_EF-013',
       u'end_x_EF-014', u'end_x_EF-015', u'end_x_EF-016', u'end_x_EF-017',
       u'end_x_EF-018', u'end_x_EF-019', u'end_x_EF-021', u'end_x_EF-022',
       u'end_x_EF-023', u'end_x_EF-024', u'end_x_EF-025', u'end_x_EF-026',
       u'end_x_NONE'],
      dtype='object')

In [13]:
spec = pd.read_csv('./spec_dummies.csv')

In [14]:
print spec.shape
print spec.columns


(21198, 86)
Index([u'tube_assembly_id', u'SP-0001', u'SP-0002', u'SP-0003', u'SP-0004',
       u'SP-0005', u'SP-0006', u'SP-0007', u'SP-0009', u'SP-0010', u'SP-0011',
       u'SP-0012', u'SP-0013', u'SP-0014', u'SP-0015', u'SP-0016', u'SP-0017',
       u'SP-0018', u'SP-0019', u'SP-0020', u'SP-0021', u'SP-0022', u'SP-0023',
       u'SP-0024', u'SP-0025', u'SP-0026', u'SP-0027', u'SP-0028', u'SP-0029',
       u'SP-0030', u'SP-0033', u'SP-0035', u'SP-0036', u'SP-0037', u'SP-0038',
       u'SP-0039', u'SP-0040', u'SP-0042', u'SP-0043', u'SP-0044', u'SP-0046',
       u'SP-0047', u'SP-0049', u'SP-0050', u'SP-0051', u'SP-0052', u'SP-0053',
       u'SP-0054', u'SP-0055', u'SP-0056', u'SP-0057', u'SP-0058', u'SP-0059',
       u'SP-0060', u'SP-0061', u'SP-0062', u'SP-0063', u'SP-0064', u'SP-0065',
       u'SP-0066', u'SP-0067', u'SP-0068', u'SP-0069', u'SP-0070', u'SP-0071',
       u'SP-0072', u'SP-0073', u'SP-0074', u'SP-0075', u'SP-0076', u'SP-0077',
       u'SP-0078', u'SP-0079', u'SP-0080', u'SP-0081', u'SP-0082', u'SP-0083',
       u'SP-0084', u'SP-0085', u'SP-0086', u'SP-0087', u'SP-0088', u'SP-0091',
       u'SP-0092', u'SP-0094', u'SP-0096'],
      dtype='object')

In [15]:
comp_type_weight = pd.read_csv('./comp_type_dummies.csv')

In [16]:
print comp_type_weight.shape
print comp_type_weight.columns


(21198, 30)
Index([u'tube_assembly_id', u'CP-001', u'CP-002', u'CP-003', u'CP-004',
       u'CP-005', u'CP-006', u'CP-007', u'CP-008', u'CP-009', u'CP-010',
       u'CP-011', u'CP-012', u'CP-014', u'CP-015', u'CP-016', u'CP-017',
       u'CP-018', u'CP-019', u'CP-020', u'CP-021', u'CP-022', u'CP-023',
       u'CP-024', u'CP-025', u'CP-026', u'CP-027', u'CP-028', u'CP-029',
       u'OTHER'],
      dtype='object')

In [17]:
tube_vol = pd.read_csv('./tube_volume.csv')

In [18]:
print tube_vol.shape
print tube_vol.columns


(21198, 2)
Index([u'tube_assembly_id', u'tube_volume'], dtype='object')

In [23]:
train = pd.merge(train_set, tube)
train = pd.merge(train, spec)
train = pd.merge(train, comp_type_weight)
train = pd.merge(train, tube_vol)
train = train.drop(['quote_date', 'quantity_rep'], axis=1)

In [24]:
print train.shape
print train.columns.values


(30213, 262)
['tube_assembly_id' 'annual_usage' 'min_order_quantity' 'quantity' 'cost'
 'year' 'supplier_S-0003' 'supplier_S-0004' 'supplier_S-0005'
 'supplier_S-0006' 'supplier_S-0007' 'supplier_S-0008' 'supplier_S-0009'
 'supplier_S-0011' 'supplier_S-0012' 'supplier_S-0013' 'supplier_S-0014'
 'supplier_S-0015' 'supplier_S-0018' 'supplier_S-0022' 'supplier_S-0023'
 'supplier_S-0024' 'supplier_S-0025' 'supplier_S-0026' 'supplier_S-0027'
 'supplier_S-0029' 'supplier_S-0030' 'supplier_S-0031' 'supplier_S-0041'
 'supplier_S-0042' 'supplier_S-0043' 'supplier_S-0046' 'supplier_S-0050'
 'supplier_S-0051' 'supplier_S-0054' 'supplier_S-0056' 'supplier_S-0058'
 'supplier_S-0059' 'supplier_S-0060' 'supplier_S-0061' 'supplier_S-0062'
 'supplier_S-0064' 'supplier_S-0066' 'supplier_S-0068' 'supplier_S-0070'
 'supplier_S-0072' 'supplier_S-0074' 'supplier_S-0078' 'supplier_S-0080'
 'supplier_S-0081' 'supplier_S-0087' 'supplier_S-0090' 'supplier_S-0092'
 'supplier_S-0095' 'supplier_S-0096' 'supplier_S-0097' 'supplier_S-0104'
 'supplier_S-0105' 'supplier_S-0106' 'supplier_S-0107' 'supplier_S-0108'
 'supplier_S-0109' 'supplier_S-0111' 'bracket_pricing_Yes' 'diameter'
 'wall' 'length' 'num_bends' 'bend_radius' 'num_boss' 'num_bracket' 'other'
 'material_id_SP-0008' 'material_id_SP-0019' 'material_id_SP-0028'
 'material_id_SP-0029' 'material_id_SP-0030' 'material_id_SP-0031'
 'material_id_SP-0032' 'material_id_SP-0033' 'material_id_SP-0034'
 'material_id_SP-0035' 'material_id_SP-0036' 'material_id_SP-0037'
 'material_id_SP-0038' 'material_id_SP-0039' 'material_id_SP-0041'
 'material_id_SP-0044' 'material_id_SP-0045' 'material_id_SP-0046'
 'material_id_SP-0048' 'end_a_1x_Y' 'end_a_2x_Y' 'end_x_1x_Y' 'end_x_2x_Y'
 'end_a_EF-001' 'end_a_EF-002' 'end_a_EF-003' 'end_a_EF-004' 'end_a_EF-005'
 'end_a_EF-006' 'end_a_EF-007' 'end_a_EF-008' 'end_a_EF-009' 'end_a_EF-010'
 'end_a_EF-011' 'end_a_EF-012' 'end_a_EF-013' 'end_a_EF-014' 'end_a_EF-015'
 'end_a_EF-016' 'end_a_EF-017' 'end_a_EF-018' 'end_a_EF-019' 'end_a_EF-020'
 'end_a_EF-021' 'end_a_EF-022' 'end_a_EF-023' 'end_a_EF-025' 'end_a_NONE'
 'end_x_9999' 'end_x_EF-001' 'end_x_EF-002' 'end_x_EF-003' 'end_x_EF-004'
 'end_x_EF-005' 'end_x_EF-006' 'end_x_EF-007' 'end_x_EF-008' 'end_x_EF-009'
 'end_x_EF-010' 'end_x_EF-011' 'end_x_EF-012' 'end_x_EF-013' 'end_x_EF-014'
 'end_x_EF-015' 'end_x_EF-016' 'end_x_EF-017' 'end_x_EF-018' 'end_x_EF-019'
 'end_x_EF-021' 'end_x_EF-022' 'end_x_EF-023' 'end_x_EF-024' 'end_x_EF-025'
 'end_x_EF-026' 'end_x_NONE' 'SP-0001' 'SP-0002' 'SP-0003' 'SP-0004'
 'SP-0005' 'SP-0006' 'SP-0007' 'SP-0009' 'SP-0010' 'SP-0011' 'SP-0012'
 'SP-0013' 'SP-0014' 'SP-0015' 'SP-0016' 'SP-0017' 'SP-0018' 'SP-0019'
 'SP-0020' 'SP-0021' 'SP-0022' 'SP-0023' 'SP-0024' 'SP-0025' 'SP-0026'
 'SP-0027' 'SP-0028' 'SP-0029' 'SP-0030' 'SP-0033' 'SP-0035' 'SP-0036'
 'SP-0037' 'SP-0038' 'SP-0039' 'SP-0040' 'SP-0042' 'SP-0043' 'SP-0044'
 'SP-0046' 'SP-0047' 'SP-0049' 'SP-0050' 'SP-0051' 'SP-0052' 'SP-0053'
 'SP-0054' 'SP-0055' 'SP-0056' 'SP-0057' 'SP-0058' 'SP-0059' 'SP-0060'
 'SP-0061' 'SP-0062' 'SP-0063' 'SP-0064' 'SP-0065' 'SP-0066' 'SP-0067'
 'SP-0068' 'SP-0069' 'SP-0070' 'SP-0071' 'SP-0072' 'SP-0073' 'SP-0074'
 'SP-0075' 'SP-0076' 'SP-0077' 'SP-0078' 'SP-0079' 'SP-0080' 'SP-0081'
 'SP-0082' 'SP-0083' 'SP-0084' 'SP-0085' 'SP-0086' 'SP-0087' 'SP-0088'
 'SP-0091' 'SP-0092' 'SP-0094' 'SP-0096' 'CP-001' 'CP-002' 'CP-003'
 'CP-004' 'CP-005' 'CP-006' 'CP-007' 'CP-008' 'CP-009' 'CP-010' 'CP-011'
 'CP-012' 'CP-014' 'CP-015' 'CP-016' 'CP-017' 'CP-018' 'CP-019' 'CP-020'
 'CP-021' 'CP-022' 'CP-023' 'CP-024' 'CP-025' 'CP-026' 'CP-027' 'CP-028'
 'CP-029' 'OTHER' 'tube_volume']

In [26]:
train.to_csv('preprocessed_train.csv', index=False)

In [ ]: