In [1]:
# From https://www.kaggle.com/kumareshd/caterpillar-tube-pricing/xgbooost-222

%pylab inline

import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import xgboost as xgb


Populating the interactive namespace from numpy and matplotlib

In [9]:
# load training and test datasets
train = pd.read_csv('data/competition_data/train_set.csv', parse_dates=[2,])
test = pd.read_csv('data/competition_data/test_set.csv', parse_dates=[3,])
tube_data = pd.read_csv('data/competition_data/tube.csv')
bill_of_materials_data = pd.read_csv('data/competition_data/bill_of_materials.csv')
specs_data = pd.read_csv('data/competition_data/specs.csv')

print("train columns")
print(train.columns)
print("test columns")
print(test.columns)
print("tube.csv df columns")
print(tube_data.columns)
print("bill_of_materials.csv df columns")
print(bill_of_materials_data.columns)
print("specs.csv df columns")
print(specs_data.columns)

print(specs_data[2:3])


train columns
Index([u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity', u'cost'],
      dtype='object')
test columns
Index([u'id', u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity'],
      dtype='object')
tube.csv df columns
Index([u'tube_assembly_id', u'material_id', u'diameter', u'wall', u'length',
       u'num_bends', u'bend_radius', u'end_a_1x', u'end_a_2x', u'end_x_1x',
       u'end_x_2x', u'end_a', u'end_x', u'num_boss', u'num_bracket', u'other'],
      dtype='object')
bill_of_materials.csv df columns
Index([u'tube_assembly_id', u'component_id_1', u'quantity_1',
       u'component_id_2', u'quantity_2', u'component_id_3', u'quantity_3',
       u'component_id_4', u'quantity_4', u'component_id_5', u'quantity_5',
       u'component_id_6', u'quantity_6', u'component_id_7', u'quantity_7',
       u'component_id_8', u'quantity_8'],
      dtype='object')
specs.csv df columns
Index([u'tube_assembly_id', u'spec1', u'spec2', u'spec3', u'spec4', u'spec5',
       u'spec6', u'spec7', u'spec8', u'spec9', u'spec10'],
      dtype='object')
  tube_assembly_id spec1 spec2 spec3 spec4 spec5 spec6 spec7 spec8 spec9  \
2         TA-00003   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   

  spec10  
2    NaN  

In [10]:
train = pd.merge(train, tube_data, on ='tube_assembly_id')
train = pd.merge(train, bill_of_materials_data, on ='tube_assembly_id')
test = pd.merge(test, tube_data, on ='tube_assembly_id')
test = pd.merge(test, bill_of_materials_data, on ='tube_assembly_id')

print("new train columns")
print(train.columns)
print(train[1:10])
print(train.columns.to_series().groupby(train.dtypes).groups)


new train columns
Index([u'tube_assembly_id', u'supplier', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'bracket_pricing', u'quantity', u'cost',
       u'material_id', u'diameter', u'wall', u'length', u'num_bends',
       u'bend_radius', u'end_a_1x', u'end_a_2x', u'end_x_1x', u'end_x_2x',
       u'end_a', u'end_x', u'num_boss', u'num_bracket', u'other',
       u'component_id_1', u'quantity_1', u'component_id_2', u'quantity_2',
       u'component_id_3', u'quantity_3', u'component_id_4', u'quantity_4',
       u'component_id_5', u'quantity_5', u'component_id_6', u'quantity_6',
       u'component_id_7', u'quantity_7', u'component_id_8', u'quantity_8'],
      dtype='object')
  tube_assembly_id supplier quote_date  annual_usage  min_order_quantity  \
1         TA-00002   S-0066 2013-07-07             0                   0   
2         TA-00002   S-0066 2013-07-07             0                   0   
3         TA-00002   S-0066 2013-07-07             0                   0   
4         TA-00002   S-0066 2013-07-07             0                   0   
5         TA-00002   S-0066 2013-07-07             0                   0   
6         TA-00002   S-0066 2013-07-07             0                   0   
7         TA-00002   S-0066 2013-07-07             0                   0   
8         TA-00004   S-0066 2013-07-07             0                   0   
9         TA-00004   S-0066 2013-07-07             0                   0   

  bracket_pricing  quantity       cost material_id  diameter    ...      \
1             Yes         2  12.341214     SP-0019      6.35    ...       
2             Yes         5   6.601826     SP-0019      6.35    ...       
3             Yes        10   4.687770     SP-0019      6.35    ...       
4             Yes        25   3.541561     SP-0019      6.35    ...       
5             Yes        50   3.224406     SP-0019      6.35    ...       
6             Yes       100   3.082521     SP-0019      6.35    ...       
7             Yes       250   2.999060     SP-0019      6.35    ...       
8             Yes         1  21.972702     SP-0019      6.35    ...       
9             Yes         2  12.407983     SP-0019      6.35    ...       

   component_id_4  quantity_4  component_id_5  quantity_5 component_id_6  \
1             NaN         NaN             NaN         NaN            NaN   
2             NaN         NaN             NaN         NaN            NaN   
3             NaN         NaN             NaN         NaN            NaN   
4             NaN         NaN             NaN         NaN            NaN   
5             NaN         NaN             NaN         NaN            NaN   
6             NaN         NaN             NaN         NaN            NaN   
7             NaN         NaN             NaN         NaN            NaN   
8             NaN         NaN             NaN         NaN            NaN   
9             NaN         NaN             NaN         NaN            NaN   

  quantity_6 component_id_7 quantity_7 component_id_8 quantity_8  
1        NaN            NaN        NaN            NaN        NaN  
2        NaN            NaN        NaN            NaN        NaN  
3        NaN            NaN        NaN            NaN        NaN  
4        NaN            NaN        NaN            NaN        NaN  
5        NaN            NaN        NaN            NaN        NaN  
6        NaN            NaN        NaN            NaN        NaN  
7        NaN            NaN        NaN            NaN        NaN  
8        NaN            NaN        NaN            NaN        NaN  
9        NaN            NaN        NaN            NaN        NaN  

[9 rows x 39 columns]
{dtype('<M8[ns]'): ['quote_date'], dtype('O'): ['tube_assembly_id', 'supplier', 'bracket_pricing', 'material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'component_id_1', 'component_id_2', 'component_id_3', 'component_id_4', 'component_id_5', 'component_id_6', 'component_id_7', 'component_id_8'], dtype('int64'): ['annual_usage', 'min_order_quantity', 'quantity', 'num_bends', 'num_boss', 'num_bracket', 'other'], dtype('float64'): ['cost', 'diameter', 'wall', 'length', 'bend_radius', 'quantity_1', 'quantity_2', 'quantity_3', 'quantity_4', 'quantity_5', 'quantity_6', 'quantity_7', 'quantity_8']}

In [11]:
# create some new features
train['year'] = train.quote_date.dt.year
train['month'] = train.quote_date.dt.month
test['year'] = test.quote_date.dt.year
test['month'] = test.quote_date.dt.month

In [12]:
# drop useless columns and create labels
idx = test.id.values.astype(int)
test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis = 1)
labels = train.cost.values
#'tube_assembly_id', 'supplier', 'bracket_pricing', 'material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x'
#for some reason material_id cannot be converted to categorical variable
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis = 1)

In [18]:
# Replace missing vals with ' '...
train['material_id'].replace(np.nan,' ', regex=True, inplace= True)
test['material_id'].replace(np.nan,' ', regex=True, inplace= True)
for i in range(1,9):
    column_label = 'component_id_'+str(i)
    print(column_label)
    train[column_label].replace(np.nan,' ', regex=True, inplace= True)
    test[column_label].replace(np.nan,' ', regex=True, inplace= True)


component_id_1
component_id_2
component_id_3
component_id_4
component_id_5
component_id_6
component_id_7
component_id_8

In [20]:
train.fillna(0, inplace = True)
test.fillna(0, inplace = True)

In [22]:
print("train columns")
print(train.columns)


train columns
Index([u'supplier', u'annual_usage', u'min_order_quantity', u'bracket_pricing',
       u'quantity', u'material_id', u'diameter', u'wall', u'length',
       u'num_bends', u'bend_radius', u'end_a_1x', u'end_a_2x', u'end_x_1x',
       u'end_x_2x', u'end_a', u'end_x', u'num_boss', u'num_bracket', u'other',
       u'component_id_1', u'quantity_1', u'component_id_2', u'quantity_2',
       u'component_id_3', u'quantity_3', u'component_id_4', u'quantity_4',
       u'component_id_5', u'quantity_5', u'component_id_6', u'quantity_6',
       u'component_id_7', u'quantity_7', u'component_id_8', u'quantity_8',
       u'year', u'month'],
      dtype='object')

In [23]:
# convert data to numpy array
train = np.array(train)
test = np.array(test)

In [24]:
# label encode the categorical variables
for i in range(train.shape[1]):
    if i in [0,3,5,11,12,13,14,15,16,20,22,24,26,28,30,32,34]:
        print(i,list(train[1:5,i]) + list(test[1:5,i]))
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:,i]) + list(test[:,i]))
        train[:,i] = lbl.transform(train[:,i])
        test[:,i] = lbl.transform(test[:,i])


(0, ['S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066'])
(3, ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'])
(5, ['SP-0019', 'SP-0019', 'SP-0019', 'SP-0019', 'SP-0035', 'SP-0035', 'SP-0035', 'SP-0035'])
(11, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(12, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(13, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(14, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(15, ['EF-008', 'EF-008', 'EF-008', 'EF-008', 'EF-003', 'EF-003', 'EF-003', 'EF-003'])
(16, ['EF-008', 'EF-008', 'EF-008', 'EF-008', 'EF-003', 'EF-003', 'EF-003', 'EF-003'])
(20, ['C-1312', 'C-1312', 'C-1312', 'C-1312', 'C-1622', 'C-1622', 'C-1622', 'C-1622'])
(22, [' ', ' ', ' ', ' ', 'C-1629', 'C-1629', 'C-1629', 'C-1629'])
(24, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(26, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(28, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(30, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(32, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(34, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])

In [25]:
# object array to float
train = train.astype(float)
test = test.astype(float)

In [26]:
# i like to train on log(1+x) for RMSLE ;) 
# The choice is yours :)
label_log = np.log1p(labels)

In [27]:
# fit a random forest model

params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.02
params["min_child_weight"] = 6
params["subsample"] = 0.7
params["colsample_bytree"] = 0.6
params["scale_pos_weight"] = 0.8  # undocumented?!
params["silent"] = 1
params["max_depth"] = 8
params["max_delta_step"] = 2

plst = list(params.items())

xgtrain = xgb.DMatrix(train, label=label_log)
xgtest = xgb.DMatrix(test)

In [29]:
print('2000')

num_rounds = 2000
%time model1 = xgb.train(plst, xgtrain, num_rounds)
%time preds1 = model1.predict(xgtest)


2000
CPU times: user 3min 55s, sys: 1.12 s, total: 3min 57s
Wall time: 2min 37s

In [30]:
print('3000')

num_rounds = 3000
%time model2 = xgb.train(plst, xgtrain, num_rounds)
%time preds2 = model2.predict(xgtest)


3000
CPU times: user 5min 31s, sys: 1.12 s, total: 5min 32s
Wall time: 3min 25s
CPU times: user 1min 1s, sys: 136 ms, total: 1min 1s
Wall time: 38.5 s

In [ ]: