notebook.community

Edit and run



In [49]:

    
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import xgboost as xgb



In [50]:

    
# Load the lookup tables first, then the quote tables they will be joined onto.
tube_data = pd.read_csv('competition_data/tube.csv')
bill_of_materials_data = pd.read_csv('competition_data/bill_of_materials.csv')
# NOTE(review): specs_data is loaded but not referenced in the visible cells.
specs_data = pd.read_csv('competition_data/specs.csv')

# parse_dates takes column positions. The parsed column sits one position
# later in the test file than in the train file.
# NOTE(review): presumably this is `quote_date` (its `.dt` accessor is used
# afterwards) and the shift comes from a leading `id` column - confirm.
train = pd.read_csv('competition_data/train_set.csv', parse_dates=[2])
test = pd.read_csv('competition_data/test_set.csv', parse_dates=[3])



In [51]:

    
# Attach tube geometry and bill-of-materials columns to every quote row.
#
# how='left' keeps every quote even when its tube_assembly_id is missing from
# a lookup table. pd.merge defaults to an inner join, which would silently
# drop such rows - for `test` that means ids missing from the final
# predictions. Unmatched rows get NaN feature values instead.
train = pd.merge(train, tube_data, on='tube_assembly_id', how='left')
train = pd.merge(train, bill_of_materials_data, on='tube_assembly_id', how='left')

test = pd.merge(test, tube_data, on='tube_assembly_id', how='left')
test = pd.merge(test, bill_of_materials_data, on='tube_assembly_id', how='left')



In [52]:

    
# Split the parsed quote date into numeric year and month feature columns.
train_quote_date = train['quote_date']
train['year'] = train_quote_date.dt.year
train['month'] = train_quote_date.dt.month



In [53]:

    
# Same year/month split for the test quotes, so both frames gain the same
# two trailing columns in the same order.
test_quote_date = test['quote_date']
test['year'] = test_quote_date.dt.year
test['month'] = test_quote_date.dt.month



In [54]:

    
# Keep the test ids aside (as ints) before the id column is dropped.
idx = test['id'].values.astype(int)
test = test.drop(columns=['id', 'tube_assembly_id', 'quote_date'])

# The training target is the quoted cost; remove it and the non-feature
# columns from the feature frame.
labels = train['cost'].values
train = train.drop(columns=['quote_date', 'cost', 'tube_assembly_id'])



In [55]:

    
# Replace missing values in the id-like string columns with a single-space
# placeholder, so they stay strings when the remaining NaNs are filled later.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: in-place mutation of a selected column
# is chained assignment, which recent pandas warns about and which does not
# modify the parent frame under Copy-on-Write. fillna states the intent
# directly (the old regex=True flag had no meaning for a NaN target).
train['material_id'] = train['material_id'].fillna(' ')
test['material_id'] = test['material_id'].fillna(' ')
for i in range(1, 9):
    column_label = 'component_id_' + str(i)
    print(column_label)
    train[column_label] = train[column_label].fillna(' ')
    test[column_label] = test[column_label].fillna(' ')









    



component_id_1
component_id_2
component_id_3
component_id_4
component_id_5
component_id_6
component_id_7
component_id_8



In [56]:

    
# Any NaN still left in either frame becomes 0.
train = train.fillna(0)
test = test.fillna(0)



In [60]:

    
# Sanity check: display the (rows, columns) dimensions of `train`.
train.shape









    Out[60]:





(30213, 38)



In [58]:

    
# Preview the first rows of `train` to eyeball the columns and their order.
train.head()









    Out[58]:






  
    
      
      supplier
      annual_usage
      min_order_quantity
      bracket_pricing
      quantity
      material_id
      diameter
      wall
      length
      num_bends
      ...
      component_id_5
      quantity_5
      component_id_6
      quantity_6
      component_id_7
      quantity_7
      component_id_8
      quantity_8
      year
      month
    
  
  
    
      0
      S-0066
      0
      0
      Yes
      1
      SP-0019
      6.35
      0.71
      137
      8
      ...
      
      0
      
      0
      
      0
      
      0
      2013
      7
    
    
      1
      S-0066
      0
      0
      Yes
      2
      SP-0019
      6.35
      0.71
      137
      8
      ...
      
      0
      
      0
      
      0
      
      0
      2013
      7
    
    
      2
      S-0066
      0
      0
      Yes
      5
      SP-0019
      6.35
      0.71
      137
      8
      ...
      
      0
      
      0
      
      0
      
      0
      2013
      7
    
    
      3
      S-0066
      0
      0
      Yes
      10
      SP-0019
      6.35
      0.71
      137
      8
      ...
      
      0
      
      0
      
      0
      
      0
      2013
      7
    
    
      4
      S-0066
      0
      0
      Yes
      25
      SP-0019
      6.35
      0.71
      137
      8
      ...
      
      0
      
      0
      
      0
      
      0
      2013
      7
    
  

5 rows × 38 columns



In [61]:

    
# From here on `train` and `test` are plain NumPy arrays and columns are
# addressed by position rather than by name.
train, test = np.array(train), np.array(test)



In [62]:

    
# Label-encode the categorical columns, identified by their position in the
# arrays (the positions depend on the column order of the frames built above).
categorical_positions = {0, 3, 5, 11, 12, 13, 14, 15, 16,
                         20, 22, 24, 26, 28, 30, 32, 34}
for i in sorted(p for p in categorical_positions if p < train.shape[1]):
    train_col = train[:, i]
    test_col = test[:, i]
    # Show a few raw values from each array as a visual check.
    print(i, list(train_col[1:5]) + list(test_col[1:5]))
    # Fit on train and test values together so every category seen in either
    # array gets a code and transform() does not hit unseen labels.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_col) + list(test_col))
    train[:, i] = lbl.transform(train_col)
    test[:, i] = lbl.transform(test_col)


# Cast both arrays to float now that the encoded columns hold numeric codes.
train = train.astype(float)
test = test.astype(float)









    



(0, ['S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066', 'S-0066'])
(3, ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'])
(5, ['SP-0019', 'SP-0019', 'SP-0019', 'SP-0019', 'SP-0035', 'SP-0035', 'SP-0035', 'SP-0035'])
(11, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(12, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(13, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(14, ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'])
(15, ['EF-008', 'EF-008', 'EF-008', 'EF-008', 'EF-003', 'EF-003', 'EF-003', 'EF-003'])
(16, ['EF-008', 'EF-008', 'EF-008', 'EF-008', 'EF-003', 'EF-003', 'EF-003', 'EF-003'])
(20, ['C-1312', 'C-1312', 'C-1312', 'C-1312', 'C-1622', 'C-1622', 'C-1622', 'C-1622'])
(22, [' ', ' ', ' ', ' ', 'C-1629', 'C-1629', 'C-1629', 'C-1629'])
(24, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(26, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(28, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(30, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(32, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])
(34, [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '])



In [65]:

    
# Display the first row of the `train` array as a spot check of its values.
train[0,]









    Out[65]:





array([  4.10000000e+01,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   1.00000000e+00,   2.00000000e+00,
         6.35000000e+00,   7.10000000e-01,   1.37000000e+02,
         8.00000000e+00,   1.90500000e+01,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         7.00000000e+00,   8.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.05000000e+02,
         2.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         2.01300000e+03,   7.00000000e+00])



In [73]:

    
# Display the dimensions of the `train` array.
train.shape









    Out[73]:





(30213, 38)



In [66]:

    
# Model log(1 + cost) instead of raw cost; predictions are mapped back to the
# cost scale with np.expm1 further down.
# NOTE(review): presumably chosen to match a log-error metric - confirm.
label_log = np.log1p(labels)



In [67]:

    
# Display the container type of the transformed labels.
type(label_log)









    Out[67]:





numpy.ndarray



In [68]:

    
# Display the log1p-transformed target values.
label_log









    Out[68]:





array([ 3.13139596,  2.59085804,  2.0283885 , ...,  1.80272076,
        2.9556465 ,  4.00037493])



In [69]:

    
# XGBoost hyper-parameters, declared in one literal so the whole configuration
# can be read at a glance.
# NOTE(review): "scale_pos_weight" is normally a class-imbalance setting;
# confirm it is intended alongside a regression objective.
params = {
    "objective": "reg:linear",
    "eta": 0.02,
    "min_child_weight": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 9,
    "max_delta_step": 2,
}

# The training cells pass the parameters as a list of (name, value) pairs.
plst = list(params.items())



In [70]:

    
# Wrap the feature arrays in DMatrix objects; only the training matrix
# carries labels (the log1p-transformed target).
xgtrain = xgb.DMatrix(data=train, label=label_log)
xgtest = xgb.DMatrix(data=test)



In [71]:

    
# First model: 2000 boosting rounds. Predictions stay on the log1p scale.
num_rounds = 2000
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)
preds1 = model.predict(xgtest)



In [72]:

    
# Undo the log1p transform to inspect the first model's predictions as costs.
np.expm1(preds1)









    Out[72]:





array([ 22.92417717,  13.17559624,   7.35880375, ...,   8.84174728,
        43.01922607,  50.5223465 ], dtype=float32)



In [ ]:

    
# Second model: same parameters, 3000 boosting rounds. Note that `model` is
# rebound here, replacing the previously trained booster.
num_rounds = 3000
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)
preds2 = model.predict(xgtest)



In [ ]:

    
# Display the second model's raw predictions (still on the log1p scale).
preds2



In [ ]:

    
# Third model: same parameters, 1500 boosting rounds. Its predictions are
# stored as `preds4` (there is no `preds3` in the visible cells).
num_rounds = 1500
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)
preds4 = model.predict(xgtest)



In [ ]:

    
# Display the third model's raw predictions (still on the log1p scale).
preds4



In [ ]:

    
# Average the three models on the log1p scale, then map the mean back to the
# cost scale.
log_scale_mean = (preds1 + preds2 + preds4) / 3
preds = np.expm1(log_scale_mean)



In [ ]:

    
# Display the current value of `preds`.
preds



In [ ]:

    
# Pair each test id with its predicted cost.
#
# The column order is pinned explicitly. Without `columns=`, the order follows
# the dict's key ordering, which older Python/pandas versions sort
# alphabetically ("cost" before "id").
# NOTE: this rebinds `preds` from an array to a DataFrame, so the cell is not
# safe to re-run without first re-running the averaging cell.
preds = pd.DataFrame({"id": idx, "cost": preds}, columns=["id", "cost"])



In [ ]:

    
# Display the id/cost prediction table.
preds

	supplier	bracket_pricing	quantity	material_id	diameter	wall	length	num_bends	...	year	month
0	S-0066	Yes	1	SP-0019	6.35	0.71	137	8	...	2013	7
1	S-0066	Yes	2	SP-0019	6.35	0.71	137	8	...	2013	7
2	S-0066	Yes	5	SP-0019	6.35	0.71	137	8	...	2013	7
3	S-0066	Yes	10	SP-0019	6.35	0.71	137	8	...	2013	7
4	S-0066	Yes	25	SP-0019	6.35	0.71	137	8	...	2013	7