In [52]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import xgboost as xgb

In [53]:
train = pd.read_csv('train_set_adjusted.csv')

test = pd.read_csv('test_dummies_adjusted.csv')

tube = pd.read_csv('tube_material_id_imputed_dummies_drop_ns.csv')
# bill_of_materials_data = pd.read_csv('competition_data/bill_of_materials.csv')
spec_dummies = pd.read_csv('spec_dummies.csv')
comp_type_dummies = pd.read_csv('comp_type_dummies.csv')
comp_weight = pd.read_csv('comp_weight.csv')

In [54]:
train = pd.merge(train, tube, on='tube_assembly_id')
train = pd.merge(train, comp_type_dummies, on='tube_assembly_id')
train = pd.merge(train, spec_dummies, on='tube_assembly_id')
train = pd.merge(train, comp_weight, on='tube_assembly_id')

test = pd.merge(test, tube, on='tube_assembly_id')
test = pd.merge(test, comp_type_dummies, on='tube_assembly_id')
test = pd.merge(test, spec_dummies, on='tube_assembly_id')
test = pd.merge(test, comp_weight, on='tube_assembly_id')

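In [ ]:
# Optional check (assumption: tube, spec_dummies, comp_type_dummies and comp_weight
# each carry one row per tube_assembly_id, so the merges above should not duplicate
# quote rows or introduce missing values).
for name, df in [('tube', tube), ('spec_dummies', spec_dummies),
                 ('comp_type_dummies', comp_type_dummies), ('comp_weight', comp_weight)]:
    print(name, 'unique key:', df['tube_assembly_id'].is_unique)
print('train rows:', len(train), 'test rows:', len(test))
print('NaNs in train:', train.isnull().values.sum(), '| NaNs in test:', test.isnull().values.sum())
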
In [4]:
# train['year'] = train.quote_date.dt.year
# train['month'] = train.quote_date.dt.month

In [5]:
# test['year'] = test.quote_date.dt.year
# test['month'] = test.quote_date.dt.month

In [55]:
idx = test.id.values.astype(int)
test = test.drop(['id', 'tube_assembly_id', 'quote_date', 'quantity_rep'], axis=1)

labels = train.cost.values
train = train.drop(['quote_date', 'cost', 'tube_assembly_id', 'quantity_rep'], axis=1)

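In [ ]:
# Optional check (not part of the original run): train and test are later fed to
# xgboost as bare arrays, so they should share the same feature columns in the same
# order. The shape/head outputs below (260 train columns vs 261 test columns) suggest
# they may differ, e.g. test still carrying a zero-filled 'cost' column, so print any
# mismatch here.
print('only in train:', sorted(set(train.columns) - set(test.columns)))
print('only in test:', sorted(set(test.columns) - set(train.columns)))
# One possible fix (assumption, commented out): keep the shared columns, in train's order.
# common_cols = [c for c in train.columns if c in test.columns]
# train, test = train[common_cols], test[common_cols]
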
In [22]:
# train['material_id'].replace(np.nan,' ', regex=True, inplace= True)
# test['material_id'].replace(np.nan,' ', regex=True, inplace= True)
# for i in range(1,9):
#     column_label = 'component_id_'+str(i)
#     print(column_label)
#     train[column_label].replace(np.nan,' ', regex=True, inplace= True)
#     test[column_label].replace(np.nan,' ', regex=True, inplace= True)

In [9]:
# train.fillna(0, inplace = True)
# test.fillna(0, inplace = True)

In [58]:
train.shape


Out[58]:
(30213, 260)

In [59]:
test.head()


Out[59]:
annual_usage cost min_order_quantity quantity year supplier_S-0003 supplier_S-0004 supplier_S-0005 supplier_S-0006 supplier_S-0007 ... SP-0084 SP-0085 SP-0086 SP-0087 SP-0088 SP-0091 SP-0092 SP-0094 SP-0096 weight
0 0 0 0 1 2013 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.096
1 0 0 0 2 2013 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.096
2 0 0 0 5 2013 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.096
3 0 0 0 10 2013 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.096
4 0 0 0 25 2013 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.096

5 rows × 261 columns


In [60]:
train = np.array(train)
test = np.array(test)

In [61]:
# label encode the categorical variables
# for i in range(train.shape[1]):
#     if i in [0,3,5,11,12,13,14,15,16,20,22,24,26,28,30,32,34]:
#         print(i,list(train[1:5,i]) + list(test[1:5,i]))
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(list(train[:,i]) + list(test[:,i]))
#         train[:,i] = lbl.transform(train[:,i])
#         test[:,i] = lbl.transform(test[:,i])


# object-dtype array to float (the test array is already numeric at this point,
# so the cast is skipped for it)
train = train.astype(float)
# test = test.astype(float)

In [62]:
label_log = np.log1p(labels)

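In [ ]:
# Why log1p: this data looks like the Caterpillar Tube Pricing competition, which is
# scored on RMSLE, so minimising squared error on log1p(cost) and mapping predictions
# back with expm1 (as done below) lines the training objective up with the metric.
# rmsle() is an illustrative helper, not part of the original notebook.
def rmsle(y_true, y_pred):
    # root mean squared logarithmic error
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# round trip: expm1 undoes log1p
assert np.allclose(np.expm1(label_log), labels)
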
In [63]:
type(label_log)


Out[63]:
numpy.ndarray

In [64]:
label_log


Out[64]:
array([ 3.13139596,  2.59085804,  2.0283885 , ...,  1.80272076,
        2.9556465 ,  4.00037493])

In [65]:
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.02
params["min_child_weight"] = 5
params["subsample"] = 0.7
params["colsample_bytree"] = 0.6
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 9
params["max_delta_step"] = 2
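# Notes on the choices above (assumptions about xgboost versions/behaviour, not
# verified in this run):
# - "reg:linear" and "silent" are the old parameter names; newer xgboost releases
#   rename them to "reg:squarederror" and "verbosity".
# - "scale_pos_weight" is documented for binary classification, so it likely has
#   no effect under a regression objective.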

plst = list(params.items())

In [66]:
xgtrain = xgb.DMatrix(train, label=label_log)
xgtest = xgb.DMatrix(test)

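In [ ]:
# Optional alternative (a sketch, not what was run here): rather than the fixed
# num_rounds values tried below (2000, 3000, 1500), xgb.cv with early stopping can
# pick the number of boosting rounds from the data; nfold and early_stopping_rounds
# are illustrative choices.
# cv_results = xgb.cv(params, xgtrain, num_boost_round=3000, nfold=5,
#                     early_stopping_rounds=50, seed=0)
# print(len(cv_results))  # rounds kept after early stopping
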
In [67]:
num_rounds = 2000
model = xgb.train(plst, xgtrain, num_rounds)
preds1 = model.predict(xgtest)

In [69]:
np.expm1(preds1)[:16]


Out[69]:
array([ 19.56751442,  19.56751442,  19.56751442,  19.56751442,
        19.56751442,  19.56751442,  19.56751442,  19.56751442,
        22.41098595,  22.41098595,  22.41098595,  22.41098595,
        22.41098595,  22.41098595,  22.41098595,  22.41098595], dtype=float32)

In [28]:
num_rounds = 3000
model = xgb.train(plst, xgtrain, num_rounds)
preds2 = model.predict(xgtest)

In [29]:
np.expm1(preds2)


Out[29]:
array([ 20.13894272,  20.13894272,  20.13894272, ...,   4.25420189,
        20.13955688,  20.13955688], dtype=float32)

In [24]:
num_rounds = 1500
model = xgb.train(plst, xgtrain, num_rounds)
preds4 = model.predict(xgtest)

In [25]:
preds4


Out[25]:
array([ 3.18315363,  2.66007471,  2.13119411, ...,  2.28685904,
        3.78465104,  3.94792008], dtype=float32)

In [26]:
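# preds1, preds2 and preds4 are still on the log1p scale, so they are averaged first
# and only then mapped back with expm1 (i.e. the blend happens in log space, roughly
# a geometric mean of the back-transformed predictions)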
preds = np.expm1((preds1 + preds2 + preds4) / 3)

In [27]:
preds


Out[27]:
array([ 22.92672729,  13.17769241,   7.3621645 , ...,   8.84170914,
        42.96528244,  50.57537842], dtype=float32)

In [28]:
preds = pd.DataFrame({"id": idx, "cost": preds})

In [29]:
preds


Out[29]:
cost id
0 22.926727 1
1 13.177692 2
2 7.362164 3
3 5.504807 4
4 4.297100 5
5 3.949535 6
6 3.831894 7
7 3.751457 8
8 21.937353 9
9 12.263498 10
10 6.461926 11
11 4.516992 12
12 3.392853 13
13 3.098706 14
14 2.951760 15
15 2.876397 16
16 28.235407 17
17 16.407301 18
18 9.377083 19
19 6.975800 20
20 5.581512 21
21 5.182198 22
22 4.996856 23
23 4.859473 24
24 23.196119 25
25 13.409578 26
26 7.536754 27
27 5.703030 28
28 4.543425 29
29 4.169497 30
... ... ...
30205 6.078804 30206
30206 5.729975 30207
30207 4.975499 30208
30208 13.354907 30209
30209 10.025105 30210
30210 2.963256 30211
30211 3.130186 30212
30212 4.455911 30213
30213 2.121010 30214
30214 52.896217 30215
30215 45.183983 30216
30216 31.145456 30217
30217 18.824589 30218
30218 12.752092 30219
30219 10.090666 30220
30220 7.562262 30221
30221 7.174855 30222
30222 7.030894 30223
30223 6.853220 30224
30224 12.207335 30225
30225 18.200567 30226
30226 11.094241 30227
30227 7.831469 30228
30228 21.181284 30229
30229 6.454031 30230
30230 8.676934 30231
30231 6.573697 30232
30232 8.841709 30233
30233 42.965282 30234
30234 50.575378 30235

30235 rows × 2 columns
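
In [ ]:
# Final step (not shown in the original cells): write a Kaggle-style submission file.
# The filename is an assumption.
preds.to_csv('xgboost_log_average_submission.csv', index=False)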