In [52]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import xgboost as xgb
In [53]:
# Base tables: `train` carries the `cost` target, `test` the rows to predict.
train, test = map(pd.read_csv, ('train_set_adjusted.csv', 'test_dummies_adjusted.csv'))
# bill_of_materials_data = pd.read_csv('competition_data/bill_of_materials.csv')
# Feature tables, each joined onto train/test by `tube_assembly_id` further down.
tube, spec_dummies, comp_type_dummies, comp_weight = map(
    pd.read_csv,
    (
        'tube_material_id_imputed_dummies_drop_ns.csv',
        'spec_dummies.csv',
        'comp_type_dummies.csv',
        'comp_weight.csv',
    ),
)
In [54]:
# Attach every per-assembly feature table to both frames, in a fixed order so
# train and test end up with their feature columns in the same sequence.
feature_tables = [tube, comp_type_dummies, spec_dummies, comp_weight]
for features in feature_tables:
    # Training rows without a matching feature row are dropped (default inner join).
    train = pd.merge(train, features, on='tube_assembly_id')
    # Test rows must all survive: an inner join would silently drop any row whose
    # assembly is missing from a feature table, and that id would then never get
    # a prediction. A left join keeps the row and leaves NaN in the new columns.
    test = pd.merge(test, features, on='tube_assembly_id', how='left')
In [4]:
# train['year'] = train.quote_date.dt.year
# train['month'] = train.quote_date.dt.month
In [5]:
# test['year'] = test.quote_date.dt.year
# test['month'] = test.quote_date.dt.month
In [55]:
# Columns present in both frames that are removed before modelling.
non_feature_cols = ['tube_assembly_id', 'quote_date', 'quantity_rep']

# Keep the test ids (for the final table) and the training target aside
# before those columns are dropped from the feature frames.
idx = test['id'].values.astype(int)
labels = train['cost'].values

test = test.drop(columns=['id'] + non_feature_cols)
train = train.drop(columns=['cost'] + non_feature_cols)
In [22]:
# train['material_id'].replace(np.nan,' ', regex=True, inplace= True)
# test['material_id'].replace(np.nan,' ', regex=True, inplace= True)
# for i in range(1,9):
# column_label = 'component_id_'+str(i)
# print(column_label)
# train[column_label].replace(np.nan,' ', regex=True, inplace= True)
# test[column_label].replace(np.nan,' ', regex=True, inplace= True)
In [9]:
# train.fillna(0, inplace = True)
# test.fillna(0, inplace = True)
In [58]:
# Preview the first rows of the training features (target and id columns already dropped).
train.head()
Out[58]:
In [59]:
# Preview the first rows of the test features; columns should line up with the training preview.
test.head()
Out[59]:
In [60]:
# From here on both feature sets are plain NumPy arrays (column labels are lost).
train, test = (np.array(frame) for frame in (train, test))
In [61]:
# label encode the categorical variables
# for i in range(train.shape[1]):
# if i in [0,3,5,11,12,13,14,15,16,20,22,24,26,28,30,32,34]:
# print(i,list(train[1:5,i]) + list(test[1:5,i]))
# lbl = preprocessing.LabelEncoder()
# lbl.fit(list(train[:,i]) + list(test[:,i]))
# train[:,i] = lbl.transform(train[:,i])
# test[:,i] = lbl.transform(test[:,i])
# object array to float
# Cast both matrices, not just `train`, so the model is fitted and scored on the
# same numeric dtype. NOTE(review): an object-dtype `test` may be rejected by
# xgb.DMatrix depending on the installed xgboost version — confirm.
train = train.astype(float)
test = test.astype(float)
In [62]:
# Fit on log1p(cost); predictions are mapped back with np.expm1 when they are inspected/blended.
label_log = np.log1p(labels)
In [63]:
# Sanity check: show which container type holds the transformed target.
type(label_log)
Out[63]:
In [64]:
# Display the log1p-transformed target values.
label_log
Out[64]:
In [65]:
# Booster configuration shared by every training run below.
# NOTE(review): `scale_pos_weight` is documented as a class-imbalance weight and
# probably has no effect with a regression objective — confirm before relying on it.
params = {
    "objective": "reg:linear",
    "eta": 0.02,
    "min_child_weight": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 9,
    "max_delta_step": 2,
}
# xgb.train is called with the settings as a list of (name, value) pairs.
plst = [(name, value) for name, value in params.items()]
In [66]:
# Wrap the feature arrays for XGBoost; only the training matrix carries a
# target (the log1p-transformed cost).
xgtest = xgb.DMatrix(data=test)
xgtrain = xgb.DMatrix(data=train, label=label_log)
In [67]:
# Run 1: boost for 2000 rounds with the shared parameter list, then score the
# test matrix. The predictions are on the log1p scale of the training target.
num_rounds = 2000
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)
preds1 = model.predict(xgtest)
In [69]:
# Show the first 16 predictions of run 1, mapped back from the log1p scale.
np.expm1(preds1[:16])
Out[69]:
In [28]:
# Run 2: same parameters and data, 3000 boosting rounds. `model` and
# `num_rounds` are overwritten; only `preds2` is kept for the blend.
num_rounds = 3000
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)
preds2 = model.predict(xgtest)
In [29]:
# Display run 2's predictions mapped back from the log1p scale with expm1.
np.expm1(preds2)
Out[29]:
In [24]:
# Run 3: same parameters and data, 1500 boosting rounds. `model` and
# `num_rounds` are overwritten; only `preds4` is kept for the blend.
num_rounds = 1500
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)
preds4 = model.predict(xgtest)
In [25]:
# Display run 3's raw predictions; unlike the other previews these are NOT passed through expm1.
preds4
Out[25]:
In [26]:
# Blend the three runs with an unweighted mean taken on the log1p scale,
# then map the average back with expm1.
mean_log_pred = (preds1 + preds2 + preds4) / 3
preds = np.expm1(mean_log_pred)
In [27]:
# Display the blended predictions (already passed through expm1).
preds
Out[27]:
In [28]:
# Pair each test id with its blended prediction. Note that `preds` is rebound
# from the prediction array to this two-column table.
submission_columns = {"id": idx, "cost": preds}
preds = pd.DataFrame(submission_columns)
In [29]:
# Display the id/cost table built from the blended predictions.
preds
Out[29]: