In [1]:
# From https://www.kaggle.com/kumareshd/caterpillar-tube-pricing/xgbooost-222
%pylab inline
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import xgboost as xgb
In [9]:
# load training and test datasets
train = pd.read_csv('data/competition_data/train_set.csv', parse_dates=[2,])
test = pd.read_csv('data/competition_data/test_set.csv', parse_dates=[3,])
tube_data = pd.read_csv('data/competition_data/tube.csv')
bill_of_materials_data = pd.read_csv('data/competition_data/bill_of_materials.csv')
specs_data = pd.read_csv('data/competition_data/specs.csv')
print("train columns")
print(train.columns)
print("test columns")
print(test.columns)
print("tube.csv df columns")
print(tube_data.columns)
print("bill_of_materials.csv df columns")
print(bill_of_materials_data.columns)
print("specs.csv df columns")
print(specs_data.columns)
print(specs_data[2:3])
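Before merging, it can help to eyeball shapes and missingness in the frames just loaded; a small sketch, not part of the original kernel:

# quick sanity check: shapes and the most NaN-heavy columns of each frame
for name, df in [('train', train), ('test', test), ('tube', tube_data),
                 ('bom', bill_of_materials_data), ('specs', specs_data)]:
    print(name, df.shape)
    print(df.isnull().sum().sort_values(ascending=False).head(3))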
In [10]:
train = pd.merge(train, tube_data, on='tube_assembly_id')
train = pd.merge(train, bill_of_materials_data, on='tube_assembly_id')
test = pd.merge(test, tube_data, on='tube_assembly_id')
test = pd.merge(test, bill_of_materials_data, on='tube_assembly_id')
print("new train columns")
print(train.columns)
print(train[1:10])
print(train.columns.to_series().groupby(train.dtypes).groups)
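Both joins are many-to-one: each tube_assembly_id appears once in tube.csv and bill_of_materials.csv but on many quote rows, so the merges cannot duplicate quotes. A quick check of that assumption (is_unique is a standard pandas Series property; newer pandas can also enforce it inline with pd.merge(..., validate='many_to_one')):

# the lookup tables must be unique on the join key, or the merges
# above would multiply quote rows
assert tube_data.tube_assembly_id.is_unique
assert bill_of_materials_data.tube_assembly_id.is_unique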
In [11]:
# create some new features
train['year'] = train.quote_date.dt.year
train['month'] = train.quote_date.dt.month
test['year'] = test.quote_date.dt.year
test['month'] = test.quote_date.dt.month
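If quoting behavior varies within the month, the same .dt accessor offers finer-grained features. A speculative extension, not in the original; these columns land at the end of the frames, so the positional indices used in the encoding cell further down are unaffected:

# possible extra date features (speculative)
for df in (train, test):
    df['dayofweek'] = df.quote_date.dt.dayofweek
    df['dayofyear'] = df.quote_date.dt.dayofyear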
In [12]:
# drop useless columns and create labels
idx = test.id.values.astype(int)
test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis=1)
labels = train.cost.values
# categorical columns: 'tube_assembly_id', 'supplier', 'bracket_pricing', 'material_id',
# 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x'
# for some reason material_id cannot be converted to a categorical variable
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis=1)
In [18]:
# replace missing values in the string id columns with ' ' so the
# label encoder below can treat "missing" as its own level
train['material_id'].replace(np.nan, ' ', regex=True, inplace=True)
test['material_id'].replace(np.nan, ' ', regex=True, inplace=True)
for i in range(1, 9):
    column_label = 'component_id_' + str(i)
    print(column_label)
    train[column_label].replace(np.nan, ' ', regex=True, inplace=True)
    test[column_label].replace(np.nan, ' ', regex=True, inplace=True)
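For reference, the loop above can be collapsed into a single vectorized fill over the same columns; an equivalent sketch:

# equivalent one-pass version of the replacements above
cols = ['material_id'] + ['component_id_%d' % i for i in range(1, 9)]
train[cols] = train[cols].fillna(' ')
test[cols] = test[cols].fillna(' ')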
In [20]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
In [22]:
print("train columns")
print(train.columns)
In [23]:
# convert to numpy arrays (mixed dtypes, so these are object arrays for now)
train = np.array(train)
test = np.array(test)
In [24]:
# label encode the categorical columns; fit on train + test together
# so the encoder sees every level present in either set
for i in range(train.shape[1]):
    if i in [0, 3, 5, 11, 12, 13, 14, 15, 16, 20, 22, 24, 26, 28, 30, 32, 34]:
        print(i, list(train[1:5, i]) + list(test[1:5, i]))
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:, i]) + list(test[:, i]))
        train[:, i] = lbl.transform(train[:, i])
        test[:, i] = lbl.transform(test[:, i])
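The hard-coded index list is brittle if columns are added or reordered. Under the assumption that every object-dtype column is categorical, the indices could instead be computed from the dataframe before the np.array conversion; a hypothetical helper (object_column_indices is not from the kernel):

def object_column_indices(df):
    # positional indices of string (object-dtype) columns, e.g. supplier,
    # material_id and the component_id_* columns in this data
    return [i for i, dtype in enumerate(df.dtypes) if dtype == object]

Calling this on train in the cell before the np.array conversion would reproduce the list without hand-counting columns.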
In [25]:
# object array to float
train = train.astype(float)
test = test.astype(float)
In [26]:
# I like to train on log(1+x) for RMSLE ;)
# The choice is yours :)
label_log = np.log1p(labels)
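The reason this works: RMSLE on cost is exactly plain RMSE on log1p(cost), so a model that minimizes squared error against label_log is minimizing the competition metric directly. A quick sketch (rmsle is a hypothetical helper, not part of the kernel):

def rmsle(y_true, y_pred):
    # root mean squared logarithmic error, the competition metric
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# identical to RMSE computed on the log1p scale:
# rmsle(labels, preds) == sqrt(mean((log1p(preds) - label_log) ** 2))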
In [27]:
# set up the XGBoost parameters (gradient boosted trees, not a random forest)
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.02
params["min_child_weight"] = 6
params["subsample"] = 0.7
params["colsample_bytree"] = 0.6
params["scale_pos_weight"] = 0.8  # meant for binary classification; likely ignored by reg:linear
params["silent"] = 1
params["max_depth"] = 8
params["max_delta_step"] = 2
plst = list(params.items())
xgtrain = xgb.DMatrix(train, label=label_log)
xgtest = xgb.DMatrix(test)
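Instead of trying fixed round counts (the next two cells use 2000 and 3000), xgb.cv can estimate when the validation error stops improving. A sketch, assuming an xgboost version where xgb.cv supports early_stopping_rounds:

# 5-fold CV to pick a round count (illustrative; this is slow)
cv_results = xgb.cv(params, xgtrain, num_boost_round=3000,
                    nfold=5, early_stopping_rounds=50)
print('best round count ~', cv_results.shape[0])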
In [29]:
# first model: 2000 boosting rounds
print('2000')
num_rounds = 2000
%time model1 = xgb.train(plst, xgtrain, num_rounds)
%time preds1 = model1.predict(xgtest)
In [30]:
# second model: 3000 boosting rounds on the same data
print('3000')
num_rounds = 3000
%time model2 = xgb.train(plst, xgtrain, num_rounds)
%time preds2 = model2.predict(xgtest)
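The kernel stops before producing a submission. A plausible finishing step (an assumption, not recovered from the original) is to blend the two models, undo the log1p with expm1, and write the id/cost pairs using the ids saved earlier:

# hypothetical finishing cell: equal-weight blend, back-transform, save
preds = np.expm1(0.5 * preds1 + 0.5 * preds2)  # blend weights are a guess
pd.DataFrame({'id': idx, 'cost': preds}).to_csv('submission.csv', index=False)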