In [2]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb
In [3]:
def rmsle(actual, predicted):
    """Root Mean Squared Logarithmic Error.

    Computes sqrt(mean((log1p(predicted) - log1p(actual))^2)).
    log1p keeps zero-valued targets well-defined.
    """
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt((log_diff ** 2).mean())
In [4]:
# Load the preprocessed training data (produced upstream; path is relative
# to this notebook's directory).
train = pd.read_csv('../preprocessed_train.csv')
In [5]:
# Display (rows, columns) of the loaded training frame.
train.shape
Out[5]:
In [6]:
# Keep only the rows where bracket pricing is not flagged.
train = train.loc[train.bracket_pricing_Yes == 0, :]
In [7]:
# Split by index values rather than by rows directly.
# NOTE(review): the set_index('tube_assembly_id') call below is commented out,
# so `train.index` is whatever index the frame currently carries — presumably
# the default row labels, not assembly ids. If the intent was to keep all rows
# of one tube_assembly_id in the same fold, the set_index line must run first;
# this cell appears to rely on hidden kernel state — confirm.
# train_sub_train_cv, train_sub_test = sk_cv.train_test_split(train, train_size = 0.6, random_state = 234)
# train_sub_train, train_sub_cv = sk_cv.train_test_split(train_sub_train_cv, train_size = 0.7, random_state = 345)
# train.set_index('tube_assembly_id',inplace=True)
train_id = train.index.unique()
# 60% train+cv / 40% test, then 70/30 within train+cv (42% / 18% / 40% overall).
train_sub_train_cv_id, train_sub_test_id = sk_cv.train_test_split(train_id, train_size = 0.6, random_state = 234)
# NOTE(review): the commented-out variant above used random_state=345 for this
# second split; the live code reuses 234 — confirm which was intended.
train_sub_train_id, train_sub_cv_id = sk_cv.train_test_split(train_sub_train_cv_id, train_size = 0.7, random_state = 234)
# Select the rows for each fold by the sampled index labels.
train_sub_train = train.loc[train_sub_train_id,:]
train_sub_cv = train.loc[train_sub_cv_id,:]
train_sub_test = train.loc[train_sub_test_id,:]
In [8]:
# Sanity-check the fold sizes (Python 2 print statements).
print train_sub_train.shape
print train_sub_cv.shape
In [9]:
# XGBoost training parameters; commented-out entries are tuning knobs that
# were tried and left at their library defaults.
params = {
    "objective": "reg:linear",
    # "eta": 0.1,
    # "gamma": 1,
    "min_child_weight": 8,
    # "subsample": 1.0,
    "scale_pos_weight": 1.0,
    # "silent": 1,
    "max_depth": 5,
}
# xgb.train accepts the parameters as a list of (key, value) pairs.
plst = list(params.items())
In [10]:
# Turn the split index back into a regular column so the downstream
# .drop() calls can remove it by name. Rebinding instead of inplace=True:
# same result, but avoids pandas' inplace anti-pattern (no performance
# benefit, and in-place mutation makes cells non-idempotent on re-run).
train_sub_train = train_sub_train.reset_index()
train_sub_cv = train_sub_cv.reset_index()
train_sub_test = train_sub_test.reset_index()
In [16]:
def _features_and_target(df):
    """Split a frame into (feature matrix, target Series).

    Drops the assembly id, the target itself and `quantity` from the
    features; returns the remaining columns as a numpy array together
    with the `cost` column.
    """
    features = df.drop(['tube_assembly_id', 'cost', 'quantity'], axis=1).values
    return features, df.cost

# One helper applied to all three folds (previously three copy-pasted pairs).
X_train, y_train = _features_and_target(train_sub_train)
X_cv, y_cv = _features_and_target(train_sub_cv)
X_test, y_test = _features_and_target(train_sub_test)
In [17]:
# Train on log-transformed targets: minimizing squared error on log1p(cost)
# corresponds to optimizing the RMSLE metric on cost.
xgtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
xgcv = xgb.DMatrix(X_cv)
# Candidate numbers of boosting rounds, scored on the cv fold.
N = [1000]
for i,num_rounds in enumerate(N):
# num_rounds = 120
model = xgb.train(plst, xgtrain, num_rounds)
# expm1 inverts the log1p target transform before scoring.
cv_preds = np.expm1(model.predict(xgcv))
print N[i],rmsle(y_cv, cv_preds)
In [14]:
# Score the held-out test fold with the last model trained above;
# the bare expression displays the RMSLE as the cell output.
test_preds = np.expm1(model.predict(xgb.DMatrix(X_test)))
rmsle(y_test, test_preds)
Out[14]:
In [54]:
# Refit on the full (filtered) training data for the submission model.
# NOTE(review): here 'quote_date' is dropped and 'quantity' is kept, while the
# earlier cv feature sets dropped 'quantity' instead — confirm the feature
# sets are meant to differ between cv and the final fit.
X = train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y = train.cost
xgb_mat = xgb.DMatrix(X, label=np.log1p(y))
# 250 boosting rounds for the final fit (vs the 1000 tried on cv above).
N = [250]
for i,num_rounds in enumerate(N):
# num_rounds = 120
model = xgb.train(plst, xgb_mat, num_rounds)
# cv_preds = np.expm1(model.predict(xgcv))
# print N[i],rmsle(y_cv, cv_preds)
In [22]:
# Kaggle test set with dummy-encoded categoricals (preprocessed upstream).
kaggle_test = pd.read_csv('../test_dummies_adjusted.csv')
In [23]:
# Join auxiliary feature tables onto the test set.
# NOTE(review): `tube`, `spec`, `comp_type_weight` and `tube_vol` are not
# defined anywhere in this notebook — they rely on kernel state left by other
# cells/notebooks, so a Restart-&-Run-All will fail here. These merges use
# pd.merge's default inner join on all shared column names; verify that the
# intended join keys (and no accidental common columns) are used.
kaggle_test_df = pd.merge(kaggle_test, tube)
kaggle_test_df = pd.merge(kaggle_test_df, spec)
kaggle_test_df = pd.merge(kaggle_test_df, comp_type_weight)
kaggle_test_df = pd.merge(kaggle_test_df, tube_vol)
In [24]:
# print kaggle_test_df.id
In [25]:
# Drop non-feature columns; mirrors the final-fit feature set (drops
# 'quote_date', keeps 'quantity') plus the test-only 'id' column.
kaggle_test_vals = kaggle_test_df.drop(['tube_assembly_id', 'quote_date', 'cost', 'id'], axis=1).values
In [55]:
# Predict on the test set and invert the log1p target transform.
kaggle_preds = np.expm1(model.predict(xgb.DMatrix(kaggle_test_vals)))
In [59]:
# Spot-check the first few predictions.
print kaggle_preds[:16]
In [28]:
# Expect one prediction per test row.
print kaggle_preds.shape
In [57]:
# Build the submission frame. Ids are derived from the number of predictions
# instead of the previous hard-coded range(1, 30236) — identical output at the
# expected size, and no silent mismatch if the test set changes.
# NOTE(review): this assumes submission ids are the contiguous sequence 1..N;
# if they can differ, use kaggle_test_df['id'].values instead.
submit = pd.DataFrame({'id' : range(1, len(kaggle_preds) + 1), 'cost' : kaggle_preds})
In [39]:
# Eyeball the last rows before writing the file out.
submit.tail(10)
Out[39]:
In [58]:
# Write the submission with the column order (id, cost) and no index column.
submit.to_csv('../submit_py_xgboost_full_2.csv', index=False, columns=['id', 'cost'])
In [ ]: