In [1]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb


Populating the interactive namespace from numpy and matplotlib

In [2]:
def rmsle(actual, predicted):
    # Root Mean Squared Logarithmic Error -- the competition's evaluation metric
    error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(error)))
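
RMSLE penalizes relative error rather than absolute error, which suits the wide range of quote costs. A quick sanity check of the helper (hypothetical inputs, not taken from the data):

    # identical predictions give zero error; a fixed absolute miss on a cheap
    # part costs far more than the same miss on an expensive one
    print rmsle(np.array([1.0, 10.0, 100.0]), np.array([1.0, 10.0, 100.0]))  # 0.0
    print rmsle(np.array([1.0, 10.0, 100.0]), np.array([2.0, 11.0, 101.0]))  # dominated by the first term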

In [3]:
train_df = pd.read_csv('../train_set_adjusted.csv')

In [4]:
# print train_df.shape
# print train_df.columns

In [5]:
tube = pd.read_csv('../tube_material_id_imputed_dummies_drop_ns.csv')

In [6]:
# print tube.shape
# print tube.columns

In [7]:
spec = pd.read_csv('../spec_dummies.csv')

In [8]:
# print spec.shape
# print spec.columns

In [9]:
comp = pd.read_csv('../comp_type_dummies.csv')

In [10]:
# print comp.shape
# print comp.columns

In [11]:
comp_type_weight = pd.read_csv('../comp_type_weight.csv')

In [12]:
# print comp_type_weight.shape
# print comp_type_weight.columns

In [13]:
tube_vol = pd.read_csv('../tube_volume.csv')

In [14]:
# merge the feature tables; pd.merge joins on the columns shared between
# the two frames by default (here, the tube_assembly_id key)
train = pd.merge(train_df, tube)
train = pd.merge(train, spec)
train = pd.merge(train, comp_type_weight)
train = pd.merge(train, tube_vol)
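
These merges rely on pandas joining on every column the two frames have in common. A more explicit equivalent (assuming tube_assembly_id is the only overlapping column) would be:

    train = (train_df.merge(tube, on='tube_assembly_id')
                     .merge(spec, on='tube_assembly_id')
                     .merge(comp_type_weight, on='tube_assembly_id')
                     .merge(tube_vol, on='tube_assembly_id'))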

In [15]:
train.shape


Out[15]:
(30213, 264)

In [81]:
# Row-level split (leaks information: quotes for the same tube_assembly_id
# at different quantities end up in both train and test):
# train_sub_train_cv, train_sub_test = sk_cv.train_test_split(train, train_size = 0.6, random_state = 234)
# train_sub_train, train_sub_cv = sk_cv.train_test_split(train_sub_train_cv, train_size = 0.7, random_state = 345)
# tube_assembly_id was moved into the index in an earlier run of this cell:
# train.set_index('tube_assembly_id',inplace=True)
# Split on unique tube_assembly_id values instead, so all quotes for an
# assembly land in exactly one of the three subsets.
train_id = train.index.unique()
train_sub_train_cv_id, train_sub_test_id = sk_cv.train_test_split(train_id, train_size = 0.6, random_state = 234)
train_sub_train_id, train_sub_cv_id = sk_cv.train_test_split(train_sub_train_cv_id, train_size = 0.7, random_state = 234)
train_sub_train = train.loc[train_sub_train_id,:]
train_sub_cv = train.loc[train_sub_cv_id,:]
train_sub_test = train.loc[train_sub_test_id,:]
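
Splitting on unique tube_assembly_id values keeps every quote for a given assembly in a single subset, which is what prevents the leakage of the row-level split. A sketch of the same split using scikit-learn's group-aware splitter (from the newer sklearn.model_selection API; this notebook imports the older sklearn.cross_validation):

    # hypothetical alternative using GroupShuffleSplit
    from sklearn.model_selection import GroupShuffleSplit
    gss = GroupShuffleSplit(n_splits=1, train_size=0.6, random_state=234)
    train_cv_idx, test_idx = next(gss.split(train, groups=train.index))
    train_sub_train_cv = train.iloc[train_cv_idx]
    train_sub_test = train.iloc[test_idx]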

In [82]:
print train_sub_train.shape
print train_sub_cv.shape


(12353, 263)
(5485, 263)

In [97]:
params = {}
params["objective"] = "reg:linear"   # squared-error regression objective
# params["eta"] = 0.1
# params['gamma'] = 1
params["min_child_weight"] = 8       # minimum summed instance weight per leaf
# params["subsample"] = 1.0
params["scale_pos_weight"] = 1.0
# params["silent"] = 1
params["max_depth"] = 5              # maximum depth of each tree
plst = list(params.items())          # xgb.train takes the parameters as a list of pairs

In [89]:
# move tube_assembly_id out of the index and back into a column
# so it can be dropped by name in the next cell
train_sub_train.reset_index(inplace=True)
train_sub_cv.reset_index(inplace=True)
train_sub_test.reset_index(inplace=True)

In [90]:
# feature matrices: drop the identifier, date, and target columns
X_train = train_sub_train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_train = train_sub_train.cost
X_cv = train_sub_cv.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_cv = train_sub_cv.cost
X_test = train_sub_test.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_test = train_sub_test.cost

In [98]:
# train on log1p(cost): with a squared-error objective this makes the
# optimized loss match RMSLE; predictions are mapped back with expm1
xgtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
xgcv = xgb.DMatrix(X_cv)
N = [1000]
for i,num_rounds in enumerate(N):
    # num_rounds = 120
    model = xgb.train(plst, xgtrain, num_rounds)
    cv_preds = np.expm1(model.predict(xgcv))
    print N[i],rmsle(y_cv, cv_preds)


1000 0.273624622174
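
The tree parameters set above (min_child_weight, max_depth) can be compared with the same hold-out loop; a minimal grid sketch reusing xgtrain, xgcv, and y_cv from the cells above (hypothetical candidate values, not the search actually run):

    # hypothetical grid over the two tuned tree parameters
    for depth in [4, 5, 6]:
        for mcw in [4, 8, 16]:
            p = dict(params, max_depth=depth, min_child_weight=mcw)
            m = xgb.train(list(p.items()), xgtrain, 500)  # fewer rounds to keep the grid quick
            print depth, mcw, rmsle(y_cv, np.expm1(m.predict(xgcv)))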

In [92]:
test_preds = np.expm1(model.predict(xgb.DMatrix(X_test)))
rmsle(y_test, test_preds)


Out[92]:
0.28002526153588175

In [54]:
# refit on the full training set before scoring the Kaggle test data
X = train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y = train.cost
xgb_mat = xgb.DMatrix(X, label=np.log1p(y))
N = [250]
for i,num_rounds in enumerate(N):
    # num_rounds = 120
    model = xgb.train(plst, xgb_mat, num_rounds)
#     cv_preds = np.expm1(model.predict(xgcv))
#     print N[i],rmsle(y_cv, cv_preds)
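
After the hold-out comparison, the booster is refit on all 30,213 training rows. If it needs to be reused later without retraining, it can be persisted; a minimal sketch (hypothetical filename):

    # save / reload the trained booster
    model.save_model('../xgb_full_model.bin')
    model = xgb.Booster(model_file='../xgb_full_model.bin')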

In [22]:
kaggle_test = pd.read_csv('../test_dummies_adjusted.csv')

In [23]:
# apply the same feature-table merges to the Kaggle test set
kaggle_test_df = pd.merge(kaggle_test, tube)
kaggle_test_df = pd.merge(kaggle_test_df, spec)
kaggle_test_df = pd.merge(kaggle_test_df, comp_type_weight)
kaggle_test_df = pd.merge(kaggle_test_df, tube_vol)
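
Because an inner merge silently drops rows with no match (and can duplicate rows on a non-unique key), a quick check that the test row count is unchanged is worth running; a sketch:

    # the merged test frame should keep exactly one row per original test row
    print kaggle_test.shape, kaggle_test_df.shape
    assert kaggle_test_df.shape[0] == kaggle_test.shape[0]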

In [24]:
# print kaggle_test_df.id

In [25]:
kaggle_test_vals = kaggle_test_df.drop(['tube_assembly_id', 'quote_date', 'cost', 'id'], axis=1).values

In [55]:
kaggle_preds = np.expm1(model.predict(xgb.DMatrix(kaggle_test_vals)))

In [59]:
print kaggle_preds[:16]


[ 21.20670509  11.08872986   6.41876507   4.98798847   3.42142439
   3.0059526    2.57624698   2.40019035  21.81780815  11.01176453
   6.82877827   4.06854391   2.81207585   2.39966464   2.01504898
   1.88880312]

In [28]:
print kaggle_preds.shape


(30235,)

In [57]:
submit = pd.DataFrame({'id' : range(1, 30236), 'cost' : kaggle_preds})
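
Building the id column from range(1, 30236) assumes the merged test frame kept the ids 1..30235 in their original order. A version that does not rely on that assumption takes the ids straight from the frame the predictions were generated from:

    submit = pd.DataFrame({'id': kaggle_test_df.id.values, 'cost': kaggle_preds})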

In [39]:
submit.tail(10)


Out[39]:
            cost     id
30225   7.332325  30226
30226   4.213070  30227
30227   2.585519  30228
30228  14.225010  30229
30229   3.489382  30230
30230   4.558745  30231
30231   2.753475  30232
30232   3.773052  30233
30233  41.871063  30234
30234  54.053925  30235

In [58]:
submit.to_csv('../submit_py_xgboost_full_2.csv', index=False, columns=['id', 'cost'])

In [ ]: