In [2]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb


Populating the interactive namespace from numpy and matplotlib

In [3]:
def rmsle(actual, predicted):
    error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(error)))

In [4]:
train_df = pd.read_csv('../train_set_adjusted.csv')

In [5]:
print train_df.shape
print train_df.columns


(30213, 66)
Index([u'tube_assembly_id', u'quote_date', u'annual_usage',
       u'min_order_quantity', u'quantity', u'cost', u'year', u'quantity_rep',
       u'supplier_S-0003', u'supplier_S-0004', u'supplier_S-0005',
       u'supplier_S-0006', u'supplier_S-0007', u'supplier_S-0008',
       u'supplier_S-0009', u'supplier_S-0011', u'supplier_S-0012',
       u'supplier_S-0013', u'supplier_S-0014', u'supplier_S-0015',
       u'supplier_S-0018', u'supplier_S-0022', u'supplier_S-0023',
       u'supplier_S-0024', u'supplier_S-0025', u'supplier_S-0026',
       u'supplier_S-0027', u'supplier_S-0029', u'supplier_S-0030',
       u'supplier_S-0031', u'supplier_S-0041', u'supplier_S-0042',
       u'supplier_S-0043', u'supplier_S-0046', u'supplier_S-0050',
       u'supplier_S-0051', u'supplier_S-0054', u'supplier_S-0056',
       u'supplier_S-0058', u'supplier_S-0059', u'supplier_S-0060',
       u'supplier_S-0061', u'supplier_S-0062', u'supplier_S-0064',
       u'supplier_S-0066', u'supplier_S-0068', u'supplier_S-0070',
       u'supplier_S-0072', u'supplier_S-0074', u'supplier_S-0078',
       u'supplier_S-0080', u'supplier_S-0081', u'supplier_S-0087',
       u'supplier_S-0090', u'supplier_S-0092', u'supplier_S-0095',
       u'supplier_S-0096', u'supplier_S-0097', u'supplier_S-0104',
       u'supplier_S-0105', u'supplier_S-0106', u'supplier_S-0107',
       u'supplier_S-0108', u'supplier_S-0109', u'supplier_S-0111',
       u'bracket_pricing_Yes'],
      dtype='object')

In [6]:
tube = pd.read_csv('../tube_material_id_imputed_dummies_drop_ns.csv')

In [7]:
print tube.shape
print tube.columns


(21198, 84)
Index([u'tube_assembly_id', u'diameter', u'wall', u'length', u'num_bends',
       u'bend_radius', u'num_boss', u'num_bracket', u'other',
       u'material_id_SP-0008', u'material_id_SP-0019', u'material_id_SP-0028',
       u'material_id_SP-0029', u'material_id_SP-0030', u'material_id_SP-0031',
       u'material_id_SP-0032', u'material_id_SP-0033', u'material_id_SP-0034',
       u'material_id_SP-0035', u'material_id_SP-0036', u'material_id_SP-0037',
       u'material_id_SP-0038', u'material_id_SP-0039', u'material_id_SP-0041',
       u'material_id_SP-0044', u'material_id_SP-0045', u'material_id_SP-0046',
       u'material_id_SP-0048', u'end_a_1x_Y', u'end_a_2x_Y', u'end_x_1x_Y',
       u'end_x_2x_Y', u'end_a_EF-001', u'end_a_EF-002', u'end_a_EF-003',
       u'end_a_EF-004', u'end_a_EF-005', u'end_a_EF-006', u'end_a_EF-007',
       u'end_a_EF-008', u'end_a_EF-009', u'end_a_EF-010', u'end_a_EF-011',
       u'end_a_EF-012', u'end_a_EF-013', u'end_a_EF-014', u'end_a_EF-015',
       u'end_a_EF-016', u'end_a_EF-017', u'end_a_EF-018', u'end_a_EF-019',
       u'end_a_EF-020', u'end_a_EF-021', u'end_a_EF-022', u'end_a_EF-023',
       u'end_a_EF-025', u'end_a_NONE', u'end_x_9999', u'end_x_EF-001',
       u'end_x_EF-002', u'end_x_EF-003', u'end_x_EF-004', u'end_x_EF-005',
       u'end_x_EF-006', u'end_x_EF-007', u'end_x_EF-008', u'end_x_EF-009',
       u'end_x_EF-010', u'end_x_EF-011', u'end_x_EF-012', u'end_x_EF-013',
       u'end_x_EF-014', u'end_x_EF-015', u'end_x_EF-016', u'end_x_EF-017',
       u'end_x_EF-018', u'end_x_EF-019', u'end_x_EF-021', u'end_x_EF-022',
       u'end_x_EF-023', u'end_x_EF-024', u'end_x_EF-025', u'end_x_EF-026',
       u'end_x_NONE'],
      dtype='object')

In [8]:
spec = pd.read_csv('../spec_dummies.csv')

In [9]:
print spec.shape
print spec.columns


(21198, 86)
Index([u'tube_assembly_id', u'SP-0001', u'SP-0002', u'SP-0003', u'SP-0004',
       u'SP-0005', u'SP-0006', u'SP-0007', u'SP-0009', u'SP-0010', u'SP-0011',
       u'SP-0012', u'SP-0013', u'SP-0014', u'SP-0015', u'SP-0016', u'SP-0017',
       u'SP-0018', u'SP-0019', u'SP-0020', u'SP-0021', u'SP-0022', u'SP-0023',
       u'SP-0024', u'SP-0025', u'SP-0026', u'SP-0027', u'SP-0028', u'SP-0029',
       u'SP-0030', u'SP-0033', u'SP-0035', u'SP-0036', u'SP-0037', u'SP-0038',
       u'SP-0039', u'SP-0040', u'SP-0042', u'SP-0043', u'SP-0044', u'SP-0046',
       u'SP-0047', u'SP-0049', u'SP-0050', u'SP-0051', u'SP-0052', u'SP-0053',
       u'SP-0054', u'SP-0055', u'SP-0056', u'SP-0057', u'SP-0058', u'SP-0059',
       u'SP-0060', u'SP-0061', u'SP-0062', u'SP-0063', u'SP-0064', u'SP-0065',
       u'SP-0066', u'SP-0067', u'SP-0068', u'SP-0069', u'SP-0070', u'SP-0071',
       u'SP-0072', u'SP-0073', u'SP-0074', u'SP-0075', u'SP-0076', u'SP-0077',
       u'SP-0078', u'SP-0079', u'SP-0080', u'SP-0081', u'SP-0082', u'SP-0083',
       u'SP-0084', u'SP-0085', u'SP-0086', u'SP-0087', u'SP-0088', u'SP-0091',
       u'SP-0092', u'SP-0094', u'SP-0096'],
      dtype='object')

In [10]:
comp = pd.read_csv('../comp_type_dummies.csv')

In [11]:
print comp.shape
print comp.columns


(21198, 30)
Index([u'tube_assembly_id', u'CP-001', u'CP-002', u'CP-003', u'CP-004',
       u'CP-005', u'CP-006', u'CP-007', u'CP-008', u'CP-009', u'CP-010',
       u'CP-011', u'CP-012', u'CP-014', u'CP-015', u'CP-016', u'CP-017',
       u'CP-018', u'CP-019', u'CP-020', u'CP-021', u'CP-022', u'CP-023',
       u'CP-024', u'CP-025', u'CP-026', u'CP-027', u'CP-028', u'CP-029',
       u'OTHER'],
      dtype='object')

In [12]:
comp_type_weight = pd.read_csv('../comp_type_weight.csv')

In [13]:
print comp_type_weight.shape
print comp_type_weight.columns


(21198, 30)
Index([u'tube_assembly_id', u'CP-001', u'CP-002', u'CP-003', u'CP-004',
       u'CP-005', u'CP-006', u'CP-007', u'CP-008', u'CP-009', u'CP-010',
       u'CP-011', u'CP-012', u'CP-014', u'CP-015', u'CP-016', u'CP-017',
       u'CP-018', u'CP-019', u'CP-020', u'CP-021', u'CP-022', u'CP-023',
       u'CP-024', u'CP-025', u'CP-026', u'CP-027', u'CP-028', u'CP-029',
       u'OTHER'],
      dtype='object')

In [14]:
tube_vol = pd.read_csv('../tube_volume.csv')

In [15]:
train = pd.merge(train_df, tube)
train = pd.merge(train, spec)
train = pd.merge(train, comp_type_weight)
train = pd.merge(train, tube_vol)

In [30]:
train.shape


Out[30]:
(30213, 264)

In [31]:
train_sub_train, train_sub_cv = sk_cv.train_test_split(train.ix[:29000], train_size = 0.7, random_state = 0)
train_sub_test = train.ix[29000:]

In [32]:
print train_sub_train.shape
print train_sub_cv.shape


(20300, 264)
(8701, 264)

In [33]:
X = train_sub_train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
Y = train_sub_train.cost
# , random_state=0, verbose=0
rf = sk_ensemble.RandomForestRegressor(n_estimators=400, n_jobs=4, oob_score=True)
rf = rf.fit(X, Y)

In [34]:
X_cv = train_sub_cv.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_cv = train_sub_cv.cost
X_test = train_sub_test.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_test = train_sub_test.cost
y_cv_fitted = rf.predict(X_cv)

In [35]:
rmsle(y_cv, y_cv_fitted)


Out[35]:
0.25562184567559815

In [41]:
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.1
# params["min_child_weight"] = 5
# params["subsample"] = 1.0
# params["scale_pos_weight"] = 1.0
# params["silent"] = 1
# params["max_depth"] = 7
plst = list(params.items())


1500 0.190494844002
2000 0.187726062196
2500 0.186332167469

In [43]:
xgtrain = xgb.DMatrix(X, label=np.log1p(Y))
xgtest = xgb.DMatrix(X_cv)
N = [1500,2000,2500]
for i,num_rounds in enumerate(N):
    # num_rounds = 120
    model = xgb.train(plst, xgtrain, num_rounds)
    preds = np.expm1(model.predict(xgtest))
    print N[i],rmsle(y_cv, preds)


1500 0.190494844002
2000 0.187726062196
2500 0.186332167469

In [44]:
test_pred = np.expm1(model.predict(xgb.DMatrix(X_test)))
print test_pred[:5]
print y_test[:5]
rmsle(y_test, test_pred)


[ 5.98402452  4.04146385  2.90155935  2.58758712  2.43880701]
29000    5.978645
29001    4.064588
29002    2.918380
29003    2.601225
29004    2.459340
Name: cost, dtype: float64
Out[44]:
0.21605522456327003

In [ ]: