In [2]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb
In [3]:
def rmsle(actual, predicted):
    """Root Mean Squared Logarithmic Error.

    Both inputs are array-likes of non-negative values; log1p keeps a
    cost of 0 well-defined.
    """
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt((log_diff ** 2).mean())
In [4]:
train_df = pd.read_csv('../train_set_adjusted.csv')
In [5]:
# Sanity check on size and schema of the raw training quotes.
# Parenthesized single-argument prints are output-identical on Python 2 and
# valid on Python 3 (the bare `print x` statement is Py2-only).
print(train_df.shape)
print(train_df.columns)
In [6]:
tube = pd.read_csv('../tube_material_id_imputed_dummies_drop_ns.csv')
In [7]:
# Sanity check on the tube feature table.
# Parenthesized single-argument prints are Py2/Py3 compatible with identical output.
print(tube.shape)
print(tube.columns)
In [8]:
spec = pd.read_csv('../spec_dummies.csv')
In [9]:
# Sanity check on the spec feature table.
# Parenthesized single-argument prints are Py2/Py3 compatible with identical output.
print(spec.shape)
print(spec.columns)
In [10]:
comp = pd.read_csv('../comp_type_dummies.csv')
In [11]:
# Sanity check on the component-type table.
# Parenthesized single-argument prints are Py2/Py3 compatible with identical output.
print(comp.shape)
print(comp.columns)
In [12]:
comp_type_weight = pd.read_csv('../comp_type_weight.csv')
In [13]:
# Sanity check on the component-weight table.
# Parenthesized single-argument prints are Py2/Py3 compatible with identical output.
print(comp_type_weight.shape)
print(comp_type_weight.columns)
In [14]:
tube_vol = pd.read_csv('../tube_volume.csv')
In [15]:
# Join the feature tables onto the quotes. `pd.merge` with no `on=` merges on
# ALL shared column names (presumably the assembly id — TODO confirm the key),
# and the default inner join silently drops any quote missing from a table —
# check the shape below against train_df.shape.
train = pd.merge(train_df, tube)
train = pd.merge(train, spec)
train = pd.merge(train, comp_type_weight)
train = pd.merge(train, tube_vol)
In [30]:
train.shape
Out[30]:
In [31]:
# Positional split: the first 29000 rows feed a 70/30 train/CV split, the rest
# form a holdout test set. The original used label-based `.ix[:29000]` /
# `.ix[29000:]`, which (a) was removed from modern pandas and (b) is inclusive
# on both ends, so row 29000 leaked into BOTH the split pool and the holdout.
# `.iloc` slicing is half-open, making the two pieces disjoint.
train_sub_train, train_sub_cv = sk_cv.train_test_split(train.iloc[:29000], train_size = 0.7, random_state = 0)
train_sub_test = train.iloc[29000:]
In [32]:
# Confirm the 70/30 split sizes.
# Parenthesized single-argument prints are Py2/Py3 compatible with identical output.
print(train_sub_train.shape)
print(train_sub_cv.shape)
In [33]:
# Feature matrix: drop the identifier, date, and target columns; keep raw values.
X = train_sub_train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
# Target: raw cost (NOT log-transformed here, unlike the xgboost run below,
# even though the evaluation metric is RMSLE).
Y = train_sub_train.cost
# , random_state=0, verbose=0
# NOTE(review): no random_state on the forest — refits are not reproducible.
rf = sk_ensemble.RandomForestRegressor(n_estimators=400, n_jobs=4, oob_score=True)
rf = rf.fit(X, Y)
In [34]:
# Build CV and holdout matrices with the same column drops as the training set.
X_cv = train_sub_cv.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_cv = train_sub_cv.cost
X_test = train_sub_test.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_test = train_sub_test.cost
# Random-forest predictions on the CV split (raw cost scale).
y_cv_fitted = rf.predict(X_cv)
In [35]:
rmsle(y_cv, y_cv_fitted)
Out[35]:
In [41]:
# xgboost configuration: linear-objective boosting with a conservative
# learning rate. The commented-out entries are tuning knobs that were tried
# and shelved.
params = {
    "objective": "reg:linear",
    "eta": 0.1,
}
# params["min_child_weight"] = 5
# params["subsample"] = 1.0
# params["scale_pos_weight"] = 1.0
# params["silent"] = 1
# params["max_depth"] = 7
plst = list(params.items())
In [43]:
# Train on log1p(cost) so that minimizing RMSE on the transformed target
# matches the RMSLE evaluation metric.
xgtrain = xgb.DMatrix(X, label=np.log1p(Y))
xgtest = xgb.DMatrix(X_cv)
# Sweep boosting-round counts; each iteration retrains from scratch (no
# early stopping / incremental training), so this cell is expensive.
N = [1500,2000,2500]
for i,num_rounds in enumerate(N):
    # num_rounds = 120
    model = xgb.train(plst, xgtrain, num_rounds)
    # Invert the log1p transform before scoring on the raw-cost CV labels.
    preds = np.expm1(model.predict(xgtest))
    print N[i],rmsle(y_cv, preds)
# NOTE(review): `model` leaks out of the loop as the num_rounds=2500 fit; the
# holdout evaluation in the next cell relies on that implicitly.
In [44]:
# Holdout evaluation using the last model from the sweep above (the
# num_rounds=2500 fit), inverting the log1p target transform first.
# Parenthesized single-argument prints are Py2/Py3 compatible with identical output.
test_pred = np.expm1(model.predict(xgb.DMatrix(X_test)))
print(test_pred[:5])
print(y_test[:5])
rmsle(y_test, test_pred)
Out[44]:
In [ ]: