In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
import matplotlib.pyplot as plt
# Apply the "ggplot" style sheet to the figures drawn by this notebook.
plt.style.use("ggplot")
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
In [2]:
# Read the three CSV inputs. The "timestamp" column of each file is parsed as
# a datetime; train and test are indexed by "id", macro keeps a default index.
train = pd.read_csv(
    "data/train.csv",
    index_col="id",
    parse_dates=["timestamp"],
)
test = pd.read_csv(
    "data/test.csv",
    index_col="id",
    parse_dates=["timestamp"],
)
macro = pd.read_csv(
    "data/macro.csv",
    parse_dates=["timestamp"],
)
In [3]:
# Preview the first rows of the training frame.
train.head()
Out[3]:
In [4]:
# Preview the first rows of the test frame.
test.head()
Out[4]:
In [5]:
# Preview the first rows of the macro frame.
macro.head()
Out[5]:
In [6]:
# Regression target: the "price_doc" column of the training frame.
y_train = train["price_doc"]
# Training features: every column except the target and the raw timestamp.
x_train = train.drop(columns=["timestamp", "price_doc"])
In [7]:
# Transform non-numerical variables: each object-dtype column of x_train is
# overwritten with the integer codes of a LabelEncoder fitted on that column.
for c in x_train.columns:
    if x_train[c].dtype != "object":
        continue  # leave non-object columns untouched
    lbl = preprocessing.LabelEncoder()
    x_train[c] = lbl.fit_transform(list(x_train[c].values))
In [8]:
# Test features: the test frame without its raw timestamp column.
x_test = test.drop(columns=["timestamp"])
In [9]:
# Transform non-numerical variables of the test set.
#
# The encoder is fitted on the raw *training* column. `train` still holds the
# original values (only the derived `x_train` frame was overwritten with
# codes), so every category receives the same integer code in x_test as it did
# in x_train. Fitting a separate encoder on the test values alone numbers the
# categories independently and silently misaligns the two sets whenever their
# category sets differ.
for c in x_test.columns:
    if x_test[c].dtype != "object":
        continue  # leave non-object columns untouched
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[c].values))
    # classes_ is ordered by code, so position == the code transform() assigns.
    codes = {label: code for code, label in enumerate(lbl.classes_)}
    # Categories absent from the training column have no code and become NaN
    # instead of raising; NOTE(review): XGBoost's DMatrix is expected to treat
    # NaN as a missing value — confirm for the installed version.
    x_test[c] = x_test[c].astype(str).map(codes)
In [10]:
# Booster settings shared by the cross-validation run and the final fit
# (the final fit overrides "silent" with 0).
# NOTE(review): newer XGBoost releases appear to rename "reg:linear" to
# "reg:squarederror" and replace "silent" with "verbosity" — confirm against
# the installed version before changing.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective="reg:linear",
    eval_metric="rmse",
    silent=1,
    seed=42,
)
In [11]:
# Wrap the training features and target in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
In [12]:
# Cross-validate for up to 1000 boosting rounds, with early stopping set to
# 20 rounds and progress reported every 50 rounds.
cv_output = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)
In [13]:
# Plot the train and test RMSE-mean columns of the CV table; the trailing
# semicolon keeps the Axes repr out of the cell output.
cv_output[["train-rmse-mean", "test-rmse-mean"]].plot();
In [14]:
# Boost for as many rounds as the CV table has rows (presumably the round
# count kept by early stopping — confirm), with "silent" switched to 0.
num_boost_rounds = cv_output.shape[0]
model = xgb.train(
    params={**xgb_params, "silent": 0},
    dtrain=dtrain,
    num_boost_round=num_boost_rounds,
)
In [15]:
# Plot feature importances of the trained model, limited to 20 features.
xgb.plot_importance(model, max_num_features=20)
Out[15]:
In [16]:
# Predict on the training matrix itself, so any metric computed from `pred`
# and y_train is an in-sample figure, not a held-out estimate.
pred = model.predict(dtrain)
In [17]:
# Mean absolute error of the in-sample predictions. np.mean already divides
# by the number of rows, so no further division by len(y_train) is applied
# (doing so would report MAE / n rather than MAE).
error = np.mean(np.abs(pred - y_train))
In [18]:
# Root mean squared logarithmic error of the in-sample predictions:
#   sqrt( (1 / n) * sum( (log(pred + 1) - log(actual + 1)) ** 2 ) )
# n is the number of rows, and the squared-error sum is divided by it.
n = len(y_train)
rmsle = np.sqrt(np.sum(np.square(np.log1p(pred) - np.log1p(y_train.values))) / n)
In [19]:
# Report both in-sample metrics on a single line.
print(
    "RMSLE: {rmsle}, Error: {error}".format(
        rmsle=rmsle,
        error=error,
    )
)
In [20]:
# Wrap the test features in a DMatrix (no label is supplied).
dtest = xgb.DMatrix(data=x_test)
In [21]:
# Predict on the test matrix with the trained model.
y_predict = model.predict(dtest)
In [22]:
# Two-column result frame: the test index as "id" and the predictions as
# "price_doc".
output = pd.DataFrame(
    {
        "id": x_test.index,
        "price_doc": y_predict,
    }
)
In [23]:
# Write the result frame to CSV without the positional index column.
output.to_csv(path_or_buf="submissions.csv", index=False)
In [ ]:
In [ ]:
In [ ]:
In [ ]: