In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [2]:
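# read the train/test sets with id as the index and parse the timestamp column;
# macro.csv holds date-keyed macro-economic indicators (oil price, GDP, CPI, usd/rub, ...)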
train = pd.read_csv("data/train.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", index_col="id", parse_dates=["timestamp"])
macro = pd.read_csv("data/macro.csv", parse_dates=["timestamp"])

In [3]:
train.head()


Out[3]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
id
1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN ... 9 4 0 13 22 1 0 52 4 5850000
2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN ... 15 3 0 15 29 1 10 66 14 6000000
3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN ... 10 3 0 11 27 0 4 67 10 5700000
4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN ... 11 2 1 4 4 0 0 26 3 13100000
5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN ... 319 108 17 135 236 2 91 195 14 16331452

5 rows × 291 columns


In [4]:
test.head()


Out[4]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000
id
30474 2015-07-01 39.0 20.7 2 9 1 1998.0 1 8.9 3.0 ... 8 0 0 0 1 10 1 0 14 1
30475 2015-07-01 79.2 NaN 8 17 1 0.0 3 1.0 1.0 ... 4 1 1 0 2 11 0 1 12 1
30476 2015-07-01 40.5 25.1 3 5 2 1960.0 2 4.8 2.0 ... 42 11 4 0 10 21 0 10 71 11
30477 2015-07-01 62.8 36.0 17 17 1 2016.0 2 62.8 3.0 ... 1 1 2 0 0 10 0 0 2 0
30478 2015-07-01 40.0 40.0 17 17 1 0.0 1 1.0 1.0 ... 5 1 1 0 2 12 0 1 11 1

5 rows × 290 columns


In [5]:
macro.head()


Out[5]:
timestamp oil_urals gdp_quart gdp_quart_growth cpi ppi gdp_deflator balance_trade balance_trade_growth usdrub ... provision_retail_space_modern_sqm turnover_catering_per_cap theaters_viewers_per_1000_cap seats_theather_rfmin_per_100000_cap museum_visitis_per_100_cap bandwidth_sports population_reg_sports_share students_reg_sports_share apartment_build apartment_fund_sqm
0 2010-01-01 76.1 NaN NaN NaN NaN NaN NaN NaN NaN ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
1 2010-01-02 76.1 NaN NaN NaN NaN NaN NaN NaN NaN ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
2 2010-01-03 76.1 NaN NaN NaN NaN NaN NaN NaN NaN ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
3 2010-01-04 76.1 NaN NaN NaN NaN NaN NaN NaN 29.905 ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
4 2010-01-05 76.1 NaN NaN NaN NaN NaN NaN NaN 29.836 ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN

5 rows × 100 columns


In [6]:
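# separate the target (price_doc) from the features and drop the raw timestamp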
y_train = train["price_doc"]
x_train = train.drop(["timestamp", "price_doc"], axis=1)

In [7]:
# encode non-numerical (categorical) columns as integer labels
# (note: ideally the same fitted encoders should be reused for the test set below
# so that identical categories map to identical codes)
for c in x_train.columns:
    if x_train[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values)) 
        x_train[c] = lbl.transform(list(x_train[c].values))

In [8]:
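# build the test feature matrix the same way (drop the raw timestamp)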
x_test = test.drop(["timestamp"], axis=1)

In [9]:
# encode non-numerical (categorical) columns in the test set the same way
for c in x_test.columns:
    if x_test[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values)) 
        x_test[c] = lbl.transform(list(x_test[c].values))

In [10]:
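# XGBoost parameters: small learning rate (eta), moderately deep trees, and
# row/column subsampling to curb overfitting; RMSE is tracked during training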
xgb_params = {
    "eta": 0.05,
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "objective": "reg:linear",
    "eval_metric": "rmse",
    "silent": 1,
    "seed":42
}

In [11]:
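# wrap the training features and target in XGBoost's DMatrix container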
dtrain = xgb.DMatrix(x_train, y_train)

In [12]:
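# cross-validate the boosting rounds (3 folds by default) and stop once the
# held-out RMSE has not improved for 20 rounds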
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20, 
                   verbose_eval=50, show_stdv=False)


[0]	train-rmse:8.20552e+06	test-rmse:8.21233e+06
[50]	train-rmse:2.53422e+06	test-rmse:2.90077e+06
[100]	train-rmse:2.19598e+06	test-rmse:2.71451e+06
[150]	train-rmse:2.07905e+06	test-rmse:2.67354e+06
[200]	train-rmse:1.9912e+06	test-rmse:2.65576e+06
[250]	train-rmse:1.92183e+06	test-rmse:2.64652e+06
[300]	train-rmse:1.85985e+06	test-rmse:2.64064e+06
[350]	train-rmse:1.80693e+06	test-rmse:2.63987e+06

In [13]:
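# plot the mean train and test RMSE per boosting round; the trailing "pass"
# just suppresses the axes object in the cell output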
cv_output[["train-rmse-mean", "test-rmse-mean"]].plot()
pass



In [14]:
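# train the final model on the full training set for the number of rounds kept
# by cross-validation; silent=0 re-enables per-round logging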
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [15]:
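# plot the 20 most important features (ranked by how often each one is used to split)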
xgb.plot_importance(model, max_num_features=20)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x106038a20>

In [16]:
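# predict on the training data to gauge the in-sample fit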
pred = model.predict(dtrain)

In [17]:
# mean absolute error of the in-sample predictions
error = np.mean(np.abs(pred - y_train))

In [18]:
# root mean squared log error of the in-sample predictions;
# negative predictions are clipped at 0 before taking the log
n = len(y_train)
rmsle = np.sqrt(np.sum(np.square(np.log1p(np.maximum(pred, 0)) - np.log1p(y_train.values))) / n)



In [19]:
print("RMSLE: {rmsle}, Error: {error}".format(rmsle=rmsle, error=error))



In [20]:
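# wrap the test features in a DMatrix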
dtest = xgb.DMatrix(x_test)

In [21]:
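# predict prices for the test set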
y_predict = model.predict(dtest)

In [22]:
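# assemble the submission: test ids alongside the predicted prices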
output = pd.DataFrame({"id": x_test.index, "price_doc": y_predict})

In [23]:
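# write the submission file; index=False drops the default integer index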
output.to_csv("submissions.csv", index=False)
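
As an optional sanity check (not part of the original run), the submission can be verified before uploading; this short sketch assumes the output and test frames from the cells above:

# optional check: one non-missing prediction per test id
assert len(output) == len(test)
assert output["price_doc"].notnull().all()
output.describe()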
