In [7]:
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.decomposition import PCA, FastICA
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [2]:
train = pd.read_csv("data/train_clean.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test_clean.csv", index_col="id", parse_dates=["timestamp"])
macro = pd.read_csv("data/macro.csv", parse_dates=["timestamp"])

In [3]:
y_train = train["price_doc"]
x_train = train.drop(["timestamp", "price_doc"], axis=1)

# transform non-numerical variables
for c in x_train.columns:
    if x_train[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values)) 
        x_train[c] = lbl.transform(list(x_train[c].values))

y_train = y_train.astype(np.float32)
x_train = x_train.astype(np.float32)

In [4]:
x_test = test.drop(["timestamp"], axis=1)

# transform non-numerical variables
for c in x_test.columns:
    if x_test[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values)) 
        x_test[c] = lbl.transform(list(x_test[c].values))
        
x_test = x_test.astype(np.float32)

In [63]:
x_train.shape


Out[63]:
(28747, 289)

In [67]:
# impute with zero first
pca = PCA(n_components=10, random_state=42)
pca_train = pca.fit(x_train.fillna(0))
sum(pca_train.explained_variance_ratio_)


Out[67]:
0.99996293953063586

In [68]:
n_components=30
pca = PCA(n_components=n_components, random_state=42)
pca_train = pca.fit_transform(x_train.fillna(0))
pca_test = pca.transform(x_test.fillna(0))

ica = FastICA(n_components=n_components, random_state=42)
ica_train = ica.fit_transform(x_train.fillna(0))
ica_test = ica.transform(x_test.fillna(0))

In [69]:
pca_train_df = pd.DataFrame(pca_train).add_prefix("pca_")
pca_test_df = pd.DataFrame(pca_test).add_prefix("pca_")
ica_train_df = pd.DataFrame(ica_train).add_prefix("ica_")
ica_test_df = pd.DataFrame(ica_test).add_prefix("ica_")

In [70]:
x_train_full = pd.concat([pca_train_df, ica_train_df], axis=1)
x_test_full = pd.concat([pca_test_df, ica_test_df], axis=1)

In [71]:
xgb_params = {
    "eta": 0.005,
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "objective": "reg:linear",
    "eval_metric": "rmse",
    "silent": 1,
    "seed":42
}

In [72]:
dtrain = xgb.DMatrix(x_train_full, y_train)

In [74]:
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=3000, early_stopping_rounds=20, 
                   verbose_eval=50, show_stdv=False)


[0]	train-rmse:8.43944e+06	test-rmse:8.43961e+06
[50]	train-rmse:7.05076e+06	test-rmse:7.07859e+06
[100]	train-rmse:6.03163e+06	test-rmse:6.09492e+06
[150]	train-rmse:5.29856e+06	test-rmse:5.40092e+06
[200]	train-rmse:4.77743e+06	test-rmse:4.91985e+06
[250]	train-rmse:4.41151e+06	test-rmse:4.59642e+06
[300]	train-rmse:4.15597e+06	test-rmse:4.3796e+06
[350]	train-rmse:3.97541e+06	test-rmse:4.23603e+06
[400]	train-rmse:3.8455e+06	test-rmse:4.14082e+06
[450]	train-rmse:3.75177e+06	test-rmse:4.07708e+06
[500]	train-rmse:3.68128e+06	test-rmse:4.03345e+06
[550]	train-rmse:3.62671e+06	test-rmse:4.00329e+06
[600]	train-rmse:3.58193e+06	test-rmse:3.98101e+06
[650]	train-rmse:3.54349e+06	test-rmse:3.96329e+06
[700]	train-rmse:3.51232e+06	test-rmse:3.95129e+06
[750]	train-rmse:3.4845e+06	test-rmse:3.94127e+06
[800]	train-rmse:3.45969e+06	test-rmse:3.93221e+06
[850]	train-rmse:3.437e+06	test-rmse:3.92526e+06
[900]	train-rmse:3.41615e+06	test-rmse:3.91864e+06
[950]	train-rmse:3.39663e+06	test-rmse:3.91275e+06
[1000]	train-rmse:3.37789e+06	test-rmse:3.90732e+06
[1050]	train-rmse:3.35906e+06	test-rmse:3.9025e+06
[1100]	train-rmse:3.34334e+06	test-rmse:3.89893e+06
[1150]	train-rmse:3.32809e+06	test-rmse:3.89549e+06
[1200]	train-rmse:3.31324e+06	test-rmse:3.89203e+06
[1250]	train-rmse:3.29911e+06	test-rmse:3.88837e+06
[1300]	train-rmse:3.28529e+06	test-rmse:3.88504e+06
[1350]	train-rmse:3.27134e+06	test-rmse:3.88186e+06
[1400]	train-rmse:3.25875e+06	test-rmse:3.87928e+06
[1450]	train-rmse:3.24619e+06	test-rmse:3.87638e+06
[1500]	train-rmse:3.23316e+06	test-rmse:3.87391e+06
[1550]	train-rmse:3.22042e+06	test-rmse:3.87121e+06
[1600]	train-rmse:3.20779e+06	test-rmse:3.86828e+06
[1650]	train-rmse:3.19563e+06	test-rmse:3.86584e+06
[1700]	train-rmse:3.1839e+06	test-rmse:3.86335e+06
[1750]	train-rmse:3.17217e+06	test-rmse:3.86119e+06
[1800]	train-rmse:3.15996e+06	test-rmse:3.85871e+06
[1850]	train-rmse:3.14878e+06	test-rmse:3.85682e+06
[1900]	train-rmse:3.13857e+06	test-rmse:3.85516e+06
[1950]	train-rmse:3.12748e+06	test-rmse:3.8529e+06
[2000]	train-rmse:3.11706e+06	test-rmse:3.85087e+06
[2050]	train-rmse:3.10697e+06	test-rmse:3.84931e+06
[2100]	train-rmse:3.09667e+06	test-rmse:3.84769e+06
[2150]	train-rmse:3.08651e+06	test-rmse:3.8456e+06
[2200]	train-rmse:3.0766e+06	test-rmse:3.8439e+06
[2250]	train-rmse:3.06626e+06	test-rmse:3.84238e+06
[2300]	train-rmse:3.05606e+06	test-rmse:3.84054e+06
[2350]	train-rmse:3.0467e+06	test-rmse:3.83902e+06
[2400]	train-rmse:3.037e+06	test-rmse:3.83742e+06
[2450]	train-rmse:3.02734e+06	test-rmse:3.83595e+06
[2500]	train-rmse:3.01796e+06	test-rmse:3.83467e+06
[2550]	train-rmse:3.00873e+06	test-rmse:3.83305e+06
[2600]	train-rmse:2.99907e+06	test-rmse:3.83154e+06
[2650]	train-rmse:2.98979e+06	test-rmse:3.82977e+06
[2700]	train-rmse:2.98078e+06	test-rmse:3.82839e+06
[2750]	train-rmse:2.9719e+06	test-rmse:3.82703e+06
[2800]	train-rmse:2.96342e+06	test-rmse:3.8258e+06
[2850]	train-rmse:2.95437e+06	test-rmse:3.82438e+06
[2900]	train-rmse:2.94562e+06	test-rmse:3.8228e+06
[2950]	train-rmse:2.93713e+06	test-rmse:3.82164e+06
[2999]	train-rmse:2.92863e+06	test-rmse:3.82051e+06

In [75]:
cv_output[["train-rmse-mean", "test-rmse-mean"]].plot()
pass



In [76]:
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [77]:
xgb.plot_importance(model, max_num_features=20)


Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x1170ae518>

In [78]:
pred = model.predict(dtrain)

In [79]:
error = np.mean(abs(pred - y_train)) / len(y_train)

In [80]:
n = 1 / len(y_train)
rmsle = np.sqrt(1/n * np.sum(np.power(np.log(pred + 1) - np.log(y_train.values + 1), 2)))

In [81]:
print("RMSLE: {rmsle}, Error: {error}".format(rmsle=rmsle, error=error))


RMSLE: 15212.399936053915, Error: 70.67757939958952

In [82]:
dtest = xgb.DMatrix(x_test_full)

In [83]:
y_predict = model.predict(dtest)

In [84]:
output = pd.DataFrame({"id": x_test.index, "price_doc": y_predict})

In [94]:
output.to_csv("submissions_decomposition_{}.csv".format(datetime.datetime.today()), index=False)

Correction


In [85]:
# train prices are round values for investment type
# idea: take price_doc that are close to the prices in train and take this value
train[["price_doc", "product_type"]][train.product_type == "Investment"].price_doc.value_counts(dropna=False).head(10)


Out[85]:
2000000    750
1000000    743
6000000    369
3000000    329
6500000    327
7000000    319
5500000    309
6300000    292
5000000    292
7500000    275
Name: price_doc, dtype: int64

In [86]:
investment_prices = train[["price_doc", "product_type"]][train.product_type == "Investment"].price_doc.values
unique_investment_prices = set(investment_prices)

In [87]:
output_cor = pd.DataFrame({"id": test.index, "product_type": test.product_type, "price_doc": y_predict})

In [88]:
def correct_investment_price(price):
    return min(unique_investment_prices, key=lambda x: abs(x - price))

In [89]:
output_cor["price_doc_cor"] = output_cor[output_cor.product_type == "Investment"].apply(lambda x: correct_investment_price(x.price_doc), axis=1)

In [90]:
def take_price_doc_corr(x):
    if x.product_type == "Investment":
        return x.price_doc_cor
    else:
        return x.price_doc

In [91]:
output_final = output_cor.apply(lambda x: take_price_doc_corr(x), axis=1).reset_index(name="price_doc")

In [92]:
output_final.head()


Out[92]:
id price_doc
0 30474 7170000.0
1 30475 5700844.0
2 30476 6070000.0
3 30477 5322445.0
4 30478 6338101.5

In [93]:
output_final.to_csv("submissions_corrected_{}.csv".format(datetime.datetime.today()), index=False)

In [ ]: