In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline
%load_ext ipycache
import pandas as pd
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300
import lightgbm as lgb
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb
from eli5 import show_weights
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
In [2]:
# Helper functions
import math
from sklearn.metrics import make_scorer
# A function to compute Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [
        (math.log(p + 1) - math.log(t + 1)) ** 2.0
        for t, p in zip(y, y_pred)
    ]
    return (sum(terms_to_sum) / float(len(y))) ** 0.5
def rmse(y, y_pred):
return np.sqrt(((y_pred - y) ** 2).mean())
def feat_imp(model):
return pd.DataFrame(
model.get_fscore().items(),
columns=['feature','importance']
).sort_values('importance', ascending=False)
def unlog(y):
return np.expm1(y)
rmse_scoring = make_scorer(rmse, greater_is_better=False)
rmsle_scoring = make_scorer(rmsle, greater_is_better=False)
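Since align_to_lb_score below log1p-transforms the target, RMSE on the transformed target equals RMSLE on the raw prices. A quick sanity check of that equivalence (illustrative, made-up prices):
In [ ]:
# illustrative: RMSE on log1p prices coincides with RMSLE on raw prices
y_true = np.array([1e6, 2e6, 3e6])
y_hat = np.array([1.1e6, 1.8e6, 3.3e6])
print rmsle(y_true, y_hat)                     # RMSLE on raw prices
print rmse(np.log1p(y_true), np.log1p(y_hat))  # same value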
In [3]:
def align_to_lb_score(df):
# https://www.kaggle.com/c/sberbank-russian-housing-market/discussion/32717
df = df.copy()
trainsub = df[df.timestamp < '2015-01-01']
trainsub = trainsub[trainsub.product_type=="Investment"]
ind_1m = trainsub[trainsub.price_doc <= 1000000].index
ind_2m = trainsub[trainsub.price_doc == 2000000].index
ind_3m = trainsub[trainsub.price_doc == 3000000].index
train_index = set(df.index.copy())
    for ind, gap in zip([ind_1m, ind_2m, ind_3m], [10, 3, 2]):
        # keep every gap-th of the suspicious rows, drop the rest
        ind_set_cut = set(ind).difference(set(ind[::gap]))
        train_index = train_index.difference(ind_set_cut)
df = df.loc[train_index]
df["price_doc"] = np.log1p(df["price_doc"].values)
return df
def preprocess_anomaly(df):
    # drop rows with missing values in key fields from the training set; in test these fields are always filled
df = df.dropna(subset=["preschool_education_centers_raion", "num_room",
"max_floor", "material", "kitch_sq", "floor"])
df["product_type"].fillna("Investment", inplace=True)
df["full_sq"] = map(lambda x: x if x > 10 else float("NaN"), df["full_sq"])
df["life_sq"] = map(lambda x: x if x > 5 else float("NaN"), df["life_sq"])
df["kitch_sq"] = map(lambda x: x if x > 2 else float("NaN"), df["kitch_sq"])
# superclean
# https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
    df.loc[df.life_sq > df.full_sq, "life_sq"] = np.NaN
    df.loc[df.kitch_sq >= df.life_sq, "kitch_sq"] = np.NaN
    df.loc[df.kitch_sq == 0, "kitch_sq"] = np.NaN
    df.loc[df.kitch_sq == 1, "kitch_sq"] = np.NaN
    df.loc[df.num_room == 0, "num_room"] = np.NaN
    df.loc[df.floor == 0, "floor"] = np.NaN
    df.loc[df.max_floor == 0, "max_floor"] = np.NaN
    df.loc[df.floor > df.max_floor, "max_floor"] = np.NaN
    df.loc[df.state == 33, "state"] = np.NaN
    df.loc[df.build_year == 20052009, "build_year"] = 2005
    df.loc[df.build_year == 20, "build_year"] = 2000
    df.loc[df.build_year == 215, "build_year"] = 2015
    df.loc[df.build_year < 1500, "build_year"] = np.NaN
    df.loc[df.build_year > 2022, "build_year"] = np.NaN
return df
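A toy check of the thinning in align_to_lb_score (illustrative data): with gap=10, only every 10th of the pre-2015 Investment rows priced at or below 1M survives.
In [ ]:
# illustrative: 20 suspicious rows -> 2 survivors after gap=10 thinning
toy = pd.DataFrame({
    "timestamp": ["2014-01-01"] * 20,
    "product_type": ["Investment"] * 20,
    "price_doc": [900000] * 20,
})
print len(align_to_lb_score(toy))  # 2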
In [4]:
def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    # shrink the per-category target mean towards the global mean;
    # the fewer rows a category has, the stronger the pull (controlled by alpha)
    try:
        return (targ_mean * nrows + globalmean * alpha) / (nrows + alpha)
    except TypeError:  # category unseen in the other folds: targ_mean/nrows is None
        return float("NaN")
def mess_y_categorial(df, nfolds=3, alpha=10):
from copy import copy
folds = np.array_split(df, nfolds)
newfolds = []
for i in range(nfolds):
fold = folds[i]
other_folds = copy(folds)
other_folds.pop(i)
other_fold = pd.concat(other_folds)
        newfolds.append(mess_y_categorial_fold(fold, other_fold, alpha=alpha))
return pd.concat(newfolds)
def mess_y_categorial_fold(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
fold = fold_raw.copy()
if not cols:
cols = list(fold.select_dtypes(include=["object"]).columns)
globalmean = other_fold[y_col].mean()
for c in cols:
target_mean = other_fold[[c, y_col]].fillna("").groupby(c).mean().to_dict()[y_col]
nrows = other_fold[c].fillna("").value_counts().to_dict()
fold[c + "_sll"] = fold[c].fillna("").apply(
lambda x: smoothed_likelihood(target_mean.get(x), nrows.get(x), globalmean, alpha)
)
return fold
def feature_exclude(df):
    # Drop build_year; age_of_building stays instead.
    # build_year probably causes overfitting.
    feats = ["build_year", "build_year_cat_le"]
    # greedy_search.tsv is written by greedy_remove_features below; columns are
    # feature, candidate score, previous best score, importance, best iteration, verdict
    with open("greedy_search.tsv") as gs:
        for line in gs:
            row = line.strip().split("\t")
            if len(row) < 6:
                continue
            if row[5] == "remove":
                feats.append(row[0])
    df = df.drop(feats, axis=1, errors="ignore")
    return df
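A toy run of the out-of-fold smoothed target encoding (illustrative data): frequent categories stay close to their own mean, rare ones are pulled towards the global mean, unseen ones get NaN.
In [ ]:
# illustrative: smoothed likelihood encoding on a tiny frame
toy_other = pd.DataFrame({
    "sub_area": ["A"] * 8 + ["B"] * 2,
    "price_doc": [10.0] * 8 + [20.0] * 2,
})
toy_fold = pd.DataFrame({"sub_area": ["A", "B", "C"]})
encoded = mess_y_categorial_fold(toy_fold, toy_other, cols=["sub_area"], alpha=10)
print encoded["sub_area_sll"]
# global mean = 12, so:
# "A": (10*8 + 12*10)/18 ~ 11.1, "B": (20*2 + 12*10)/12 ~ 13.3, "C": NaN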
In [5]:
ALPHA = 50
lbl = sk.preprocessing.LabelEncoder()
def preprocess_categorial(df):
for c in list(df.columns):
if df[c].dtype == 'object':
try:
try:
lbl.fit(list(train_raw[c].values) + list(test[c].values) + list(df[c].values))
except KeyError as e:
lbl.fit(df[c].values)
df[c + "_le"] = lbl.transform(list(df[c].values))
except ValueError as e:
print c, e
raise
df = mess_y_categorial(df, 5, alpha=ALPHA)
df = df.select_dtypes(exclude=['object'])
return df
def apply_categorial(test, train):
for c in list(test.columns):
if test[c].dtype == 'object':
try:
lbl.fit(list(train_raw[c].values) + list(test[c].values) + list(train[c].values))
except KeyError:
lbl.fit(test[c].values)
test[c + "_le"] = lbl.transform(list(test[c].values))
test = mess_y_categorial_fold(test, train, alpha=ALPHA)
test = test.select_dtypes(exclude=['object'])
return test
def apply_macro(df):
macro_cols = [
'timestamp', "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"
]
return df.merge(macro[macro_cols], on='timestamp', how='left')
In [6]:
def preprocess(df):
df = df.copy()
ecology = ["no data", "poor", "satisfactory", "good", "excellent"]
df["ecology_index"] = map(ecology.index, df["ecology"].values)
df["age_of_building"] = df["timestamp"].apply(lambda x: x.split("-")[0]).astype(int) - df["build_year"]
df["is_build_in_progress"] = df["age_of_building"].apply(lambda x: "yes" if x < 0 else "no")
bool_feats = [
"thermal_power_plant_raion",
"incineration_raion",
"oil_chemistry_raion",
"radiation_raion",
"railroad_terminal_raion",
"big_market_raion",
"nuclear_reactor_raion",
"detention_facility_raion",
"water_1line",
"big_road1_1line",
"railroad_1line",
"culture_objects_top_25"
]
    for bf in bool_feats:
        try:
            df[bf + "_bool"] = map(lambda x: x == "yes", df[bf].values)
        except KeyError:
            # the column is absent in this dataframe
            pass
df = preprocess_anomaly(df)
df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)
df['rel_life_sq'] = df['life_sq'] / df['full_sq'].astype(float)
df["material_cat"] = df.material.fillna(0).astype(int).astype(str).replace("0", "")
df["state_cat"] = df.state.fillna(0).astype(int).astype(str).replace("0", "")
# df["num_room_cat"] = df.num_room.fillna(0).astype(int).astype(str).replace("0", "")
# df["build_year_cat"] = df.build_year.fillna(0).astype(int).astype(str).replace("0", "")
df["build_year_ten"] = (df.build_year / 10).round()
df["ID_metro"] = df.ID_metro.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_railroad_station_walk"] = df.ID_railroad_station_walk.replace("", "-10").fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_railroad_station_avto"] = df.ID_railroad_station_avto.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_big_road1"] = df.ID_big_road1.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_big_road2"] = df.ID_big_road2.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_bus_terminal"] = df.ID_bus_terminal.fillna(-10).astype(int).astype(str).replace("-10", "")
    # ratios of living area to full area, kitchen to living, kitchen to full, clipped to [0, 1]
    df["ratio_life_sq_full_sq"] = (df["life_sq"] / np.maximum(df["full_sq"].astype("float"), 1)).clip(0, 1)
    df["ratio_kitch_sq_life_sq"] = (df["kitch_sq"] / np.maximum(df["life_sq"].astype("float"), 1)).clip(0, 1)
    df["ratio_kitch_sq_full_sq"] = (df["kitch_sq"] / np.maximum(df["full_sq"].astype("float"), 1)).clip(0, 1)
df = df.drop(["timestamp"], axis=1, errors="ignore")
return df
In [7]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv", index_col="id")
test = pd.read_csv("data/test.csv", index_col="id")
macro = pd.read_csv("data/macro.csv")
In [8]:
train_pr = align_to_lb_score(train_raw)
train_pr = preprocess(train_pr)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
train.head()
Out[8]:
In [9]:
important_feats = ["full_sq", "life_sq", "kitch_sq", "max_floor"]
# important_feats = ["full_sq", "life_sq"]
# Train models to fill NAs in the important fields; the order matters
feats_to_remove = ["price_doc", "rel_kitch_sq", "rel_life_sq", "id", "build_year_cat_le",
"age_of_building", "rel_floor", "num_room_cat_le", "build_year_ten",
"ratio_life_sq_full_sq", "ratio_kitch_sq_full_sq", "ratio_kitch_sq_life_sq"]
In [10]:
%%cache na_models.pkl na_models
na_models = {}
xgb_params = {
'max_depth': 5,
'n_estimators': 200,
'learning_rate': 0.05,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
for f in important_feats:
t = train[train[f].notnull()]
fX = t.drop([f] + feats_to_remove, axis=1, errors="ignore")
fy = t[f].values
dtrain_all = xgb.DMatrix(fX.values, fy, feature_names=fX.columns)
model = xgb.train(xgb_params, dtrain_all, num_boost_round=400, verbose_eval=40)
na_models[f] = model
print f
print feat_imp(model).head(10)
In [11]:
def fill_na_xgb(df_orig):
df = df_orig.copy()
for f in important_feats:
X_pr = df[df[f].isnull()].drop([f] + feats_to_remove, axis=1, errors="ignore")
if not len(X_pr):
continue
X_pr = xgb.DMatrix(X_pr.values, feature_names=X_pr.columns)
df.loc[df[f].isnull(), f] = na_models[f].predict(X_pr).round()
df[f] = df[f].astype(int)
return df
In [12]:
train = fill_na_xgb(train)
In [33]:
from sklearn.model_selection import train_test_split
X = train.drop(["price_doc"], axis=1)
y = train["price_doc"].values
bound = int(len(X) * 0.7)
X_train, X_val, y_train, y_val = X[:bound].copy(), X[bound:].copy(), y[:bound].copy(), y[bound:].copy()
In [34]:
dtrain_all = xgb.DMatrix(X.values, y, feature_names=X.columns)
dtrain = xgb.DMatrix(X_train.values, y_train, feature_names=X.columns)
dval = xgb.DMatrix(X_val.values, y_val, feature_names=X.columns)
In [35]:
xgb_params = {
'eta': 0.01,
'max_depth': 5,
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
# Tune num_boost_round via early stopping on the validation fold
model = xgb.train(xgb_params, dtrain, num_boost_round=4000, evals=[(dval, 'val')],
early_stopping_rounds=20, verbose_eval=40)
num_boost_round = model.best_iteration
In [16]:
cv_output = xgb.cv(xgb_params, dtrain_all, num_boost_round=4000,
verbose_eval=100, early_stopping_rounds=100, nfold=5)
In [36]:
xgbmodel = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round, verbose_eval=40)
y_pred = xgbmodel.predict(dtrain)
print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_train_preds.csv", index=False)
y_pred = xgbmodel.predict(dval)
print "predict-val:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_val_preds.csv", index=False)
feat_imp(xgbmodel).head(10)
Out[36]:
In [37]:
RS = 20170501
np.random.seed(RS)
FACT_ROUNDS = 0
ROUNDS = 2000
lgb_params = {
'objective': 'regression',
'metric': 'rmse',
'boosting': 'gbdt',
'learning_rate': 0.01,
# 'verbose': 1,
# 'num_leaves': 2 ** 5,
'bagging_fraction': 0.95,
'bagging_freq': 1,
'bagging_seed': RS,
# 'feature_fraction': 0.7,
# 'feature_fraction_seed': RS,
    # subsample/colsample_bytree are LightGBM aliases of bagging_fraction/feature_fraction
    'subsample': 0.7,
    'colsample_bytree': 0.7,
# 'max_bin': 100,
'max_depth': 10,
'num_rounds': ROUNDS
}
lgb_train_all = lgb.Dataset(X, y)
lgb_train = lgb.Dataset(X_train, y_train)
In [38]:
cvres = pd.DataFrame(lgb.cv(params=lgb_params, train_set=lgb_train, nfold=5, shuffle=False,
early_stopping_rounds=100, verbose_eval=100, num_boost_round=ROUNDS))
FACT_ROUNDS = len(cvres)
In [39]:
lgbmodel = lgb.train(lgb_params, lgb_train, num_boost_round=FACT_ROUNDS or ROUNDS)
pd.DataFrame({
"name": lgbmodel.feature_name(),
"imp": lgbmodel.feature_importance()}
).sort_values("imp", ascending=False).head(20)
Out[39]:
In [40]:
y_pred = lgbmodel.predict(X_train)
print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_train_preds.csv", index=False)
y_pred = lgbmodel.predict(X_val)
print "predict-val:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_val_preds.csv", index=False)
In [123]:
from vowpalwabbit.sklearn_vw import VWRegressor
In [29]:
from sklearn.base import TransformerMixin
from scipy.stats import skew
class SkewLogAlign(TransformerMixin):
    """log1p-transform numeric features whose skewness exceeds a threshold"""
    skewed_feats = None
    def __init__(self, skew_threshold=0.75):
        self.skew_threshold = skew_threshold
    def fit(self, X, y=None):
        # log transform skewed numeric features
        df = pd.DataFrame(X, dtype=np.float64)
        skewed_feats = df.apply(lambda x: skew(x.dropna()))  # compute skewness
        skewed_feats = skewed_feats[skewed_feats > self.skew_threshold]
        self.skewed_feats = skewed_feats.index
        return self
    def transform(self, X):
        df = pd.DataFrame(X, dtype=np.float64)
        df[self.skewed_feats] = np.log1p(df[self.skewed_feats].values)
        return df.values
import sys
class FillNaWithConstant(TransformerMixin):
nan_value = 0
inf_value = None
minf_value = None
def __init__(self, nan_value=0, inf_value=sys.maxint - 1, minf_value=-sys.maxint - 1):
self.nan_value = nan_value
self.inf_value = inf_value
self.minf_value = minf_value
def fit(self, X, y=None):
return self
def transform(self, X):
df = pd.DataFrame(X).fillna(self.nan_value)
df = df.replace(np.inf, self.inf_value)
df = df.replace(-np.inf, self.minf_value)
return df.values
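A quick check of the two transformers on a tiny matrix (illustrative): the heavily skewed first column gets log1p-ed, then NaN values are replaced by constants.
In [ ]:
# illustrative: skew alignment followed by NA/inf filling
toy = np.array([[1.0, np.nan], [2.0, 1.0], [300.0, 2.0]])
aligned = SkewLogAlign(skew_threshold=0.75).fit_transform(toy)
filled = FillNaWithConstant(nan_value=0).fit_transform(aligned)
print filled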
In [45]:
from sklearn.pipeline import Pipeline
lasso_feat_pipeline = Pipeline([
("skew", SkewLogAlign()),
("fillna", FillNaWithConstant()),
])
In [143]:
from sklearn.linear_model import LassoCV
LASSO_alphas = [1, 0.1, 0.001, 0.0005]
lasso_cv_model = LassoCV(alphas=LASSO_alphas, cv=5, max_iter=50000, verbose=True, n_jobs=-1)
lasso_cv_model.fit(lasso_feat_pipeline.fit_transform(X.values), y)
print "alpha:", lasso_cv_model.alpha_
print "MSE:"
print zip(LASSO_alphas, np.sqrt(lasso_cv_model.mse_path_))
print pd.Series(lasso_cv_model.coef_, index=X.columns).sort_values(ascending=False)[:20]
In [46]:
from sklearn.linear_model import Lasso
best_alpha = 0.001
lasso_model = Pipeline([
("feat", lasso_feat_pipeline),
("clf", Lasso(alpha=best_alpha, max_iter=50000))
])
lasso_model.fit(X_train.values, y_train)
Out[46]:
In [47]:
y_pred = lasso_model.predict(X_train.values)
print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_train_preds.csv", index=False)
y_pred = lasso_model.predict(X_val.values)
print "predict-validation:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_val_preds.csv", index=False)
In [41]:
test_pr = preprocess(test)
train_pr = preprocess(train_raw)
test_pr = apply_categorial(test_pr, train_pr)
test_pr = feature_exclude(test_pr)
test_pr = fill_na_xgb(test_pr)
In [42]:
# XGB
dtest = xgb.DMatrix(test_pr.values, feature_names=test_pr.columns)
y_pred = xgbmodel.predict(dtest)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_sub.csv", index=False)
!head xgb_sub.csv
In [43]:
y_pred = lgbmodel.predict(test_pr)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_sub.csv", index=False)
!head lgb_sub.csv
In [51]:
y_pred = lasso_model.predict(test_pr.values)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_sub.csv", index=False)
!head lasso_sub.csv
In [46]:
models = ["lgb", "xgb"]
In [47]:
etrain = pd.DataFrame(index=X_val.index)
etrain = etrain.join(train[["price_doc"]])
for i, p in enumerate(models):
pred = pd.read_csv("%s_val_preds.csv" % p, index_col="id", names=["id", "p_%s" % i], header=0)
etrain = etrain.join(pred)
eX = etrain.drop("price_doc", axis=1)
ey = etrain["price_doc"].values
etrain.head()
Out[47]:
In [48]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
emodel = Pipeline([
("skew", SkewLogAlign()),
("fillna", FillNaWithConstant()),
("clf", LassoCV(alphas=None, cv=5, max_iter=50000, verbose=True, n_jobs=-1))
])
emodel.fit(eX.values, ey)
lmodel = emodel.named_steps["clf"]
print "alpha:", lmodel.alpha_
print "MSE:"
print np.sqrt(lmodel.mse_path_)
print pd.Series(lmodel.coef_, index=eX.columns).sort_values(ascending=False)[:20]
In [31]:
eFACT_ROUNDS = 0
In [ ]:
elgb_train = lgb.Dataset(eX, ey)
cvres = pd.DataFrame(lgb.cv(params=lgb_params, train_set=elgb_train, nfold=7, shuffle=False,
early_stopping_rounds=100, verbose_eval=100, num_boost_round=ROUNDS))
eFACT_ROUNDS = len(cvres)
In [ ]:
emodel = lgb.train(lgb_params, elgb_train, num_boost_round=eFACT_ROUNDS or ROUNDS)
In [49]:
etest = test_pr[[]].copy()
for i, p in enumerate(models):
pred = pd.read_csv("%s_sub.csv" % p, index_col="id", names=["id", "p_%s" % i], header=0)
etest = etest.join(pred)
y_pred = emodel.predict(etest.values)
df = pd.DataFrame({"id": etest.index, "price_doc": unlog(y_pred)})
df.to_csv("ensemble_sub.csv", index=False)
!head ensemble_sub.csv
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price meter:
val-rmse:42206.6
predict-train: 36746.0165399
kaggle: 0.31331
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price doc:
val-rmse:2.57852e+06
train-rmse:1.90168e+06+26844.3 test-rmse:2.66642e+06+56338.9
predict-train: 2021259.19865
kaggle: 0.31386
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price meter:
val-rmse:42206.6
predict-train: 36746.0165399
kaggle: 0.31331
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude:
val-rmse:2.55793e+06
train-rmse:1.74066e+06+28727.3 test-rmse:2.65025e+06+64969.5
predict-train: 1881896.66663
kaggle: 0.31344
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.54654e+06
train-rmse:1.74594e+06+24020 test-rmse:2.66053e+06+67300.3
predict-train: 1883352.60935
kaggle: 0.31364
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.55613e+06
train-rmse:1.74466e+06+27385.6 test-rmse:2.66422e+06+69734.1
predict-train: 1888051.35357
kaggle: 0.31366
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro with other ID, ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.58557e+06
train-rmse:1.98509e+06+26803.7 test-rmse:2.68755e+06+59691.1
predict-train: 2092731.29028
kaggle: 0.31731
5*200, no macro, add rel features, no log price, train_without_noise:
val-rmse:2.63772e+06
train-rmse:1.9989e+06+10986.4 test-rmse:2.69158e+06+53020
predict-train: 2076010.27131
kaggle: 0.31720
5*200, no macro, add rel features, no log price, train_with_noise:
val-rmse:2.53378e+06
train-rmse:1.95069e+06+16166.4 test-rmse:2.69703e+06+61455.1
predict-train: 2054421.59869
kaggle: 0.32056
5*200, macro, add rel features, no log price, train_without_noise:
val-rmse:2.79632e+06
train-rmse:1.81015e+06+19781.2 test-rmse:2.6641e+06+123875
predict-train: 1904063.27368
kaggle: 0.32976
5*200, no macro, add rel features, no log price, train_without_noise:
val-rmse:2.61682e+06
train-rmse:1.81123e+06+27681.2 test-rmse:2.66923e+06+53925.7
predict-train: 1899129.43771
kaggle: 0.31592
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1 test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
kaggle: 0.31602
7*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1 test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
kaggle: 0.31768
4*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.63407e+06
train-rmse:1.96513e+06+21470.8 test-rmse:2.69417e+06+74288.3
predict-train: 2062299.41091
kaggle: 0.31952
7*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1 test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
5*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1 test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna:
val-rmse:2.61664e+06
train-rmse:1.77892e+06+23111 test-rmse:2.65829e+06+56398.6
predict-train: 1875799.54634
kaggle: 0.31521
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean:
val-rmse:2.6265e+06
train-rmse:1.78478e+06+22545.4 test-rmse:2.66179e+06+60626.3
predict-train: 1881672.27588
kaggle: 0.31476
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, no super features + Label Encoding:
val-rmse:2.56494e+06
train-rmse:1.78862e+06+18589.1 test-rmse:2.69283e+06+79861.4
predict-train: 1923466.41923
kaggle: 0.31434
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, remove material state num_room:
val-rmse:2.56932e+06
train-rmse:1.88495e+06+20133.7 test-rmse:2.69624e+06+70491.2
predict-train: 1979198.19201
kaggle: 0.31513
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro/bus...:
val-rmse:2.60017e+06
train-rmse:1.80654e+06+19453.5 test-rmse:2.68203e+06+68169.5
predict-train: 1906439.98603
kaggle: 0.31927
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features:
val-rmse:2.93665e+06
train-rmse:1.73425e+06+19462.4 test-rmse:2.68682e+06+140661
predict-train: 1861268.6455
kaggle: 0.31555
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features, add ratio feats:
val-rmse:2.59747e+06
train-rmse:1.75828e+06+26639.4 test-rmse:2.68491e+06+67201.8
predict-train: 1875707.6581
kaggle: 0.31760
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, superfeatures + Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:
val-rmse:2.5419e+06
train-rmse:1.74381e+06+22710.7 test-rmse:2.65787e+06+66889.9
predict-train: 1862467.67153
kaggle: 0.31716
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:
val-rmse:2.5676e+06
train-rmse:1.81485e+06+24274 test-rmse:2.67324e+06+60153.1
predict-train: 1947645.83102
kaggle: 0.31376
In [45]:
from tqdm import tqdm
def get_best_score(train):
xgb_params = {
'max_depth': 5,
'n_estimators': 200,
'learning_rate': 0.01,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
cvres = xgb.cv(xgb_params, train, num_boost_round=4000, early_stopping_rounds=40)
return cvres["test-rmse-mean"].min(), cvres["test-rmse-mean"].argmin()
def df2DMatrix(df):
return xgb.DMatrix(data=df.drop("price_doc", axis=1).values, label=df["price_doc"].values)
def greedy_remove_features(df, feature_importances):
train = df
with open("greedy_search.tsv", "a") as f:
best_score, iterno = get_best_score(df2DMatrix(df))
f.write("\t".join(["INITIAL", str(best_score), str(iterno)]) + "\n")
to_analyze = sorted(feature_importances.items(), key=lambda x: x[1])
for feat, feat_importance in tqdm(to_analyze):
f.flush()
candidate_train = train.drop(feat, axis=1)
cand_best_score, iterno = get_best_score(df2DMatrix(candidate_train))
if cand_best_score > best_score:
            # score got worse, keep the feature
f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "skip"]) + "\n")
f.flush()
continue
f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "remove"]) + "\n")
best_score = cand_best_score
train = candidate_train
In [47]:
# imp_features: the feature-importance table produced by feat_imp(...) above
feature_importances = imp_features.set_index("feature").to_dict()["importance"]
train_gs = train
with open("greedy_search.tsv") as gs:
for line in gs:
row = line.strip().split("\t")
if len(row) < 6:
continue
if row[5] == "remove":
try:
train_gs = train_gs.drop(row[0], axis=1)
except ValueError:
pass
print "drop", row[0]
feature_importances.pop(row[0], None)
greedy_remove_features(train_gs, feature_importances)
In [168]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[168]:
In [169]:
# NOTE: assumes a variant of preprocess() with a dropid flag that keeps the id column
train_new_pr = feature_exclude(preprocess_categorial(preprocess(train_raw, dropid=False)))
test_new_pr = feature_exclude(preprocess_categorial(preprocess(test, dropid=False)))
# run fill_na_xgb to obtain model-based fillers for the NAs
filled_train = fill_na_xgb(train_new_pr)
filled_test = fill_na_xgb(test_new_pr)
filled_train = filled_train.set_index("id")
filled_test = filled_test.set_index("id")
In [175]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[175]:
In [176]:
train_new = preprocess_anomaly(train_raw)
test_new = preprocess_anomaly(test)
train_new = train_new.set_index("id")
test_new = test_new.set_index("id")
train_new = train_new.join(filled_train[important_feats], rsuffix="_filled")
test_new = test_new.join(filled_test[important_feats], rsuffix="_filled")
for impf in important_feats:
train_new[impf] = train_new[impf].fillna(train_new["%s_filled" % impf])
train_new = train_new.drop(["%s_filled" % impf], axis=1)
test_new[impf] = test_new[impf].fillna(test_new["%s_filled" % impf])
test_new = test_new.drop(["%s_filled" % impf], axis=1)
In [177]:
# train_new = feature_exclude(train_new)
# test_new = feature_exclude(test_new)
In [178]:
train_new.to_csv("data/train_cleaned.csv", encoding="utf_8")
test_new.to_csv("data/test_cleaned.csv", encoding="utf_8")
In [259]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[259]:
In [260]:
def update(source, patch):
    # DataFrame.update upcasts int columns to float; restore the original dtypes
    dtypes = source.dtypes
    source.update(patch, overwrite=True)
    for c, t in dtypes.iteritems():
        source[c] = source[c].astype(t)
    return source
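A toy demonstration of why the dtype restore matters (illustrative): plain DataFrame.update would leave full_sq as float64.
In [ ]:
# illustrative: update() keeps the integer dtype that DataFrame.update upcasts
src = pd.DataFrame({"full_sq": [40, 50]}, index=[1, 2])
patch = pd.DataFrame({"full_sq": [45]}, index=[2])
print update(src, patch).dtypes  # full_sq stays int64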
In [261]:
train_raw = train_raw.set_index("id")
test = test.set_index("id")
fx = pd.read_excel('data/BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')
print "Fix in train:", train_raw.index.intersection(fx.index).shape[0]
print "Fix in test :", test.index.intersection(fx.index).shape[0]
train_raw = update(train_raw, fx)
test = update(test, fx)
train_raw = train_raw.reset_index()
test = test.reset_index()
train_raw.to_csv("data/train_fix.csv", index=False, encoding="utf-8")
test.to_csv("data/test_fix.csv", index=False, encoding="utf-8")
In [266]:
from auto_ml import Predictor
In [267]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[267]:
In [268]:
train_pr = preprocess(train_raw)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
In [ ]:
# Tell auto_ml which column is 'output'
# Also note columns that aren't purely numerical
# Examples include ['nlp', 'date', 'categorical', 'ignore']
column_descriptions = {
'price_doc': 'output'
}
ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
ml_predictor.train(train)
file_name = ml_predictor.save()
print file_name
# Score the model on held-out data (df_test / MEDV are leftovers from
# auto_ml's Boston-housing example; substitute the Sberbank validation frame)
test_score = ml_predictor.score(df_test, df_test.MEDV)
In [173]:
#Checking for missing data
NAs = pd.concat([
train.isnull().sum(),
test_pr.isnull().sum()
], axis=1, keys=['Train', 'Test'])
NAs[NAs.sum(axis=1) > 0]
Out[173]: