This version adjusts results using average (log) predicted prices from a macro model.
By default (that is, if micro_humility_factor=1), it rescales the adjusted log predictions so that the standard deviation of the raw (unlogged) predictions stays the same as it was before the adjustment.
There is also a provision for applying micro and macro humility factors. The macro humility factor adjusts the macro predictions by averaging in a "naive" macro model; the micro humility factor shrinks the individual (log) predictions toward the (log) mean.
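As a rough sketch (not the notebook's own code), the two humility factors act as follows; h_macro and h_micro stand in for the macro_humility_factor and micro_humility_factor parameters set below, and the prices are made up for illustration:

import numpy as np
# Macro humility: geometric blend of the fitted macro mean with the naive one.
macro_fit, macro_naive_demo, h_macro = 7.5e6, 7.2e6, 0.96
macro_blend = macro_naive_demo * (macro_fit / macro_naive_demo) ** h_macro
# Micro humility: shrink individual log predictions toward the log mean
# (h_micro = 1 leaves them alone, h_micro = 0 collapses them to the mean).
log_preds = np.log(np.array([4.0e6, 6.5e6, 9.0e6]))
h_micro = 1.0
shrunk = np.exp(log_preds.mean() + h_micro * (log_preds - log_preds.mean()))

The actual adjustment below (shift_logmean_but_keep_scale) additionally fits a scale factor so that the standard deviation of the raw predictions is preserved before the micro factor is applied.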
TODO:
Apply BAD ADDRESS FIX - *FAILED*
Try recalculating with the new fillna
https://www.kaggle.com/jasonpeng/latest-iteration-in-this-silly-game/code
In [50]:
train_path = "data/train_without_noise.csv"
test_path = "data/test_cleaned.csv"
macro_path = "data/macro.csv"
In [51]:
# Parameters
micro_humility_factor = 1 # range from 0 (complete humility) to 1 (no humility)
macro_humility_factor = 0.96
jason_weight = .2
bruno_weight = .2
reynaldo_weight = 1 - jason_weight - bruno_weight
In [52]:
# Get ready for lots of annoying deprecation warnings
import statsmodels.api as sm
In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime
import scipy as sp
import scipy.optimize  # ensures sp.optimize is available for leastsq below
In [54]:
def price_doc_to_meter(train):
    train["price_doc"] = train["price_doc"] / train["full_sq"]
    return train

def price_meter_to_doc(test):
    test["price_doc"] = test["price_doc"] * test["full_sq"]
    return test
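# (Added note) These helpers are only used if the commented-out price_doc_to_meter /
# price_meter_to_doc calls further down are re-enabled, which would switch the
# models to predicting price per square meter.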
In [55]:
# Read data
macro = pd.read_csv(macro_path)
train = pd.read_csv(train_path)
# train = price_doc_to_meter(train)
test = pd.read_csv(test_path)
# Macro data monthly medians
macro["timestamp"] = pd.to_datetime(macro["timestamp"])
macro["year"] = macro["timestamp"].dt.year
macro["month"] = macro["timestamp"].dt.month
macro["yearmonth"] = 100*macro.year + macro.month
macmeds = macro.groupby("yearmonth").median()
# Price data monthly medians
train["timestamp"] = pd.to_datetime(train["timestamp"])
train["year"] = train["timestamp"].dt.year
train["month"] = train["timestamp"].dt.month
train["yearmonth"] = 100*train.year + train.month
prices = train[["yearmonth","price_doc"]]
p = prices.groupby("yearmonth").median()
# Join monthly prices to macro data
df = macmeds.join(p)
In [56]:
# Function to process Almon lags
import numpy.matlib as ml
def almonZmatrix(X, maxlag, maxdeg):
    """
    Creates the Z matrix corresponding to vector X.
    """
    n = len(X)
    Z = ml.zeros((len(X)-maxlag, maxdeg+1))
    for t in range(maxlag, n):
        # Solve for Z[t][0].
        Z[t-maxlag, 0] = sum([X[t-lag] for lag in range(maxlag+1)])
        for j in range(1, maxdeg+1):
            s = 0.0
            for i in range(1, maxlag+1):
                s += (i)**j * X[t-i]
            Z[t-maxlag, j] = s
    return Z
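A quick illustrative check of the Almon Z matrix (not part of the original kernel): with maxlag=2 and maxdeg=1, each row holds the unweighted sum of the last maxlag+1 values and the lag-weighted sum.

x_demo = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
z_demo = almonZmatrix(x_demo, 2, 1)
# z_demo has shape (3, 2); its first row is [1+2+3, 1*2 + 2*1] = [6.0, 4.0]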
In [57]:
# Prepare data for macro model
y = df.price_doc.div(df.cpi).apply(np.log).loc[201108:201506]
lncpi = df.cpi.apply(np.log)
tblags = 5 # Number of lags used on PDL for Trade Balance
mrlags = 5 # Number of lags used on PDL for Mortgage Rate
cplags = 5 # Number of lags used on PDL for CPI
ztb = almonZmatrix(df.balance_trade.loc[201103:201506].as_matrix(), tblags, 1)
zmr = almonZmatrix(df.mortgage_rate.loc[201103:201506].as_matrix(), mrlags, 1)
zcp = almonZmatrix(lncpi.loc[201103:201506].as_matrix(), cplags, 1)
columns = ['tb0', 'tb1', 'mr0', 'mr1', 'cp0', 'cp1']
z = pd.DataFrame( np.concatenate( (ztb, zmr, zcp), axis=1), y.index.values, columns )
X = sm.add_constant( z )
# Fit macro model
eq = sm.OLS(y, X)
fit = eq.fit()
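# (Added note) To inspect the macro regression, one could uncomment the next line;
# fit.summary() is standard statsmodels output for the OLS fit above.
# print(fit.summary())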
# Predict with macro model
test_cpi = df.cpi.loc[201507:201605]
test_index = test_cpi.index
ztb_test = almonZmatrix(df.balance_trade.loc[201502:201605].as_matrix(), tblags, 1)
zmr_test = almonZmatrix(df.mortgage_rate.loc[201502:201605].as_matrix(), mrlags, 1)
zcp_test = almonZmatrix(lncpi.loc[201502:201605].as_matrix(), cplags, 1)
z_test = pd.DataFrame( np.concatenate( (ztb_test, zmr_test, zcp_test), axis=1),
test_index, columns )
X_test = sm.add_constant( z_test )
pred_lnrp = fit.predict( X_test )
pred_p = np.exp(pred_lnrp) * test_cpi
# Merge with test cases and compute mean for macro prediction
test["timestamp"] = pd.to_datetime(test["timestamp"])
test["year"] = test["timestamp"].dt.year
test["month"] = test["timestamp"].dt.month
test["yearmonth"] = 100*test.year + test.month
test_ids = test[["yearmonth","id"]]
monthprices = pd.DataFrame({"yearmonth":pred_p.index.values,"monthprice":pred_p.values})
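# (Added note) macro_mean below is a geometric mean of the predicted monthly prices,
# weighted by how many test transactions fall in each month.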
macro_mean = np.exp(test_ids.merge(monthprices, on="yearmonth").monthprice.apply(np.log).mean())
macro_mean
Out[57]:
In [58]:
# Naive macro model assumes housing prices will simply follow CPI
naive_pred_lnrp = y.mean()
naive_pred_p = np.exp(naive_pred_lnrp) * test_cpi
monthnaive = pd.DataFrame({"yearmonth":pred_p.index.values, "monthprice":naive_pred_p.values})
macro_naive = np.exp(test_ids.merge(monthnaive, on="yearmonth").monthprice.apply(np.log).mean())
macro_naive
Out[58]:
In [59]:
# Combine naive and substantive macro models
macro_mean = macro_naive * (macro_mean/macro_naive) ** macro_humility_factor
macro_mean
Out[59]:
In [68]:
# Jason/Gunja
#load files
train = pd.read_csv(train_path, parse_dates=['timestamp'])
# train = price_doc_to_meter(train)
test = pd.read_csv(test_path, parse_dates=['timestamp'])
macro = pd.read_csv(macro_path, parse_dates=['timestamp'])
id_test = test.id
#clean data
bad_index = train[train.life_sq > train.full_sq].index
train.ix[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.ix[equal_index, "life_sq"] = test.ix[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.ix[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.ix[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.ix[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.ix[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.ix[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.ix[kitch_is_build_year, "build_year"] = train.ix[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.ix[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.ix[bad_index, "full_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.ix[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.ix[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.ix[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.ix[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index
train.ix[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index
test.ix[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713]
train.ix[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.ix[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.ix[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.ix[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.ix[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.ix[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.ix[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.ix[bad_index, "max_floor"] = np.NaN
train.floor.describe(percentiles= [0.9999])
bad_index = [23584]
train.ix[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.ix[bad_index, "state"] = np.NaN
test.state.value_counts()
# brings error down a lot by removing extreme price per sqm
train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 10000]
# Add month-year
month_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)
month_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)
# Add week-year count
week_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)
week_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)
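# (Added note) month_year_cnt and week_year_cnt are frequency encodings: the number
# of transactions sharing the same calendar month or week, a rough volume proxy.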
# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek
test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek
# Other feature engineering
train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)
test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)
train['apartment_name'] = train.sub_area + train['metro_km_avto'].astype(str)
test['apartment_name'] = test.sub_area + test['metro_km_avto'].astype(str)
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)
y_train = train["price_doc"]
wts = 1 - .47*(y_train == 1e6)
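# (Added note) Rows with the suspicious 1e6 placeholder price get weight 0.53
# instead of 1 when the DMatrix is built below.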
x_train = train.drop(["id", "timestamp", "price_doc"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)
for c in x_train.columns:
    if x_train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))
        #x_train.drop(c,axis=1,inplace=True)
for c in x_test.columns:
    if x_test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values))
        x_test[c] = lbl.transform(list(x_test[c].values))
        #x_test.drop(c,axis=1,inplace=True)
xgb_params = {
'eta': 0.05,
'max_depth': 5,
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
dtrain = xgb.DMatrix(x_train, y_train, weight=wts)
dtest = xgb.DMatrix(x_test)
#cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
# verbose_eval=50, show_stdv=False)
#cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()
#num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=350)
#fig, ax = plt.subplots(1, 1, figsize=(8, 13))
#xgb.plot_importance(model, max_num_features=50, height=0.5, ax=ax)
y_predict = model.predict(dtest)
jason_model_output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
jason_model_output.head()
Out[68]:
In [69]:
jason_model_output.to_csv('jason_model.csv', index=False)
np.exp(jason_model_output.price_doc.apply(np.log).mean())
Out[69]:
In [70]:
# Reynaldo
train = pd.read_csv(train_path)
# train = price_doc_to_meter(train)
test = pd.read_csv(test_path)
id_test = test.id
y_train = train["price_doc"]
x_train = train.drop(["id", "timestamp", "price_doc"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)
for c in x_train.columns:
    if x_train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))
for c in x_test.columns:
    if x_test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values))
        x_test[c] = lbl.transform(list(x_test[c].values))
xgb_params = {
'eta': 0.05,
'max_depth': 5,
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
num_boost_rounds = 384 # This was the CV output, as an earlier version shows
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)
y_predict = model.predict(dtest)
reynaldo_model_output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
reynaldo_model_output.head()
Out[70]:
In [71]:
reynaldo_model_output.to_csv('reynaldo_model.csv', index=False)
np.exp( reynaldo_model_output.price_doc.apply(np.log).mean() )
Out[71]:
In [72]:
# Bruno with outlier dropped
# Any results you write to the current directory are saved as output.
df_train = pd.read_csv(train_path, parse_dates=['timestamp'])
# df_train = price_doc_to_meter(df_train)
df_test = pd.read_csv(test_path, parse_dates=['timestamp'])
df_macro = pd.read_csv(macro_path, parse_dates=['timestamp'])
df_train.drop(df_train[df_train["life_sq"] > 7000].index, inplace=True)
y_train = df_train['price_doc'].values
id_test = df_test['id']
df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
# Next line just adds a lot of NA columns (because "join" only works on indexes),
# but somehow it seems to affect the result
df_all = df_all.join(df_macro, on='timestamp', rsuffix='_macro')
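# (Added note) If the macro columns were actually wanted, a merge keyed on the
# timestamp values, e.g.
#   df_all = df_all.merge(df_macro, on='timestamp', suffixes=('', '_macro'))
# would attach them, but the downstream code assumes the join above (including
# the timestamp_macro drop), so it is left unchanged here.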
print(df_all.shape)
# Add month-year
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)
# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)
# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek
# Other feature engineering
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)
# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp', 'timestamp_macro'], axis=1, inplace=True)
factorize = lambda t: pd.factorize(t[1])[0]
df_obj = df_all.select_dtypes(include=['object'])
X_all = np.c_[
df_all.select_dtypes(exclude=['object']).values,
np.array(list(map(factorize, df_obj.iteritems()))).T
]
print(X_all.shape)
X_train = X_all[:num_train]
X_test = X_all[num_train:]
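# (Added note) The X_all/X_train/X_test built just above are recomputed by the
# "Deal with categorical values" block below, so only the later versions are used.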
# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]
df_values = pd.concat([df_numeric, df_obj], axis=1)
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)
X_train = X_all[:num_train]
X_test = X_all[num_train:]
df_columns = df_values.columns
xgb_params = {
'eta': 0.05,
'max_depth': 5,
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
dtrain = xgb.DMatrix(X_train, y_train, feature_names=df_columns)
dtest = xgb.DMatrix(X_test, feature_names=df_columns)
num_boost_round = 489 # From Bruno's original CV, I think
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_round)
y_pred = model.predict(dtest)
bruno_model_output = pd.DataFrame({'id': id_test, 'price_doc': y_pred})
bruno_model_output.head()
Out[72]:
In [73]:
bruno_model_output.to_csv('bruno_model.csv', index=False)
np.exp( bruno_model_output.price_doc.apply(np.log).mean() )
Out[73]:
In [74]:
# Merge
results = reynaldo_model_output.merge(
jason_model_output.merge(
bruno_model_output, on='id', suffixes=['_jason','_bruno'] ), on='id' )
results["price_doc_reynaldo"] = results["price_doc"]
results["price_doc"] = np.exp( np.log(results.price_doc_reynaldo)*reynaldo_weight +
np.log(results.price_doc_jason)*jason_weight +
np.log(results.price_doc_bruno)*bruno_weight )
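# i.e. a weighted geometric mean of the three model outputs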
results.drop(["price_doc_reynaldo", "price_doc_bruno", "price_doc_jason"],axis=1,inplace=True)
results.head()
Out[74]:
In [75]:
results.to_csv('unadjusted_combo.csv', index=False)
In [76]:
# Functions to use in data adjustment
def scale_miss(alpha, shifted_logs, oldstd, new_logmean):
    newlogs = new_logmean + alpha*(shifted_logs - new_logmean)
    newstd = np.std(np.exp(newlogs))
    return (newstd - oldstd)**2

def shift_logmean_but_keep_scale(data, new_logmean, rescaler):
    # Or change the scale, but relative to the old scale
    logdata = np.log(data)
    oldstd = data.std()
    shift = new_logmean - logdata.mean()
    shifted_logs = logdata + shift
    scale = sp.optimize.leastsq(scale_miss, 1, args=(shifted_logs, oldstd, new_logmean))
    alpha = scale[0][0]
    newlogs = new_logmean + rescaler*alpha*(shifted_logs - new_logmean)
    return np.exp(newlogs)
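An illustrative check of these helpers (not in the original kernel): shift some toy prices to a new log mean; by construction the geometric mean of the result equals the target, and np.std of the adjusted values comes out close to the pandas std of the originals, which is the quantity scale_miss tries to match.

demo = pd.Series([4.0e6, 6.5e6, 9.0e6])
demo_adj = shift_logmean_but_keep_scale(demo, np.log(7.0e6), 1)
# np.exp(np.log(demo_adj).mean()) is ~7.0e6; np.std(np.asarray(demo_adj)) ~ demo.std()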
In [77]:
# Adjust
lnm = np.log(macro_mean)
y_predict = shift_logmean_but_keep_scale( results.price_doc, lnm, micro_humility_factor )
sub = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
# sub = price_meter_to_doc(test.merge(sub, on="id"))[["id", "price_doc"]]
sub.to_csv('andy_sub.csv', index=False)
sub.head()
Out[77]:
In [ ]: