This notebook trains a model that estimates the price a rental listing would have if it attracted medium interest; the log-difference between that predicted price and the actual price is saved as a "framing effect" feature for the main model.
In [4]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
In [5]:
# Load the prepared train/test frames and the base feature list produced by
# the data-prep notebook.
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
# Use a context manager so the handle is closed deterministically; the
# original `pickle.load(open(...))` leaked the file object.
with open('fin-dprep-flist.pkl', 'rb') as f:
    features_to_use = pickle.load(f)
In [6]:
# Cap extreme training prices: anything above 13k is treated as 13k
# (and negatives, if any, as 0).
train_df['price'] = train_df['price'].clip(0, 13000)
In [7]:
class MeansProcessor:
    """Target-statistics encoder for a (possibly composite) grouping key.

    fit() records, per group of `key`, the row count and the mean/std of the
    target column `tgt`.  predict() writes those statistics back onto a frame
    as three new columns: <outkey>_level, <outkey>_level_std, <outkey>_count.
    Groups unseen during fit keep NaN level/std and count 0.
    """

    def __init__(self, key, outkey=None, tgt='interest_cat', min_count=1):
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = 0
        self.tgt = tgt
        # Groups with fewer than min_count rows get NaN statistics.  The
        # default of 1 preserves the original behavior, where the NaN branch
        # tested `len(group) < 0` and was therefore unreachable dead code.
        self.min_count = min_count
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        """Record per-group count/mean/std of the target column."""
        self.global_means = df[self.tgt].mean()
        for name, grp in df.groupby(self.key, sort=False):
            self.count[name] = len(grp)
            if len(grp) < self.min_count:
                self.means[name] = np.nan
                self.std[name] = np.nan
            else:
                self.means[name] = np.mean(grp[self.tgt])
                self.std[name] = np.std(grp[self.tgt])

    def predict(self, df):
        """Attach the fitted statistics to df (mutates df in place) and return it."""
        for col in self.outkeys:
            df[col] = np.nan  # default for unseen/skipped groups
        df[self.outkey + '_count'] = 0
        for name, grp in df.groupby(self.key, sort=False):
            # Key 0 is skipped — presumably a placeholder/unknown id in the
            # upstream data prep; TODO confirm against the prep notebook.
            if name == 0:
                continue
            if name in self.means:
                df.loc[grp.index, self.outkey + '_count'] = self.count[name]
                df.loc[grp.index, self.outkey + '_level'] = self.means[name]
                df.loc[grp.index, self.outkey + '_level_std'] = self.std[name]
        return df

    def get_features(self):
        """Names of the columns predict() generates."""
        return self.outkeys.copy() + [self.outkey + '_count']
# The fold/index randomization (fixed seed) matches the original notebook so
# the outputs can be validated against it.
target_num_map = {'low': 0, 'medium': 1, 'high': 2}
train_y = np.array([target_num_map[level] for level in train_df['interest_level']])
def proc_fold(fold):
    """Build fold-safe group features for one CV fold.

    `fold` is a (train_index, test_index) pair of positional indices into
    train_df.  Fits the group-mean encoders on the fold's training rows only,
    then applies them to the held-out rows and to a fresh copy of test_df.
    Returns (cv_train, cv_valid, cv_test).
    """
    train_index, test_index = fold
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()

    # One encoder per grouping key; fit on the fold's train rows only so the
    # held-out statistics are leakage-free.
    specs = [
        ('building_id', 'building_sort'),
        ('manager_id', 'manager_sort'),
        (['building_id', 'manager_id'], 'mb_comb'),
    ]
    for key, outkey in specs:
        proc = MeansProcessor(key, outkey)
        proc.fit(cv_train)
        cv_valid = proc.predict(cv_valid)
        cv_test = proc.predict(cv_test)

    return cv_train, cv_valid, cv_test
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

# Cache the (expensive, multiprocess) per-fold group-feature computation and
# recompute only when the cache is missing or unreadable.  The original bare
# `except:` would also have swallowed KeyboardInterrupt/SystemExit; catch
# only load failures, and use context managers so file handles are closed.
try:
    with open('0420-model-groupfeatures.pkl', 'rb') as f:
        rv = pickle.load(f)
except (OSError, EOFError, pickle.PickleError, AttributeError):
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)
    with open('0420-model-groupfeatures.pkl', 'wb') as f:
        pickle.dump(rv, f)

# Dummy processors, used only to recover the generated feature column names.
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

# Per-fold test frames with that fold's group features attached.
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

# Out-of-fold group features stitched back onto the training frame.
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
In [8]:
# Feature list for the price model: base features plus the fold-safe group
# features.  The raw price columns are removed because price is the target
# of this model; density_exp01 is added.
fl = features_to_use + group_features
for price_feature in ('price', 'price_t', 'price_per_room'):
    fl.remove(price_feature)
fl.append('density_exp01')
In [9]:
# Sanity check: print whichever features mentioning 'price' are still in the list.
for feature_name in (name for name in fl if 'price' in name):
    print(feature_name)
In [10]:
def run_to_stackdf(run):
    """Stack a run's OOF predictions with its mean-over-folds test predictions
    into one frame indexed by listing_id.

    `run` is (models, df_cv, apreds) as returned by run_cv1; run[1] must carry
    'level' and 'listing_id' columns, run[2] is an (n_folds, n_test) array.
    """
    test_part = pd.DataFrame(run[2].mean(axis=0))
    test_part.columns = ['level']
    test_part['listing_id'] = cv_test[0].listing_id
    stacked = pd.concat([run[1][['level', 'listing_id']], test_part])
    return stacked.sort_values('listing_id').set_index('listing_id')
In [11]:
def runXGB1(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    """Train an XGBoost RMSE regressor and predict on test_X.

    When test_y is given, (test_X, test_y) serves as the early-stopping
    watchlist (patience 50, eval printed every 10 rounds); otherwise the model
    trains for the full num_rounds.  feature_names is accepted for interface
    compatibility but unused.  Returns (pred_test_y, model).
    """
    params = {
        # 'reg:linear' is the historical alias of 'reg:squarederror'; kept for
        # compatibility with the xgboost version this notebook targets.
        'objective': 'reg:linear',
        'eta': 0.02,
        'max_depth': 8,
        'silent': 1,
        'num_class': 1,
        'eval_metric': 'rmse',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
        # Starting from the target mean speeds convergence for regression.
        'base_score': train_y.mean(),
    }
    plst = list(params.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=50, verbose_eval=10)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    return pred_test_y, model
In [12]:
def run_cv1(train_df, cv_test, kf, features_to_use):
    """Cross-validated price regression trained on medium-interest listings.

    Per fold: fit on the medium-interest (interest_cat == 1) rows of the dev
    split, early-stop on the medium-interest rows of the val split, then
    predict prices for the *entire* val split and for that fold's test frame.

    Returns (models, df_cv, apreds): the per-fold boosters, the stacked
    out-of-fold prediction frame, and an (n_folds, n_test) prediction array.
    """
    cv_preds = []
    cv_scores = []
    models = []
    test_preds = []
    # NOTE(review): .loc with kf.split's positional indices assumes train_df
    # has a default RangeIndex — confirm (proc_fold uses .iloc instead).
    # BUGFIX: the original set fold = 0 and never incremented it, so every
    # iteration scored cv_test[0]; enumerate gives each fold its own frame.
    for fold, (dev_index, val_index) in enumerate(kf.split(train_df, train_df.interest_cat)):
        cv_cur_train = train_df.loc[dev_index]
        cv_cur_train = cv_cur_train[cv_cur_train.interest_cat == 1]
        cv_cur_valid = train_df.loc[val_index]
        cv_cur_valid_cut = cv_cur_valid[cv_cur_valid.interest_cat == 1]

        dev_X, val_X = cv_cur_train[features_to_use], cv_cur_valid_cut[features_to_use]
        dev_y, val_y = cv_cur_train['price'], cv_cur_valid_cut['price']

        preds, model = runXGB1(dev_X, dev_y, val_X, val_y)
        models.append(model)
        cv_scores.append(model.best_score)
        print(cv_scores)

        # Predict for every row of the validation split, not just the
        # medium-interest rows the model was trained/stopped on.
        fullpreds = model.predict(xgb.DMatrix(cv_cur_valid[features_to_use]),
                                  ntree_limit=model.best_ntree_limit)
        cut_df = train_df.loc[val_index]
        out_df = pd.DataFrame(fullpreds)
        out_df.columns = ["predicted_price"]
        out_df["listing_id"] = cut_df.listing_id.values
        out_df['interest_tgt'] = cut_df.interest.values
        out_df['interest_cat'] = cut_df.interest_cat.values
        out_df['price'] = cut_df.price.values
        cv_preds.append(out_df)

        xgtest = xgb.DMatrix(cv_test[fold][features_to_use])
        test_preds.append(model.predict(xgtest, ntree_limit=model.best_ntree_limit))

    df_cv = pd.concat(cv_preds)
    apreds = np.array(test_preds)
    return models, df_cv, apreds
In [13]:
# Plain (unstratified) KFold with the same seed as the group-feature folds.
# NOTE(review): cv_test was built from *StratifiedKFold* folds above, while
# run_cv1 splits train_df with this plain KFold — the two fold schemes do not
# correspond row-for-row; confirm this mismatch is intentional.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
rv1 = run_cv1(train_df, cv_test, kfold, fl)
In [14]:
# Out-of-fold diagnostics.  Metrics are computed directly with numpy: the file
# never imports sklearn.metrics explicitly, so the original calls only worked
# because `import sklearn.cluster` happens to load that submodule as a side
# effect.  The numpy forms are numerically identical (mean of squared /
# absolute errors).
def _rmse(pred, actual):
    # Root mean squared error.
    return np.sqrt(np.mean((np.asarray(pred) - np.asarray(actual)) ** 2))

def _mae(pred, actual):
    # Mean absolute error.
    return np.mean(np.abs(np.asarray(pred) - np.asarray(actual)))

rv1_subset = rv1[1][rv1[1].interest_cat == 1]
rmse_subset = _rmse(rv1_subset.predicted_price, rv1_subset.price)
mae_subset = _mae(rv1_subset.predicted_price, rv1_subset.price)
mae_full = _mae(rv1[1].predicted_price, rv1[1].price)
rmse_full = _rmse(rv1[1].predicted_price, rv1[1].price)
# log(predicted) - log(actual); its correlation with the interest target is
# the signal this submodel exists to produce.
ldiff = np.log(rv1[1].predicted_price) - np.log(rv1[1].price)
print(ldiff.mean(), rv1[1].interest_tgt.corr(ldiff), mae_subset, mae_full, rmse_full)
In [15]:
# Mean-of-folds test predictions, stacked with the OOF train predictions and
# indexed by listing_id.
df_testpreds = pd.DataFrame(rv1[2].mean(axis=0))
df_testpreds.columns = ['predicted_price']
# Positional assignment (.values) — matches the cut_df.listing_id.values
# convention in run_cv1 and avoids relying on cv_test[0] having a default
# RangeIndex for index alignment.
df_testpreds['listing_id'] = cv_test[0].listing_id.values
df_allpreds = pd.concat([rv1[1][['predicted_price', 'listing_id']], df_testpreds])
df_allpreds.sort_values('listing_id', inplace=True)
df_allpreds.set_index('listing_id', inplace=True)
df_allpreds.to_pickle('fin-medium-price.pkl')
In [25]:
# Combined (train + test) actual-price table, indexed by listing_id, aligned
# with df_allpreds for the logdiff computation below.
df_allprices = pd.concat([train_df[['listing_id', 'price']], test_df[['listing_id', 'price']]]).copy()
df_allprices.set_index('listing_id', inplace=True)
df_allprices.sort_index(inplace=True)

# logdiff = log(predicted) - log(actual): positive means the listing is priced
# below the model's medium-interest estimate.
df_allpreds_logdiff = df_allpreds.copy()
df_allpreds_logdiff['logdiff'] = np.log(df_allpreds_logdiff.predicted_price) - np.log(df_allprices.price)
df_allpreds_logdiff.drop('predicted_price', axis=1, inplace=True)

# Per-fold logdiff frames for the test set (one per CV fold's booster).
df_fold = []
for f in range(5):
    fold_df = pd.DataFrame(rv1[2][f])
    fold_df.columns = ['predicted_price']
    # NOTE(review): assumes cv_test[0].price aligns positionally with the
    # prediction array's RangeIndex — confirm test_df has a default index.
    fold_df['logdiff'] = np.log(fold_df.predicted_price) - np.log(cv_test[0].price)
    fold_df.drop('predicted_price', axis=1, inplace=True)
    # BUGFIX: positional .values assignment; the original index-aligned
    # `= test_df.listing_id` silently misaligns if test_df's index is not a
    # default RangeIndex.
    fold_df['listing_id'] = test_df.listing_id.values
    fold_df.sort_values('listing_id', inplace=True)
    fold_df.set_index('listing_id', inplace=True)
    df_fold.append(fold_df)

# Context manager closes the handle; the original pickle.dump(open(...)) leaked it.
with open('model-medium-logdiff.pkl', 'wb') as fh:
    pickle.dump((df_allpreds_logdiff, df_fold), fh)
In [16]:
# Comparison frame: mean-of-folds test predictions next to the actual price.
df_tp_cmp = df_testpreds.assign(price=cv_test[0]['price'])
In [23]:
# Clip actual test prices the same way the training prices were clipped.
df_tp_cmp_cut = df_tp_cmp.copy()
df_tp_cmp_cut['price'] = df_tp_cmp_cut['price'].clip(0, 13000)
In [24]:
# Test-set RMSE of the mean-of-folds predictions vs the clipped actual price.
# Computed with numpy (identical value) because sklearn.metrics is never
# imported explicitly in this file — the original call only worked because
# `import sklearn.cluster` loads it as a side effect.
np.sqrt(np.mean((df_tp_cmp_cut.price - df_tp_cmp_cut.predicted_price) ** 2))
Out[24]:
In [ ]:
# XXX update for final package
In [19]:
# Rebuild the combined listing_id-indexed actual-price table (same content as
# the earlier cell, recomputed here for the final package).
df_allprices = (
    pd.concat([train_df[['listing_id', 'price']],
               test_df[['listing_id', 'price']]])
    .set_index('listing_id')
    .sort_index()
)
In [20]:
# Final deliverable: log(predicted price) - log(actual price) per listing.
df_allpreds_logdiff = df_allpreds.copy()
df_allpreds_logdiff['logdiff'] = (
    np.log(df_allpreds_logdiff.predicted_price) - np.log(df_allprices.price)
)
df_allpreds_logdiff = df_allpreds_logdiff.drop(columns='predicted_price')
df_allpreds_logdiff.to_pickle('fin-submodel-medium-logdiff.pkl')