This stacking code includes both XGB and LightGBM stackers. Using out-of-bag tests (not posted here — I wrote them in a hurry, and the code is essentially the same as this), I found that XGB alone worked better; but since the two usually move in lock step, I can experiment with the much faster LightGBM.
In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
In [2]:
# Load the prepared train/test frames, the feature list, and the per-listing
# medium-price feature, then join the price feature onto both frames.
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
# Context-managed load: the original `pickle.load(open(...))` leaked the handle.
with open('fin-dprep-flist.pkl', 'rb') as fh:
    features_to_use = pickle.load(fh)
medium_price = pd.read_pickle('fin-medium-price.pkl')
# medium_price is indexed by listing_id; inner-join it onto each frame.
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
In [3]:
# Map interest_level strings to ordinal class ids (0=low, 1=medium, 2=high).
target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
# 5-fold stratified CV with a fixed seed so folds are reproducible across runs.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]
# Per-fold listing_id arrays consumed by the stackers' run() methods below.
# NOTE(review): this split stratifies on train_df.interest_cat while `folds`
# above stratifies on train_y — the two fold sets only coincide if
# interest_cat encodes the same labels as interest_level; confirm that,
# otherwise `folds` and (train_ids, val_ids) describe different partitions.
train_ids = []
val_ids = []
for dev_index, val_index in kf.split(range(train_df.shape[0]), train_df.interest_cat):
    train_ids.append(train_df.iloc[dev_index].listing_id.values)
    val_ids.append(train_df.iloc[val_index].listing_id.values)
In [4]:
class StackerXGB:
    """Level-2 stacker: trains an XGBoost multiclass model on the
    out-of-fold predictions of several level-1 models.

    Parameters
    ----------
    use : list of (df_train_oof, [df_test_fold, ...]) tuples
        One entry per level-1 model: a DataFrame of out-of-fold train
        predictions (indexed by listing_id) and a list of per-fold
        test-set prediction DataFrames.
    nn_shape : list of (units, dropout) tuples, optional
        Kept for interface compatibility with the NN stacker; stored
        but not otherwise used here.
    nfolds : int, optional
        Number of CV folds; must match the fold structure of `use`.
        (Added for consistency with StackerLGBM; default preserves the
        original hard-coded value.)
    """
    def __init__(self, use, nn_shape=None, nfolds=5):
        # (train_info, test_ids) were saved by the data-prep notebook.
        self.train_info, self.test_ids = pd.read_pickle('stacker-info.pkl')
        self.nfolds = nfolds
        # Avoid the shared-mutable-default pitfall the original had.
        self.nn_shape = [(32, .1), (16, .1)] if nn_shape is None else nn_shape.copy()
        # Merge every level-1 model's train/test prediction frames on the
        # listing_id index so each row holds all models' probabilities.
        df_nn = use[0][0].copy()
        self.df_nn_tests = [u.copy() for u in use[0][1]]
        for df in use[1:]:
            df_nn = pd.merge(df_nn, df[0], left_index=True, right_index=True)
            for f in range(self.nfolds):
                self.df_nn_tests[f] = pd.merge(self.df_nn_tests[f], df[1][f],
                                               left_index=True, right_index=True)
        # Align the merged train frame with the canonical training index.
        self.df_nn_train = df_nn.loc[self.train_info.index]
        self.models = []
        self.df_folds = []
        self.test_preds = []
        self.tgts = ['low', 'medium', 'high']
        # eta 0.01 is the effective value; the original also assigned a
        # dead 0.05 that was immediately overwritten.
        param = {
            'objective': 'multi:softprob',
            'eta': 0.01,
            'max_depth': 3,
            'silent': 1,
            'num_class': 3,
            'eval_metric': 'mlogloss',
            'min_child_weight': 4,
            'subsample': .7,
            'colsample_bytree': 0.7,
            'seed': 1234,
        }
        self.plst = list(param.items())

    def oneheat(self, y):
        """Return an (n, 3) one-hot float array for integer labels 0/1/2."""
        rv = np.zeros((len(y), 3))
        for i in [0, 1, 2]:
            rv[:, i] = (y == i)
        return rv

    def run_fold(self, foldnum, train_idx, valid_idx):
        """Train one CV fold; return its validation predictions as a
        DataFrame indexed by listing_id, and record the fold's model and
        test-set predictions for later bagging."""
        nn_fold_train = self.df_nn_train.loc[train_idx]
        nn_fold_valid = self.df_nn_train.loc[valid_idx]
        xgtrain = xgb.DMatrix(nn_fold_train.values,
                              label=self.train_info.loc[train_idx].interest_level)
        xgvalid = xgb.DMatrix(nn_fold_valid.values,
                              label=self.train_info.loc[valid_idx].interest_level)
        watchlist = [(xgtrain, 'train'), (xgvalid, 'valid')]
        model = xgb.train(self.plst, xgtrain, 8000, watchlist,
                          early_stopping_rounds=50, verbose_eval=100)
        tpreds = model.predict(xgvalid, ntree_limit=model.best_ntree_limit)
        # Flat column list: the original `[['low','medium','high']]` silently
        # produced a single-level MultiIndex.
        df_tmp = pd.DataFrame(tpreds, columns=self.tgts)
        df_tmp['listing_id'] = nn_fold_valid.index
        df_tmp.set_index('listing_id', inplace=True)
        print(log_loss(self.train_info.loc[valid_idx].interest_level, df_tmp[self.tgts]))
        self.df_folds.append(df_tmp)
        # Predict the test set with this fold's model; run() averages them.
        test_x = xgb.DMatrix(self.df_nn_tests[foldnum].values)
        self.test_preds.append(model.predict(test_x, ntree_limit=model.best_ntree_limit))
        self.models.append(model)
        return df_tmp

    def run(self, train_ids, val_ids):
        """Run every fold; return (df_cv, df_test): out-of-fold train
        predictions and fold-averaged (bagged) test predictions."""
        for i, (tr_idx, va_idx) in enumerate(zip(train_ids, val_ids)):
            self.run_fold(i, tr_idx, va_idx)
        self.df_cv = pd.concat(self.df_folds).sort_index()
        print('CV logloss:', log_loss(self.train_info.interest_level, self.df_cv[self.tgts]))
        # Bag the per-fold test predictions by simple averaging.
        testarray = np.array(self.test_preds)
        self.df_test = pd.DataFrame(testarray.mean(axis=0), columns=self.tgts)
        self.df_test['listing_id'] = self.test_ids
        self.df_test.set_index('listing_id', inplace=True)
        return self.df_cv, self.df_test
In [5]:
import lightgbm as lgbm
class StackerLGBM:
def __init__(self, use, nn_shape = [(32, .1), (16, .1)], nfolds = 5):
self.train_info, self.test_ids = pd.read_pickle('stacker-info.pkl')
self.nfolds = 5
self.nn_shape = nn_shape.copy()
df_nn = use[0][0].copy()
self.df_nn_tests = [u.copy() for u in use[0][1]]
for i, df in enumerate(use[1:]):
df_nn = pd.merge(df_nn, df[0], left_index = True, right_index = True)
for f in range(self.nfolds):
self.df_nn_tests[f] = pd.merge(self.df_nn_tests[f], df[1][f], left_index = True, right_index = True)
self.df_nn_train = df_nn.loc[self.train_info.index]
#self.test_x = np.array(self.df_nn_test.values)
self.models = []
self.df_folds = []
self.test_preds = []
self.tgts = ['low', 'medium', 'high']
t4_params = {
'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
'num_leaves': 5, 'learning_rate': 0.01, 'max_depth': -1, 'metric': ['multi_logloss'],
'max_bin': 255, 'subsample_for_bin': 50000,
'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': .85, 'reg_alpha': 1, 'reg_lambda': 0,
'min_split_gain': 1, 'min_child_weight': 1, 'min_child_samples': 50, 'scale_pos_weight': 1}
self.lgbm_params = t4_params.copy()
self.lgbm_params['num_class'] = 3
# plenty of code to do this, but it's simple enough
def oneheat(self, y):
rv = np.zeros((len(y), 3))
for i in [0, 1, 2]:
rv[:,i] = (y == i)
return rv
def run_fold(self, foldnum, train_idx, valid_idx):
nn_fold_train = self.df_nn_train.loc[train_idx]
nn_fold_valid = self.df_nn_train.loc[valid_idx]
tmp_train_x = np.array(nn_fold_train.values)
tmp_valid_x = np.array(nn_fold_valid.values)
#xgtrain = xgb.DMatrix(tmp_train_x, label=self.train_info.iloc[train_idx].interest_level)
#xgvalid = xgb.DMatrix(tmp_valid_x, label=self.train_info.iloc[valid_idx].interest_level)
dset = lgbm.Dataset(tmp_train_x, self.train_info.loc[train_idx].interest_level, silent=True)
dset_val = lgbm.Dataset(tmp_valid_x, self.train_info.loc[valid_idx].interest_level, silent=True)
#watchlist = [ (xgtrain,'train'), (xgvalid, 'valid') ]
#model = xgb.train(self.plst, xgtrain, 4000, watchlist, early_stopping_rounds=50, verbose_eval=10)
model = lgbm.train(self.lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000)
tpreds = model.predict(tmp_valid_x, num_iteration=model.best_iteration)
df_tmp = pd.DataFrame(tpreds)
df_tmp.columns = [['low', 'medium', 'high']]
df_tmp['listing_id'] = nn_fold_valid.index
df_tmp.set_index('listing_id', inplace=True)
#tgts = ['low', 'medium', 'high']
print(log_loss(self.train_info.loc[valid_idx].interest_level, df_tmp[self.tgts]))
self.df_folds.append(df_tmp)
self.test_preds.append(model.predict(self.df_nn_tests[foldnum].values, num_iteration=model.best_iteration))
self.models.append(model)
return df_tmp
def run(self, train_ids, val_ids):
#print(folds)
#self.kf_nn = model_selection.StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
#self.folds_nn = [(k[0], k[1]) for k in self.kf_nn.split(self.df_nn_train.index, self.train_info.interest_level)]
for i, fold in enumerate(zip(train_ids, val_ids)):
self.run_fold(i, fold[0], fold[1])
self.df_cv = pd.concat(self.df_folds).sort_index()
print('CV logloss:', log_loss(self.train_info.interest_level, self.df_cv[self.tgts]))
testarray = np.array(self.test_preds.copy())
self.df_test = pd.DataFrame(testarray.mean(axis=0))
self.df_test.columns = [['low', 'medium', 'high']]
self.df_test['listing_id'] = self.test_ids
self.df_test.set_index('listing_id', inplace=True)
return self.df_cv, self.df_test
In [40]:
# Load the level-1 model outputs that feed the stackers.
def _load_pickle(path):
    # Context-managed load: the original `pickle.load(open(...))` leaked handles.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

dfs_sn1 = _load_pickle('stacker-sn-l1.pkl')
dfs_lgbm_v2 = _load_pickle('modeloutput-klightgbm-clf-r2.pkl')
dfs_lgbmr_v2 = _load_pickle('modeloutput-lightgbm-reg-r2.pkl')
dfs_lgbm_v3 = _load_pickle('modeloutput-klightgbm-clf-r3.pkl')
dfs_lgbmr_v3 = _load_pickle('modeloutput-lightgbm-reg-r3.pkl')
dfs_xgbv2 = _load_pickle('modeloutput-xgb-clf-r2.pkl')
dfs_xgbv3 = _load_pickle('modeloutput-xgb-clf-r3.pkl')
dfs_xgbrv2 = _load_pickle('modeloutput-xgb-reg-r2.pkl')
dfs_xgbrv3 = _load_pickle('modeloutput-xgb-reg-r3.pkl')
dfs_rf = _load_pickle('model-output-rf.pkl')
dfs_med = _load_pickle('model-medium-logdiff.pkl')
dfs_med3 = _load_pickle('model-medium-logdiff-r2.pkl')
dfs_nn = _load_pickle('bag-model-nn-v1.pkl')
# Bug fix: the original referenced undefined dfs_lgbm3/dfs_lgbmr3 (NameError);
# the names bound above are dfs_lgbm_v3/dfs_lgbmr_v3.
mset = [dfs_lgbm_v3, dfs_lgbmr_v3, dfs_sn1, dfs_rf, dfs_xgbv2, dfs_xgbrv3, dfs_med3]
In [43]:
# Stack the v3-era model set with the LightGBM stacker; list order fixes the
# merged feature-column order, so keep it stable for reproducibility.
mset = [dfs_lgbm_v3, dfs_lgbmr_v3, dfs_sn1, dfs_rf, dfs_xgbv3, dfs_xgbrv2, dfs_med3]
s = StackerLGBM(mset)
df_cv, df_test = s.run(train_ids, val_ids)
In [42]:
# Wider model set (v2 + v3 outputs) for the LightGBM stacker.
# Bug fix: removed a dead draft line that referenced undefined
# dfs_lgbm3/dfs_lgbmr3 — it raised NameError before being overwritten.
mset = [dfs_lgbmr_v2, dfs_lgbm_v2, dfs_lgbm_v3, dfs_lgbmr_v3, dfs_sn1, dfs_rf, dfs_xgbv3, dfs_xgbrv2, dfs_med]
s = StackerLGBM(mset)
df_cv, df_test = s.run(train_ids, val_ids)
In [34]:
# Fit the XGBoost stacker on whichever mset was assigned last above.
s = StackerXGB(mset)
df_cv_xgb, df_test_xgb = s.run(train_ids, val_ids)
In [45]:
# Wide model set (v2 + v3 outputs) for the XGBoost stacker.
# Bug fix: removed a dead draft line that referenced undefined
# dfs_lgbm3/dfs_lgbmr3 — it raised NameError before being overwritten.
mset = [dfs_lgbmr_v2, dfs_lgbm_v2, dfs_lgbm_v3, dfs_lgbmr_v3, dfs_sn1, dfs_rf, dfs_xgbv3, dfs_xgbrv2, dfs_med]
sa = StackerXGB(mset)
df_cv_xgba, df_test_xgba = sa.run(train_ids, val_ids)
In [46]:
# Sanity-check the prediction ranges: print min/max of each output column.
for column in df_test_xgb:
    values = df_test_xgb[column]
    print(column, values.min(), values.max())
In [47]:
# Write the gzip-compressed submission (listing_id index + class probabilities).
df_test_xgb.to_csv('submission-0424-4a.csv.gz', compression='gzip')
Compare against an old submission as a sanity check before uploading.
In [48]:
# Load a previous submission for a sanity-check comparison.
old_sub = pd.read_csv('../nb/k0423-r2_verywidestack.csv.gz')
In [49]:
# Index on listing_id so the correlations below align matching rows.
old_sub.set_index('listing_id', inplace=True)
In [50]:
# Correlation of 'low' probabilities with the previous submission.
df_test_xgb.low.corr(old_sub.low)
Out[50]:
In [51]:
# Correlation of 'medium' probabilities with the previous submission.
df_test_xgb.medium.corr(old_sub.medium)
Out[51]:
In [52]:
# Correlation of 'high' probabilities with the previous submission.
df_test_xgb.high.corr(old_sub.high)
Out[52]:
In [ ]: