LightGBM processor
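This notebook fits the LightGBM members of the model bag for the rental-listing interest problem: a three-class classifier over low/medium/high and a regressor on a numeric interest target, both trained on the shared feature list plus out-of-fold manager/building target encodings.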
In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import time
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
import lightgbm as lgbm
In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))
adams = pd.read_pickle('features-adams.pkl')
train_df = pd.merge(train_df, adams, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, adams, left_on='listing_id', right_index=True)
In [3]:
# regression target: map the three classes onto [0, 1], with 'medium' placed at
# the midpoint of 0.5 and 9/13 (~0.60) rather than exactly 0.5
target_num_map_reg = {'low': 0, 'medium': (.5 + (9/13)) / 2, 'high': 1}
train_df['interest'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map_reg[x]))
In [4]:
medium_price = pd.read_pickle('fin-medium-price.pkl')
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
In [5]:
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
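predicted_price comes from the medium-price model merged in above, so these two features measure how far a listing's asking price deviates from what comparable listings would suggest, as a log difference and a log ratio.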
In [6]:
# report the NaN rate per column, then fill the NaNs with a sentinel.
for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
        # nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))  # , nacount_test / len(test_df))

train_df.fillna(-99999, inplace=True)
test_df.fillna(-99999, inplace=True)
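A large negative sentinel is safe here because every downstream model is tree-based: -99999 simply routes missing rows to one side of each split rather than skewing any distance or scale computation.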
In [7]:
class MeansProcessor:
    """Per-group target statistics (mean/std/count) with a global-mean fallback."""
    def __init__(self, key, outkey=None, tgt='interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        self.tgt = tgt
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()

        for k in df.groupby(self.key, sort=False):
            self.count[k[0]] = len(k[1])
            if len(k[1]) < 1:  # guard against empty groups (never produced by groupby)
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])

    def predict(self, df, nans=False):
        # default to the global mean/std (or NaN) for groups unseen during fit
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]

        df[self.outkey + '_count'] = 0

        for k in df.groupby(self.key, sort=False):
            if k[0] == 0:  # skip the placeholder group
                continue
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]

        return df

    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']
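# A minimal usage sketch (toy data, hypothetical names; not part of the pipeline):
#
#   demo = pd.DataFrame({'manager_id': ['a', 'a', 'b'], 'interest': [0.0, 1.0, 0.5]})
#   mp = MeansProcessor('manager_id', 'mgr_demo')
#   mp.fit(demo)
#   demo = mp.predict(demo.copy())
#
# predict() adds mgr_demo_level (group mean of the target), mgr_demo_level_std
# and mgr_demo_count; keys unseen during fit keep the global mean/std fallback.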
# I kept the same index randomization (with fixed seed) so I could validate this code against
# the original...
target_num_map = {'low': 0, 'medium': 1, 'high': 2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]

    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()

    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

# the per-fold group features are expensive, so cache them on disk
try:
    rv = pickle.load(open('bag-model-groupfeatures_nonan.pkl', 'rb'))
except Exception:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)
    pickle.dump(rv, open('bag-model-groupfeatures_nonan.pkl', 'wb'))
# dummy processors, just to get the derived feature names
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

# one group-feature-enriched copy of the test set per fold...
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

# ...and the out-of-fold features merged back into the training frame
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
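Note that the group statistics are strictly out of fold: each training row only receives building/manager encodings fitted on the other four folds, so the target encoding never sees the row's own label. The test set gets five encoded variants, one per fold, matched to the per-fold models trained below.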
New LightGBM tests
In [8]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]
In [9]:
# prep CV (kf.split yields positional indices, hence iloc)
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df.iloc[tr_index])
    cv_valid.append(train_df.iloc[val_index])
In [14]:
fl = features_to_use.copy() + m_build.get_features() + m_mgr.get_features()
#fl.append('manager_count')
fl.append('manager_lazy_rate')
#fl.append('predicted_price_ratio')
fl.append('predicted_price')
#fl.append('predicted_price_diff')
fl.append('density_gaussian02')
fl.append('density_exp01')
#fl.remove('street_address')
#fl.remove('half_bathroom')
#fl.append('manager_sort_count')
#fl.append('mb_comb_count')
#fl.append('desc_xp_count')
#fl.append('desc_xp_ratio')
#fl.append('desc_xp_first')
In [15]:
# adams features
fl.append('num_cap_share')
fl.append('num_nr_of_lines')
fl.append('num_redacted')
fl.append('num_email')
fl.append('num_phone_nr')
In [16]:
for f in train_df.keys():
    # print(f)
    if 'rot' in f:
        fl.append(f)

fl.append('num_rho')
fl.append('num_phi')
In [17]:
t4_params = {
'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
'num_leaves': 2**5, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['multi_logloss'],
'max_bin': 255, 'subsample_for_bin': 50000,
'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
'min_split_gain': 0.25, 'min_child_weight': .5, 'min_child_samples': 20, 'scale_pos_weight': 1}
lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 3
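The classifier uses a multiclass objective over the three interest levels, a small learning rate (0.02) with up to 2000 boosting rounds, and early stopping after 100 stale rounds on the held-out fold, so each fold settles on its own best iteration.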
In [18]:
lgbm_params
In [19]:
start = time.time()

# prep CV (kf.split yields positional indices, hence iloc)
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df.iloc[tr_index])
    cv_valid.append(train_df.iloc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)

    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100,
                             verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    cv_preds.append(models[-1].predict(cv_valid[f][fl], num_iteration=models[-1].best_iteration))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))
df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))
end = time.time()
print(end - start)
0 0.508595209192
1 0.49459901602
2 0.512629227307
3 0.499392814962
4 0.512765831483
combined:  0.50559598398
126.87185287475586
In [22]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))

# average the five per-fold test predictions
f = np.array(testpreds).mean(axis=0)

df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

# stack the averaged test predictions on top of the out-of-fold train predictions
df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)
#df_output.to_pickle('bag-klightgbm-mgr.pkl')

# also keep the raw per-fold test predictions
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('modeloutput-klightgbm-clf-r2.pkl', 'wb'))
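Both the stacked frame (out-of-fold train predictions plus fold-averaged test predictions, indexed by listing_id) and the raw per-fold test predictions are pickled, presumably so a downstream ensembler can use either form.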
In [23]:
# regressor
start = time.time()

models = []
cv_preds = []
df_cvpreds = []

t4_params = {
    'boosting_type': 'gbdt', 'objective': 'regression_l2', 'nthread': -1, 'silent': True,
    'num_leaves': 2**6, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['l2'],
    'max_bin': 127, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
    'min_split_gain': .01, 'min_child_weight': 1, 'min_child_samples': 20, 'scale_pos_weight': 1}
lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 1

for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest, silent=True)

    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100,
                             verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    cv_preds.append(models[-1].predict(cv_valid[f][fl], num_iteration=models[-1].best_iteration))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, df_cvpreds[f].prediction)))
df_cvpreds = pd.concat(df_cvpreds)
print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction)))
end = time.time()
print(end - start)
0 0.241634444613
1 0.237429872346
2 0.242643161794
3 0.239873575947
4 0.244510203144
combined:  0.241230155962
55.41301894187927
In [24]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))

f = np.array(testpreds).mean(axis=0)

df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)

df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('modeloutput-lightgbm-reg-r2.pkl', 'wb'))
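That leaves two second-level inputs on disk: class probabilities from the classifier (modeloutput-klightgbm-clf-r2.pkl) and a single continuous interest score from the regressor (modeloutput-lightgbm-reg-r2.pkl), both keyed by listing_id.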
In [ ]: