LightGBM processor


In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle
import time

import sklearn.cluster
import sklearn.metrics

import Levenshtein

from multiprocessing import Pool

import lightgbm as lgbm

In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))

adams = pd.read_pickle('features-adams.pkl')

train_df = pd.merge(train_df, adams, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, adams, left_on='listing_id', right_index=True)

In [3]:
target_num_map_reg = {'low':0, 'medium': (.5 + (9/13)) / 2, 'high':1}
train_df['interest'] = np.array(train_df['interest_level'].apply(lambda x: target_num_map_reg[x]))
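
This maps the three classes onto a single regression target: 'medium' lands at the midpoint of 0.5 and 9/13, i.e. about 0.596 rather than 0.5. A quick look at the mapped values (illustrative only, not part of the pipeline):

In [ ]:
# Illustrative: the numeric targets used by the regressor further below.
print({k: round(v, 4) for k, v in target_num_map_reg.items()})
# -> {'low': 0, 'medium': 0.5962, 'high': 1}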

In [4]:
medium_price = pd.read_pickle('fin-medium-price.pkl')

train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)

In [5]:
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
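
Since log a - log b = log(a/b), `predicted_price_diff` is the log of the predicted-to-listed price ratio, which for small gaps is roughly the relative over/underpricing. A worked example with illustrative numbers:

In [ ]:
# A listing priced at 3000 with a predicted price of 3300 is listed ~10%
# below the predicted price: log(3300) - log(3000) = log(1.1) ~= 0.0953.
print(np.log(3300) - np.log(3000))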

In [6]:
# Report the NaN rate per column, then fill with a large negative
# sentinel value that tree models can isolate in a split.

for t in train_df.columns:
    nacount = train_df[t].isnull().sum()
    if nacount:
        print(t, nacount / len(train_df))
        
train_df.fillna(-99999, inplace=True)
test_df.fillna(-99999, inplace=True)


price_group 0.0488733992543
price_ratio 0.0488733992543
manager_shortdesc_rate 0.0688725887502
manager_building0_rate 0.0688725887502
manager_0feature_rate 0.0688725887502
manager_median_price 0.0688725887502
manager_lazy_rate 0.0688725887502

In [7]:
class MeansProcessor:
    """Per-key target encoder: maps a categorical key to the mean/std/count
    of the target as computed on the fitting frame."""

    def __init__(self, key, outkey=None, tgt='interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey

        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}

        self.tgt = tgt

        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        # Global statistics serve as the fallback for keys unseen during fit.
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()

        for name, group in df.groupby(self.key, sort=False):
            self.count[name] = len(group)

            # NOTE: groupby never yields empty groups, so this guard cannot
            # trigger (the original tested `< 0`); kept to match the original.
            if len(group) < 1:
                self.means[name] = np.nan
                self.std[name] = np.nan
            else:
                self.means[name] = np.mean(group[self.tgt])
                self.std[name] = np.std(group[self.tgt])

    def predict(self, df, nans=False):
        # Default every row to the global statistics (or NaN on request)...
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]

        df[self.outkey + '_count'] = 0

        # ...then overwrite the rows whose key was seen during fit.
        for name, group in df.groupby(self.key, sort=False):
            if name == 0:  # key 0 is treated as missing; keep the fallback
                continue

            if name in self.means:
                df.loc[group.index, self.outkey + '_count'] = self.count[name]
                df.loc[group.index, self.outkey + '_level'] = self.means[name]
                df.loc[group.index, self.outkey + '_level_std'] = self.std[name]

        return df

    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']

# I kept the same index randomization (with a fixed seed) so I could
# validate this code against the original.

target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()
    
    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

try:
    rv = pickle.load(open('bag-model-groupfeatures_nonan.pkl', 'rb'))
except Exception:
    # Cache miss (or unreadable cache): recompute the per-fold group
    # features in parallel and cache them for next time.
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

        pickle.dump(rv, open('bag-model-groupfeatures_nonan.pkl', 'wb'))

# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')

group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

#cv_test = [r[2] for r in rv]
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

cv_allvalid = pd.concat([r[1] for r in rv])

train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
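
The processors are fit per fold on the training split only, so the encoded columns are out-of-fold on the validation rows, and each fold keeps its own encoded copy of the test set. A toy demonstration of the encoder itself, on hypothetical data:

In [ ]:
# Toy MeansProcessor demo (hypothetical data, not part of the pipeline):
# keys seen during fit get their per-key mean/std/count, unseen keys keep
# the global statistics.
toy_train = pd.DataFrame({'manager_id': ['a', 'a', 'b'], 'interest': [0.0, 1.0, 1.0]})
toy_test = pd.DataFrame({'manager_id': ['a', 'c']})

mp = MeansProcessor('manager_id', 'mgr_demo')
mp.fit(toy_train)
print(mp.predict(toy_test))
# 'a' -> mgr_demo_level 0.5 (count 2); 'c' falls back to the global mean 2/3.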

New LightGBM tests


In [8]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]

In [9]:
# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    # kf.split yields positional indices, so use iloc rather than loc.
    cv_train.append(train_df.iloc[tr_index])
    cv_valid.append(train_df.iloc[val_index])

In [14]:
fl = features_to_use.copy() + m_build.get_features() + m_mgr.get_features() 

#fl.append('manager_count')
fl.append('manager_lazy_rate')
#fl.append('predicted_price_ratio')

fl.append('predicted_price')
#fl.append('predicted_price_diff')

fl.append('density_gaussian02')
fl.append('density_exp01')

#fl.remove('street_address')
#fl.remove('half_bathroom')

#fl.append('manager_sort_count')
#fl.append('mb_comb_count')

#fl.append('desc_xp_count')
#fl.append('desc_xp_ratio')
#fl.append('desc_xp_first')

In [15]:
# adams features

fl.append('num_cap_share')
fl.append('num_nr_of_lines')
fl.append('num_redacted')
fl.append('num_email')
fl.append('num_phone_nr')

In [16]:
for f in train_df.keys():
    #print(f)
    if 'rot' in f:
        fl.append(f)
        
fl.append('num_rho')
fl.append('num_phi')

In [17]:
t4_params = {
    'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
    'num_leaves': 2**5, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['multi_logloss'],
    'max_bin': 255, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
    'min_split_gain': 0.25, 'min_child_weight': .5, 'min_child_samples': 20, 'scale_pos_weight': 1}

lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 3

In [18]:
lgbm_params


Out[18]:
{'boosting_type': 'gbdt',
 'colsample_bytree': 0.6,
 'learning_rate': 0.02,
 'max_bin': 255,
 'max_depth': -1,
 'metric': ['multi_logloss'],
 'min_child_samples': 20,
 'min_child_weight': 0.5,
 'min_split_gain': 0.25,
 'nthread': -1,
 'num_class': 3,
 'num_leaves': 32,
 'objective': 'multiclass',
 'reg_alpha': 1,
 'reg_lambda': 0,
 'scale_pos_weight': 1,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 50000,
 'subsample_freq': 1}
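
For reference, roughly the same configuration under LightGBM's scikit-learn wrapper would look like the sketch below (an assumed equivalent, not the code used for the recorded results; `num_class` is inferred from the labels, and depending on the LightGBM version early stopping is passed to `fit()` or supplied as a callback):

In [ ]:
# Sketch: sklearn-API equivalent of t4_params above.
sk_clf = lgbm.LGBMClassifier(
    boosting_type='gbdt', n_estimators=2000, num_leaves=32,
    learning_rate=0.02, max_depth=-1, subsample=0.8, subsample_freq=1,
    colsample_bytree=0.6, reg_alpha=1, reg_lambda=0, min_split_gain=0.25,
    min_child_weight=0.5, min_child_samples=20, n_jobs=-1)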

In [19]:
start = time.time()

# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    # As above: kf.split yields positional indices, so use iloc.
    cv_train.append(train_df.iloc[tr_index])
    cv_valid.append(train_df.iloc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    #print('done training')
    
    cv_preds.append(models[-1].predict(cv_valid[f][fl], num_iteration=models[-1].best_iteration))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)

tgts = ['low', 'medium', 'high']

print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end  - start)


0 0.508155010505
1 0.494528449743
2 0.512507285305
3 0.498951178481
4 0.512042681769
combined:  0.505236507453
125.58150362968445

Another run of the same cell:

0 0.508595209192
1 0.49459901602
2 0.512629227307
3 0.499392814962
4 0.512765831483
combined:  0.50559598398
126.87185287475586


In [22]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))

# Bag the five fold models by averaging their class probabilities.
mean_preds = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(mean_preds, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)

#df_output.to_pickle('bag-klightgbm-mgr.pkl')

df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('modeloutput-klightgbm-clf-r2.pkl', 'wb'))
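
Each fold model emits a softmax over the three classes, so both the bagged test predictions and the stacked out-of-fold rows should still be row-normalized; a quick sanity check:

In [ ]:
# Sanity check (illustrative): class probabilities should sum to ~1 per row.
assert np.allclose(df_output[['low', 'medium', 'high']].sum(axis=1), 1, atol=1e-6)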

In [23]:
# regressor

start = time.time()

models = []
cv_preds = []
df_cvpreds = []

t4_params = {
    'boosting_type': 'gbdt', 'objective': 'regression_l2', 'nthread': -1, 'silent': True,
    'num_leaves': 2**6, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['l2'],
    'max_bin': 127, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
    'min_split_gain': .01, 'min_child_weight': 1, 'min_child_samples': 20, 'scale_pos_weight': 1}

lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 1

for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest, silent=True)
    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    #print('done training')
    
    # Note: unlike the classifier above, this predicts with all trained rounds
    # (no num_iteration=best_iteration); the test-set prediction below uses it.
    cv_preds.append(models[-1].predict(cv_valid[f][fl]))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, df_cvpreds[f].prediction)))

df_cvpreds = pd.concat(df_cvpreds)


print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction)))

end = time.time()
print(end  - start)


0 0.241067085028
1 0.236874368886
2 0.241772109056
3 0.239208746768
4 0.244394229484
combined:  0.240676286602
54.50261068344116

Another run of the same cell:

0 0.241634444613
1 0.237429872346
2 0.242643161794
3 0.239873575947
4 0.244510203144
combined:  0.241230155962
55.41301894187927


In [24]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))

# Bag the five fold regressors by averaging their predictions.
mean_preds = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(mean_preds, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)


df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('modeloutput-lightgbm-reg-r2.pkl', 'wb'))
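
A quick round-trip check that the pickled artifact has the expected structure (illustrative):

In [ ]:
# Reload the regressor output saved above and inspect its shape.
out_chk, fold_chk = pickle.load(open('modeloutput-lightgbm-reg-r2.pkl', 'rb'))
print(out_chk.shape, len(fold_chk))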
