Using this just for RandomForestClassifier - there are also ExtraTrees and RandomForestRegressor variants, but they didn't help my OOB score, so they're not getting used here...
In [1]:
    
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import time
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
import lightgbm as lgbm
    
In [2]:
    
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))
    
In [3]:
    
medium_price = pd.read_pickle('fin-medium-price.pkl')
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
    
In [ ]:
    
    
In [4]:
    
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
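    
A quick, hypothetical illustration of these two features (toy numbers, not from the data): a positive predicted_price_diff means the asking price sits below what the price model expects.
In [ ]:
    
demo = pd.DataFrame({'predicted_price': [3000.0], 'price': [2400.0]})
print(np.log(demo.predicted_price) - np.log(demo.price))   # ~0.22: asking price well under the predicted price
print(np.log(demo.predicted_price) / np.log(demo.price))   # note this is the ratio of the *log* prices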
    
In [5]:
    
# fill in the NaN's.
for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
#        nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))#, nacount_test / len(test_df))
        
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)
    
    
In [6]:
    
# out-of-fold mean / std / count encodings of the target ('interest' by default) for a grouping key
class MeansProcessor:
    def __init__(self, key, outkey = None, tgt = 'interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        
        self.tgt = tgt
        
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']
        
    def fit(self, df):
        # global fallbacks, used for keys that never appear in this training fold
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()

        for k in df.groupby(self.key, sort=False):
            self.count[k[0]] = len(k[1])
            # this minimum-count threshold is effectively disabled (group sizes are always >= 1),
            # so the NaN branch never fires
            if len(k[1]) < 0:
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])
            
    def predict(self, df, nans = False):
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]
            
        df[self.outkey + '_count'] = 0
            
        for k in df.groupby(self.key, sort=False):
            # key 0 is the fillna placeholder, so those rows keep the global fallback values
            if k[0] == 0:
                continue
            
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]
        
        return df
    
    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']
# I kept the same index randomization (with a fixed seed) so I could validate this code against
# the original...
target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()
    
    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)
    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)
    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)
    return cv_train, cv_valid, cv_test
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]
#with Pool(5) as pool:
#    rv = pool.map(proc_fold, folds)
import pickle
try:
    rv = pickle.load(open('0422-model-groupfeatures_nonan.pkl', 'rb'))
except:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)
        pickle.dump(rv, open('0422-model-groupfeatures_nonan.pkl', 'wb'))
# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()
#cv_test = [r[2] for r in rv]
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
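    
A minimal sketch of what MeansProcessor produces, on made-up toy frames (demo_train, demo_valid and mp_demo are hypothetical names, not part of the pipeline): fit() learns per-key mean/std/count of the target, and predict() attaches them, falling back to the global mean/std and a count of 0 for keys it never saw.
In [ ]:
    
demo_train = pd.DataFrame({'manager_id': ['a', 'a', 'b', 'b', 'b'],
                           'interest':   [0.0, 1.0, 2.0, 2.0, 1.0]})
demo_valid = pd.DataFrame({'manager_id': ['a', 'b', 'c']})   # 'c' never appears in training
mp_demo = MeansProcessor('manager_id', 'mgr_demo')
mp_demo.fit(demo_train)
demo_valid = mp_demo.predict(demo_valid)
print(demo_valid[mp_demo.get_features()])   # 'c' keeps the global mean/std and count 0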
    
In [7]:
    
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]
    
In [8]:
    
for df in [train_df] + cv_test:
    df['price_t'] = df['price_t'].clip(0, 13000)
    df['price_per_room'] = df['price_per_room'].clip(0, 13000)
    #df['density_lin005'] = df['density_lin005'].clip(-50, 50)
    df['predicted_price_ratio'] = df['predicted_price_ratio'].clip(-50, 50)
    
In [9]:
    
train_df_normalized = train_df.copy()
cvtest_normalized = [df.copy() for df in cv_test]
train_df_normalized['listing_id_norm'] = train_df_normalized['listing_id']
for df in cvtest_normalized:
    df['listing_id_norm'] = df['listing_id']
normalized_keys = []
scaler = {}
for f in train_df.keys():
    if f[0:2] == 'f_' or f[0:3] == 'fm_':
        train_df_normalized[f] = train_df_normalized[f].clip(0, 1)
        for df in cvtest_normalized:
            df[f] = df[f].clip(0, 1)
    elif 'interest' in f or f == 'listing_id' or f == 'index':
        continue
    elif f == 'created' or train_df[f].dtype == 'O':
        train_df_normalized.drop(f, axis=1, inplace=True)
        for df in cvtest_normalized:
            df.drop(f, axis=1, inplace=True)
        continue
    else:
        #print(f, train_df[f].min(), train_df[f].max(), test_df[f].min(), test_df[f].max())
        scaler[f] = sklearn.preprocessing.StandardScaler()
        train_df_normalized[f] = scaler[f].fit_transform(train_df_normalized[f].values.reshape(-1,1))[:,0]
        for df in cvtest_normalized:
            df[f] = scaler[f].transform(df[f].values.reshape(-1,1))[:,0]
        
    normalized_keys.append(f)
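    
The pattern above is the standard leak-avoidance one: each StandardScaler is fit on the training frame only and then reused to transform the held-out copies. A tiny sketch with made-up numbers (sc, tr, te are hypothetical names):
In [ ]:
    
sc = sklearn.preprocessing.StandardScaler()
tr = np.array([[1.0], [2.0], [3.0]])
te = np.array([[10.0]])
sc.fit(tr)
print(sc.transform(te))   # scaled with the *train* mean/std; the held-out values never influence the scaler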
    
    
Models begin here
In [10]:
    
fl = normalized_keys.copy() + m_build.get_features() + m_mgr.get_features() 
#for f in ['density_exp01', 'density_exp005', 'density_lin005', 'density_gaussian001', 'density_gaussian', 'density_gaussian01', 'density_gaussian02', 'density_gaussian04']:
#    fl.remove(f)
    
#fl.append('density_gaussian02')
#fl.append('density_exp01')
fl.remove('predicted_price_ratio')
fl.remove('manager_building0_rate')
fl.remove('manager_shortdesc_rate')
fl.remove('manager_0feature_rate')
#fl.append('manager_sort_count')
    
In [11]:
    
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint
    
In [12]:
    
# RandomForestClassifier
start = time.time()
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])
models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    models.append(sklearn.ensemble.RandomForestClassifier(n_estimators=10, min_samples_split=8, min_samples_leaf=4, 
                                                          n_jobs=-1, class_weight=None, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
        preds = models[-1].predict_proba(cv_valid[f][fl].values)
        score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
        
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    #models[-1].set_params(n_estimators = best[1])
    
    #print('done training')
    
    cv_preds.append(models[-1].predict_proba(cv_valid[f][fl].values))
    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))
df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))
end = time.time()
print(end  - start)
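    
The loop above relies on scikit-learn's warm_start behaviour: raising n_estimators and calling fit again only grows the forest, it does not retrain the trees already built. A minimal sketch on synthetic data (X_demo, y_demo and rf_demo are hypothetical names):
In [ ]:
    
from sklearn.datasets import make_classification
X_demo, y_demo = make_classification(n_samples=200, random_state=0)
rf_demo = sklearn.ensemble.RandomForestClassifier(n_estimators=10, warm_start=True, random_state=0)
rf_demo.fit(X_demo, y_demo)
rf_demo.set_params(n_estimators=20)   # request 10 more trees; the first 10 are kept as-is
rf_demo.fit(X_demo, y_demo)
print(len(rf_demo.estimators_))       # 20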
    
    
In [15]:
    
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict_proba(cvtest_normalized[i][fl]))
    
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)
    
In [16]:
    
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('model-output-rf.pkl', 'wb'))
    
In [17]:
    
# RandomForestRegressor
start = time.time()
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])
models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    models.append(sklearn.ensemble.RandomForestRegressor(n_estimators=10, min_samples_split=8, min_samples_leaf=4, 
                                                          n_jobs=-1, random_state=0))
    # fit on the same 'interest' target that the warm_start loop and the RMSE evaluation below use
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
    
        preds = models[-1].predict(cv_valid[f][fl].values)
        #score = sklearn.metrics.log_loss(cv_valid[f].interest, preds)
        score = np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds))
        
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    #models[-1].set_params(n_estimators = best[1])
    
    #print('done training')
    
    cv_preds.append(models[-1].predict(cv_valid[f][fl].values))
    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f) #, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds)))
df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction))) #sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))
end = time.time()
print(end  - start)
    
    
In [18]:
    
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cvtest_normalized[i][fl]))
    
In [19]:
    
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)
df_output.to_pickle('bag-model-rfr-v1.pkl')
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('bagta-model-rfr-v1.pkl', 'wb'))
    
In [23]:
    
# ExtraTreesRegressor 
start = time.time()
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])
models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    models.append(sklearn.ensemble.ExtraTreesRegressor(n_estimators=10, min_samples_split=8, min_samples_leaf=4, 
                                                          n_jobs=-1, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
    
        preds = models[-1].predict(cv_valid[f][fl].values)
        #score = sklearn.metrics.log_loss(cv_valid[f].interest, preds)
        score = np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds))
        
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    cv_preds.append(models[-1].predict(cv_valid[f][fl].values))
    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f) #, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds)))
df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction))) #sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))
end = time.time()
print(end  - start)
    
    
In [24]:
    
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cvtest_normalized[i][fl]))
    
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)
#df_output.to_pickle('0423-model-etr-v1.pkl')
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('bagta-model-etr-v1.pkl', 'wb'))
    
In [103]:
    
df_output
    
    Out[103]:
In [79]:
    
fl = ['price', 'manager_sort_level', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'density_exp005', 'predicted_price_diff', 'created_hour']
# ExtraTreesClassifier on a reduced feature list
start = time.time()
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])
models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    models.append(sklearn.ensemble.ExtraTreesClassifier(n_estimators=10, max_features=len(fl),
                                                          n_jobs=-1, class_weight=None, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
        preds = models[-1].predict_proba(cv_valid[f][fl].values)
        score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
        
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    #models[-1].set_params(n_estimators = best[1])
    
    #print('done training')
    
    cv_preds.append(models[-1].predict_proba(cv_valid[f][fl].values))
    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))
df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))
end = time.time()
print(end  - start)
    
    
    
In [ ]:
    
    
In [34]:
    
train_df['desc_xp_count'] = train_df.description.apply(lambda x: x.count('!'))
train_df['desc_xp_ratio'] = train_df.description.apply(lambda x: (x.count('!') / len(x)) if len(x) else 0)
# note: str.find returns -1 when there is no '!', so these default to -1 and len(x) + 1 respectively
train_df['desc_xp_first'] = train_df.description.apply(lambda x: x.find('!') if len(x) else 0)
train_df['desc_xp_inv_first'] = train_df.description.apply(lambda x: (len(x) - x.find('!')) if len(x) else 0)
train_df['desc_xp_inv_mult'] = train_df.desc_xp_count * train_df.desc_xp_inv_first
    
In [30]:
    
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df.loc[tr_index])
        cv_valid.append(train_df.loc[val_index])
    
In [74]:
    
t4_params = {
    'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
    'num_leaves': 2**5, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['multi_logloss'],
    'max_bin': 255, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
    'min_split_gain': 0.25, 'min_child_weight': .5, 'min_child_samples': 20, 'scale_pos_weight': 1}
lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 3
    
In [78]:
    
fl = ['price', 'manager_sort_level', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'density_exp005', 'predicted_price_diff', 'created_hour']
start = time.time()
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])
models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    #print('done training')
    
    cv_preds.append(models[-1].predict(cv_valid[f][fl]))
    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))
df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))
end = time.time()
print(end  - start)
    
    
In [77]:
    
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))
    
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)
df_output.to_pickle('0423-minimod-lgbm-lf-v1.pkl')
    
In [23]:
    
corrs = []
for k in fl:
    corrs.append((k, train_df[k].corr(train_df.interest)))
    
In [24]:
    
sorted(corrs, key=operator.itemgetter(1))
    
    Out[24]:
In [25]:
    
pd.Series.corr?
    
In [38]:
    
train_df[~(train_df.f_doorman == 1)].price.mean()
    
    Out[38]: