This notebook estimates the price for a rental if it has medium interest, the idea being to create a framing effect for the model.


In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle

import sklearn.cluster

import Levenshtein

from multiprocessing import Pool

In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')

features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))

In [3]:
adams = pd.read_pickle('features-adams.pkl')

train_df = pd.merge(train_df, adams, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, adams, left_on='listing_id', right_index=True)

In [4]:
# For *this* models purposes, clip the prices to increase overall accuracy...
# there are a few >200000 entries with low interest.  This is renthop, not buyhop.

train_df.price = train_df.price.clip(0, 13000)

In [5]:
class MeansProcessor:
    def __init__(self, key, outkey = None, tgt = 'interest_cat'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = 0
        
        self.tgt = tgt
        
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']
        
    def fit(self, df):
        self.global_means = df[self.tgt].mean()
            
        for k in df.groupby(self.key, sort=False):
            
            self.count[k[0]] = len(k[1])

            if len(k[1]) < 0:
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])
            
    def predict(self, df):
        for l in self.outkeys:
            df[l] = np.nan # self.global_means[l]
            
        df[self.outkey + '_count'] = 0
            
        for k in df.groupby(self.key, sort=False):
            if k[0] == 0:
                continue
            
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]
        
        return df
    
    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']

# i kept the same index randomization (with fixed seed) so I could validate this code against
# the original...

target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()
    
    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

#with Pool(5) as pool:
#    rv = pool.map(proc_fold, folds)

import pickle

try:
    rv = pickle.load(open('0420-model-groupfeatures.pkl', 'rb'))
except:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

        pickle.dump(rv, open('0420-model-groupfeatures.pkl', 'wb'))

# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')

group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

cv_allvalid = pd.concat([r[1] for r in rv])

train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)

In [6]:
adams_features = ['num_rot15_X', 'num_rot15_Y', 'num_rot30_X', 'num_rot30_Y', 'num_rot45_X', 'num_rot45_Y', 'num_rot60_X', 'num_rot60_Y', 'num_rho', 'num_phi', 'num_cap_share', 'num_nr_of_lines', 'num_redacted', 'num_email', 'num_phone_nr']

In [7]:
#fl = features_to_use + m_build.get_features() + m_mgr.get_features() + m_comb.get_features() + tfidf_fn
fl = features_to_use.copy()  + group_features + adams_features.copy()

fl.remove('price')
fl.remove('price_t')
fl.remove('price_per_room')

fl.append('density_exp01')

In [8]:
for f in fl:
    if 'price' in f:
        print(f)

In [9]:
def run_to_stackdf(run):
    df_testpreds = pd.DataFrame(run[2].mean(axis=0))
    df_testpreds.columns = ['level']
    df_testpreds['listing_id'] = cv_test[0].listing_id
    df_allpreds = pd.concat([run[1][['level', 'listing_id']], df_testpreds])

    df_allpreds.sort_values('listing_id', inplace=True)
    df_allpreds.set_index('listing_id', inplace=True)

    return df_allpreds

In [10]:
def runXGB1(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    param = {}
    param['objective'] = 'reg:linear'
    #param['tree_method'] = 'hist'
    param['eta'] = 0.02
    param['max_depth'] = 8
    param['silent'] = 1
    param['num_class'] = 1
    param['eval_metric'] = "rmse"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    param['base_score'] = train_y.mean()
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=10)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    return pred_test_y, model

In [11]:
def run_cv1(train_df, cv_test, kf, features_to_use):
    
    cv_preds = []
    cv_scores = []
    models = []
    test_preds = []
    
    fold = 0

    for dev_index, val_index in kf.split(train_df, train_df.interest_cat):

        cv_cur_train = train_df.loc[dev_index]
        cv_cur_train = cv_cur_train[cv_cur_train.interest_cat == 1]
        
        cv_cur_valid = train_df.loc[val_index]
        cv_cur_valid_cut = cv_cur_valid[cv_cur_valid.interest_cat == 1]
        
        dev_X, val_X = cv_cur_train[features_to_use], cv_cur_valid_cut[features_to_use]
        dev_y, val_y = cv_cur_train['price'], cv_cur_valid_cut['price']

        #dev_X, val_X = cv_cur_train[features_to_use], cv_cur_valid[features_to_use]
        #dev_y, val_y = cv_cur_train['price'], cv_cur_valid['price']
        
        preds, model = runXGB1(dev_X, dev_y, val_X, val_y)
        models.append(model)

        cv_scores.append(model.best_score)
        print(cv_scores)

        
        fullpreds = model.predict(xgb.DMatrix(cv_cur_valid[features_to_use]), ntree_limit=model.best_ntree_limit)
        
        cut_df = train_df.loc[val_index]
        
        out_df = pd.DataFrame(fullpreds)
        out_df.columns = ["predicted_price"]
        out_df["listing_id"] = cut_df.listing_id.values
        out_df['interest_tgt'] = cut_df.interest.values
        out_df['interest_cat'] = cut_df.interest_cat.values
        out_df['price'] = cut_df.price.values

        cv_preds.append(out_df)

        xgtest = xgb.DMatrix(cv_test[fold][features_to_use])
        test_preds.append(model.predict(xgtest, ntree_limit=model.best_ntree_limit))

    df_cv = pd.concat(cv_preds)
    apreds = np.array(test_preds)
    
    return models, df_cv, apreds

In [12]:
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
rv1 = run_cv1(train_df, cv_test, kfold, fl)


[0]	train-rmse:1212.93	test-rmse:1284.48
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[10]	train-rmse:1049.77	test-rmse:1125.45
[20]	train-rmse:911.424	test-rmse:994.703
[30]	train-rmse:794.478	test-rmse:883.538
[40]	train-rmse:706.644	test-rmse:805.849
[50]	train-rmse:630.405	test-rmse:740.14
[60]	train-rmse:564.437	test-rmse:684.354
[70]	train-rmse:511.564	test-rmse:642.181
[80]	train-rmse:471.172	test-rmse:611.073
[90]	train-rmse:436.284	test-rmse:585.844
[100]	train-rmse:407.465	test-rmse:566.492
[110]	train-rmse:382.323	test-rmse:550.549
[120]	train-rmse:360.514	test-rmse:537.608
[130]	train-rmse:342.527	test-rmse:527.76
[140]	train-rmse:326.854	test-rmse:519.952
[150]	train-rmse:313.089	test-rmse:512.856
[160]	train-rmse:301.006	test-rmse:507.323
[170]	train-rmse:290.959	test-rmse:503.4
[180]	train-rmse:281.665	test-rmse:499.999
[190]	train-rmse:273.405	test-rmse:496.912
[200]	train-rmse:265.878	test-rmse:494.367
[210]	train-rmse:258.714	test-rmse:492.188
[220]	train-rmse:252.07	test-rmse:489.898
[230]	train-rmse:246.052	test-rmse:487.953
[240]	train-rmse:240.483	test-rmse:486.451
[250]	train-rmse:235.496	test-rmse:485.126
[260]	train-rmse:230.766	test-rmse:483.992
[270]	train-rmse:225.955	test-rmse:482.899
[280]	train-rmse:221.885	test-rmse:481.829
[290]	train-rmse:217.714	test-rmse:480.78
[300]	train-rmse:213.451	test-rmse:480.117
[310]	train-rmse:210.11	test-rmse:479.256
[320]	train-rmse:206.937	test-rmse:478.613
[330]	train-rmse:203.875	test-rmse:478.103
[340]	train-rmse:200.885	test-rmse:477.644
[350]	train-rmse:197.865	test-rmse:476.939
[360]	train-rmse:195.659	test-rmse:476.45
[370]	train-rmse:192.749	test-rmse:475.982
[380]	train-rmse:189.889	test-rmse:475.479
[390]	train-rmse:187.173	test-rmse:474.748
[400]	train-rmse:184.613	test-rmse:474.401
[410]	train-rmse:182.049	test-rmse:474.067
[420]	train-rmse:179.611	test-rmse:473.682
[430]	train-rmse:177.023	test-rmse:473.521
[440]	train-rmse:174.473	test-rmse:473.085
[450]	train-rmse:172.611	test-rmse:472.798
[460]	train-rmse:170.264	test-rmse:472.474
[470]	train-rmse:168.578	test-rmse:472.4
[480]	train-rmse:166.326	test-rmse:472.234
[490]	train-rmse:164.261	test-rmse:472.032
[500]	train-rmse:162.143	test-rmse:471.87
[510]	train-rmse:159.947	test-rmse:471.558
[520]	train-rmse:157.759	test-rmse:471.297
[530]	train-rmse:155.991	test-rmse:471.269
[540]	train-rmse:153.85	test-rmse:471.051
[550]	train-rmse:152.009	test-rmse:470.821
[560]	train-rmse:150.151	test-rmse:470.538
[570]	train-rmse:148.301	test-rmse:470.33
[580]	train-rmse:146.714	test-rmse:470.225
[590]	train-rmse:144.963	test-rmse:470.102
[600]	train-rmse:142.998	test-rmse:469.846
[610]	train-rmse:141.31	test-rmse:469.784
[620]	train-rmse:139.551	test-rmse:469.581
[630]	train-rmse:137.977	test-rmse:469.461
[640]	train-rmse:136.258	test-rmse:469.353
[650]	train-rmse:134.67	test-rmse:469.292
[660]	train-rmse:132.928	test-rmse:469.259
[670]	train-rmse:131.217	test-rmse:469.286
[680]	train-rmse:129.587	test-rmse:469.089
[690]	train-rmse:127.75	test-rmse:468.913
[700]	train-rmse:126.387	test-rmse:468.725
[710]	train-rmse:125	test-rmse:468.638
[720]	train-rmse:123.36	test-rmse:468.427
[730]	train-rmse:121.818	test-rmse:468.403
[740]	train-rmse:120.344	test-rmse:468.325
[750]	train-rmse:118.803	test-rmse:468.38
[760]	train-rmse:117.343	test-rmse:468.25
[770]	train-rmse:115.862	test-rmse:468.132
[780]	train-rmse:114.341	test-rmse:467.985
[790]	train-rmse:112.857	test-rmse:467.913
[800]	train-rmse:111.507	test-rmse:467.783
[810]	train-rmse:109.985	test-rmse:467.677
[820]	train-rmse:108.912	test-rmse:467.627
[830]	train-rmse:107.696	test-rmse:467.636
[840]	train-rmse:106.364	test-rmse:467.538
[850]	train-rmse:105.276	test-rmse:467.441
[860]	train-rmse:103.894	test-rmse:467.382
[870]	train-rmse:102.86	test-rmse:467.327
[880]	train-rmse:101.787	test-rmse:467.276
[890]	train-rmse:100.611	test-rmse:467.204
[900]	train-rmse:99.6646	test-rmse:467.155
[910]	train-rmse:98.4534	test-rmse:467.105
[920]	train-rmse:97.2788	test-rmse:466.989
[930]	train-rmse:96.1336	test-rmse:466.898
[940]	train-rmse:95.2843	test-rmse:466.82
[950]	train-rmse:94.3913	test-rmse:466.761
[960]	train-rmse:93.4498	test-rmse:466.764
[970]	train-rmse:92.5665	test-rmse:466.757
[980]	train-rmse:91.5703	test-rmse:466.746
[990]	train-rmse:90.3517	test-rmse:466.663
[1000]	train-rmse:89.3921	test-rmse:466.652
[1010]	train-rmse:88.4737	test-rmse:466.578
[1020]	train-rmse:87.4526	test-rmse:466.579
[1030]	train-rmse:86.3765	test-rmse:466.468
[1040]	train-rmse:85.3393	test-rmse:466.409
[1050]	train-rmse:84.3778	test-rmse:466.273
[1060]	train-rmse:83.3058	test-rmse:466.221
[1070]	train-rmse:82.3818	test-rmse:466.15
[1080]	train-rmse:81.4326	test-rmse:466.096
[1090]	train-rmse:80.5588	test-rmse:466.027
[1100]	train-rmse:79.6888	test-rmse:466.015
[1110]	train-rmse:78.8266	test-rmse:465.978
[1120]	train-rmse:77.7994	test-rmse:465.917
[1130]	train-rmse:76.9771	test-rmse:465.87
[1140]	train-rmse:76.2434	test-rmse:465.764
[1150]	train-rmse:75.3945	test-rmse:465.775
[1160]	train-rmse:74.5431	test-rmse:465.783
[1170]	train-rmse:73.8033	test-rmse:465.716
[1180]	train-rmse:72.9307	test-rmse:465.633
[1190]	train-rmse:71.9914	test-rmse:465.656
[1200]	train-rmse:71.409	test-rmse:465.651
[1210]	train-rmse:70.7395	test-rmse:465.633
[1220]	train-rmse:70.0794	test-rmse:465.632
[1230]	train-rmse:69.1972	test-rmse:465.599
[1240]	train-rmse:68.4339	test-rmse:465.526
[1250]	train-rmse:67.7091	test-rmse:465.476
[1260]	train-rmse:66.9075	test-rmse:465.442
[1270]	train-rmse:66.2171	test-rmse:465.418
[1280]	train-rmse:65.4888	test-rmse:465.374
[1290]	train-rmse:64.725	test-rmse:465.401
[1300]	train-rmse:64.0736	test-rmse:465.368
[1310]	train-rmse:63.4649	test-rmse:465.377
[1320]	train-rmse:62.742	test-rmse:465.319
[1330]	train-rmse:62.1236	test-rmse:465.308
[1340]	train-rmse:61.3875	test-rmse:465.31
[1350]	train-rmse:60.6235	test-rmse:465.33
[1360]	train-rmse:59.8336	test-rmse:465.3
[1370]	train-rmse:59.1582	test-rmse:465.289
[1380]	train-rmse:58.5741	test-rmse:465.293
[1390]	train-rmse:57.9464	test-rmse:465.304
[1400]	train-rmse:57.3005	test-rmse:465.241
[1410]	train-rmse:56.7383	test-rmse:465.211
[1420]	train-rmse:56.144	test-rmse:465.188
[1430]	train-rmse:55.5382	test-rmse:465.167
[1440]	train-rmse:54.8719	test-rmse:465.135
[1450]	train-rmse:54.3091	test-rmse:465.141
[1460]	train-rmse:53.7604	test-rmse:465.131
[1470]	train-rmse:53.1343	test-rmse:465.09
[1480]	train-rmse:52.6026	test-rmse:465.107
[1490]	train-rmse:52.1874	test-rmse:465.115
[1500]	train-rmse:51.7098	test-rmse:465.064
[1510]	train-rmse:51.1618	test-rmse:465.042
[1520]	train-rmse:50.6458	test-rmse:465.024
[1530]	train-rmse:50.0854	test-rmse:465.006
[1540]	train-rmse:49.5298	test-rmse:464.971
[1550]	train-rmse:48.9371	test-rmse:464.935
[1560]	train-rmse:48.4611	test-rmse:464.9
[1570]	train-rmse:47.9616	test-rmse:464.878
[1580]	train-rmse:47.519	test-rmse:464.861
[1590]	train-rmse:46.9734	test-rmse:464.837
[1600]	train-rmse:46.409	test-rmse:464.809
[1610]	train-rmse:45.8874	test-rmse:464.797
[1620]	train-rmse:45.3582	test-rmse:464.817
[1630]	train-rmse:44.7546	test-rmse:464.83
[1640]	train-rmse:44.3207	test-rmse:464.839
[1650]	train-rmse:43.8502	test-rmse:464.815
Stopping. Best iteration:
[1603]	train-rmse:46.2534	test-rmse:464.796

[464.795746]
[0]	train-rmse:1237.01	test-rmse:1146.2
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[10]	train-rmse:1068.13	test-rmse:992.926
[20]	train-rmse:933.712	test-rmse:874.529
[30]	train-rmse:824.394	test-rmse:781.886
[40]	train-rmse:730.681	test-rmse:704.172
[50]	train-rmse:654.067	test-rmse:643.373
[60]	train-rmse:587.63	test-rmse:592.94
[70]	train-rmse:531.987	test-rmse:550.819
[80]	train-rmse:486.08	test-rmse:519.301
[90]	train-rmse:446.548	test-rmse:493.378
[100]	train-rmse:413.814	test-rmse:473.403
[110]	train-rmse:388.594	test-rmse:459.43
[120]	train-rmse:367.828	test-rmse:449.636
[130]	train-rmse:346.385	test-rmse:439.395
[140]	train-rmse:329.425	test-rmse:431.8
[150]	train-rmse:314.709	test-rmse:425.997
[160]	train-rmse:302.268	test-rmse:421.833
[170]	train-rmse:292.01	test-rmse:418.381
[180]	train-rmse:282.463	test-rmse:415.783
[190]	train-rmse:273.646	test-rmse:413.709
[200]	train-rmse:265.47	test-rmse:411.362
[210]	train-rmse:258.438	test-rmse:409.82
[220]	train-rmse:252.589	test-rmse:408.489
[230]	train-rmse:246.46	test-rmse:407.027
[240]	train-rmse:241.297	test-rmse:405.974
[250]	train-rmse:236.015	test-rmse:404.808
[260]	train-rmse:231.127	test-rmse:404.137
[270]	train-rmse:227.132	test-rmse:403.444
[280]	train-rmse:222.78	test-rmse:402.89
[290]	train-rmse:218.049	test-rmse:401.918
[300]	train-rmse:214.15	test-rmse:401.214
[310]	train-rmse:211.02	test-rmse:400.669
[320]	train-rmse:207.594	test-rmse:400.093
[330]	train-rmse:204.552	test-rmse:399.802
[340]	train-rmse:201.643	test-rmse:399.391
[350]	train-rmse:198.702	test-rmse:398.915
[360]	train-rmse:195.811	test-rmse:398.711
[370]	train-rmse:193.067	test-rmse:398.591
[380]	train-rmse:190.674	test-rmse:398.529
[390]	train-rmse:187.925	test-rmse:398.318
[400]	train-rmse:185.622	test-rmse:398.008
[410]	train-rmse:182.858	test-rmse:397.671
[420]	train-rmse:180.893	test-rmse:397.428
[430]	train-rmse:178.572	test-rmse:397.124
[440]	train-rmse:176.174	test-rmse:396.713
[450]	train-rmse:173.859	test-rmse:396.59
[460]	train-rmse:171.666	test-rmse:396.432
[470]	train-rmse:169.594	test-rmse:396.245
[480]	train-rmse:167.42	test-rmse:395.973
[490]	train-rmse:165.251	test-rmse:395.782
[500]	train-rmse:163.03	test-rmse:395.669
[510]	train-rmse:161.007	test-rmse:395.447
[520]	train-rmse:158.883	test-rmse:395.171
[530]	train-rmse:157.035	test-rmse:394.981
[540]	train-rmse:154.719	test-rmse:394.715
[550]	train-rmse:152.908	test-rmse:394.713
[560]	train-rmse:151.108	test-rmse:394.597
[570]	train-rmse:149.295	test-rmse:394.359
[580]	train-rmse:147.273	test-rmse:394.187
[590]	train-rmse:145.112	test-rmse:394.027
[600]	train-rmse:143.554	test-rmse:393.88
[610]	train-rmse:141.89	test-rmse:393.775
[620]	train-rmse:140.23	test-rmse:393.645
[630]	train-rmse:138.633	test-rmse:393.565
[640]	train-rmse:136.719	test-rmse:393.437
[650]	train-rmse:134.889	test-rmse:393.51
[660]	train-rmse:133.25	test-rmse:393.311
[670]	train-rmse:131.687	test-rmse:393.178
[680]	train-rmse:130.175	test-rmse:393.106
[690]	train-rmse:128.671	test-rmse:392.922
[700]	train-rmse:127.314	test-rmse:392.771
[710]	train-rmse:125.788	test-rmse:392.662
[720]	train-rmse:124.571	test-rmse:392.651
[730]	train-rmse:123.151	test-rmse:392.523
[740]	train-rmse:121.896	test-rmse:392.518
[750]	train-rmse:120.378	test-rmse:392.393
[760]	train-rmse:118.869	test-rmse:392.3
[770]	train-rmse:117.209	test-rmse:392.109
[780]	train-rmse:115.821	test-rmse:392.191
[790]	train-rmse:114.611	test-rmse:392.167
[800]	train-rmse:113.309	test-rmse:392.189
[810]	train-rmse:111.852	test-rmse:392.138
[820]	train-rmse:110.79	test-rmse:392.147
Stopping. Best iteration:
[770]	train-rmse:117.209	test-rmse:392.109

[464.795746, 392.108948]
[0]	train-rmse:1212.04	test-rmse:1289.94
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[10]	train-rmse:1041.8	test-rmse:1126.6
[20]	train-rmse:912.064	test-rmse:1005.81
[30]	train-rmse:795.723	test-rmse:900.535
[40]	train-rmse:706.441	test-rmse:823.106
[50]	train-rmse:627.529	test-rmse:753.767
[60]	train-rmse:562.054	test-rmse:696.96
[70]	train-rmse:512.101	test-rmse:654.836
[80]	train-rmse:467.12	test-rmse:618.38
[90]	train-rmse:433.71	test-rmse:594.179
[100]	train-rmse:405.365	test-rmse:573.246
[110]	train-rmse:379.203	test-rmse:555.406
[120]	train-rmse:357.135	test-rmse:540.524
[130]	train-rmse:338.606	test-rmse:529.749
[140]	train-rmse:322.478	test-rmse:520.017
[150]	train-rmse:308.983	test-rmse:513.205
[160]	train-rmse:297.055	test-rmse:506.343
[170]	train-rmse:286.38	test-rmse:501.143
[180]	train-rmse:276.825	test-rmse:496.251
[190]	train-rmse:268.915	test-rmse:492.433
[200]	train-rmse:261.31	test-rmse:489.015
[210]	train-rmse:254.541	test-rmse:486.237
[220]	train-rmse:248.624	test-rmse:483.912
[230]	train-rmse:242.648	test-rmse:481.902
[240]	train-rmse:237.697	test-rmse:480.039
[250]	train-rmse:232.554	test-rmse:478.191
[260]	train-rmse:228.057	test-rmse:477.008
[270]	train-rmse:223.167	test-rmse:476.036
[280]	train-rmse:219.121	test-rmse:475.127
[290]	train-rmse:215.548	test-rmse:474.218
[300]	train-rmse:211.998	test-rmse:473.136
[310]	train-rmse:208.738	test-rmse:472.11
[320]	train-rmse:205.516	test-rmse:471.27
[330]	train-rmse:202.486	test-rmse:470.651
[340]	train-rmse:199.538	test-rmse:470.155
[350]	train-rmse:196.579	test-rmse:469.814
[360]	train-rmse:193.901	test-rmse:469.214
[370]	train-rmse:190.537	test-rmse:468.747
[380]	train-rmse:187.803	test-rmse:468.227
[390]	train-rmse:185.32	test-rmse:467.685
[400]	train-rmse:182.889	test-rmse:467.428
[410]	train-rmse:180.257	test-rmse:467.048
[420]	train-rmse:178.505	test-rmse:466.683
[430]	train-rmse:176.469	test-rmse:466.431
[440]	train-rmse:174.332	test-rmse:466.011
[450]	train-rmse:171.96	test-rmse:465.594
[460]	train-rmse:169.416	test-rmse:465.122
[470]	train-rmse:167.188	test-rmse:464.708
[480]	train-rmse:165.027	test-rmse:464.232
[490]	train-rmse:162.83	test-rmse:464.02
[500]	train-rmse:160.857	test-rmse:463.971
[510]	train-rmse:158.688	test-rmse:463.819
[520]	train-rmse:156.799	test-rmse:463.301
[530]	train-rmse:154.985	test-rmse:463.168
[540]	train-rmse:152.978	test-rmse:462.857
[550]	train-rmse:151.075	test-rmse:462.724
[560]	train-rmse:149.183	test-rmse:462.58
[570]	train-rmse:147.269	test-rmse:462.393
[580]	train-rmse:145.194	test-rmse:462.239
[590]	train-rmse:143.882	test-rmse:462.151
[600]	train-rmse:142.306	test-rmse:461.968
[610]	train-rmse:140.302	test-rmse:461.888
[620]	train-rmse:138.662	test-rmse:461.77
[630]	train-rmse:136.823	test-rmse:461.601
[640]	train-rmse:135.042	test-rmse:461.434
[650]	train-rmse:133.409	test-rmse:461.216
[660]	train-rmse:131.607	test-rmse:460.987
[670]	train-rmse:130.344	test-rmse:460.842
[680]	train-rmse:128.94	test-rmse:460.695
[690]	train-rmse:127.308	test-rmse:460.596
[700]	train-rmse:125.411	test-rmse:460.419
[710]	train-rmse:123.883	test-rmse:460.195
[720]	train-rmse:122.61	test-rmse:460.038
[730]	train-rmse:121.047	test-rmse:459.935
[740]	train-rmse:119.678	test-rmse:459.813
[750]	train-rmse:117.878	test-rmse:459.66
[760]	train-rmse:116.53	test-rmse:459.727
[770]	train-rmse:115.002	test-rmse:459.589
[780]	train-rmse:113.587	test-rmse:459.449
[790]	train-rmse:112.201	test-rmse:459.396
[800]	train-rmse:111.124	test-rmse:459.241
[810]	train-rmse:109.613	test-rmse:459.129
[820]	train-rmse:108.173	test-rmse:459.011
[830]	train-rmse:106.93	test-rmse:459.004
[840]	train-rmse:105.826	test-rmse:458.882
[850]	train-rmse:104.332	test-rmse:458.924
[860]	train-rmse:102.87	test-rmse:458.885
[870]	train-rmse:101.471	test-rmse:458.845
[880]	train-rmse:100.32	test-rmse:458.794
[890]	train-rmse:99.0372	test-rmse:458.773
[900]	train-rmse:97.8131	test-rmse:458.626
[910]	train-rmse:96.6766	test-rmse:458.641
[920]	train-rmse:95.7575	test-rmse:458.534
[930]	train-rmse:94.7713	test-rmse:458.477
[940]	train-rmse:93.6435	test-rmse:458.412
[950]	train-rmse:92.4066	test-rmse:458.306
[960]	train-rmse:91.4072	test-rmse:458.285
[970]	train-rmse:90.4039	test-rmse:458.262
[980]	train-rmse:89.1536	test-rmse:458.219
[990]	train-rmse:88.0537	test-rmse:458.141
[1000]	train-rmse:86.9478	test-rmse:458.141
[1010]	train-rmse:86.2058	test-rmse:458.188
[1020]	train-rmse:85.2434	test-rmse:458.191
[1030]	train-rmse:84.2884	test-rmse:458.201
[1040]	train-rmse:83.3494	test-rmse:458.237
Stopping. Best iteration:
[997]	train-rmse:87.3053	test-rmse:458.096

[464.795746, 392.108948, 458.096497]
[0]	train-rmse:1238.57	test-rmse:1179.92
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[10]	train-rmse:1075.45	test-rmse:1027.16
[20]	train-rmse:938.979	test-rmse:900.595
[30]	train-rmse:831.088	test-rmse:804.363
[40]	train-rmse:732.245	test-rmse:716.377
[50]	train-rmse:650.773	test-rmse:646.362
[60]	train-rmse:587.397	test-rmse:595.702
[70]	train-rmse:532.547	test-rmse:553.201
[80]	train-rmse:487.583	test-rmse:521.599
[90]	train-rmse:448.307	test-rmse:494.251
[100]	train-rmse:417.527	test-rmse:475.189
[110]	train-rmse:390.273	test-rmse:459.547
[120]	train-rmse:368.21	test-rmse:448.511
[130]	train-rmse:348.699	test-rmse:439.712
[140]	train-rmse:331.469	test-rmse:431.826
[150]	train-rmse:317.492	test-rmse:426.785
[160]	train-rmse:304.68	test-rmse:422.445
[170]	train-rmse:293.965	test-rmse:419.752
[180]	train-rmse:283.908	test-rmse:416.361
[190]	train-rmse:275.322	test-rmse:414.181
[200]	train-rmse:267.821	test-rmse:412.386
[210]	train-rmse:260.467	test-rmse:410.439
[220]	train-rmse:253.779	test-rmse:408.52
[230]	train-rmse:247.822	test-rmse:407.214
[240]	train-rmse:241.97	test-rmse:406.091
[250]	train-rmse:236.602	test-rmse:404.897
[260]	train-rmse:231.435	test-rmse:403.895
[270]	train-rmse:227.214	test-rmse:403.083
[280]	train-rmse:223.535	test-rmse:402.68
[290]	train-rmse:219.045	test-rmse:402.272
[300]	train-rmse:215.61	test-rmse:401.424
[310]	train-rmse:211.989	test-rmse:400.947
[320]	train-rmse:208.382	test-rmse:400.468
[330]	train-rmse:205.464	test-rmse:400.038
[340]	train-rmse:202.351	test-rmse:399.642
[350]	train-rmse:199.518	test-rmse:399.195
[360]	train-rmse:196.593	test-rmse:398.637
[370]	train-rmse:194.287	test-rmse:398.287
[380]	train-rmse:191.583	test-rmse:397.902
[390]	train-rmse:189.025	test-rmse:397.363
[400]	train-rmse:186.675	test-rmse:397.095
[410]	train-rmse:183.831	test-rmse:396.768
[420]	train-rmse:181.08	test-rmse:396.477
[430]	train-rmse:178.599	test-rmse:396.054
[440]	train-rmse:175.68	test-rmse:395.61
[450]	train-rmse:173.143	test-rmse:395.41
[460]	train-rmse:170.627	test-rmse:395.111
[470]	train-rmse:168.225	test-rmse:394.755
[480]	train-rmse:166.275	test-rmse:394.679
[490]	train-rmse:164.061	test-rmse:394.534
[500]	train-rmse:161.746	test-rmse:394.246
[510]	train-rmse:159.963	test-rmse:393.963
[520]	train-rmse:158.216	test-rmse:393.645
[530]	train-rmse:156.075	test-rmse:393.599
[540]	train-rmse:153.953	test-rmse:393.36
[550]	train-rmse:152.068	test-rmse:393.327
[560]	train-rmse:150.12	test-rmse:393.121
[570]	train-rmse:148.312	test-rmse:393.027
[580]	train-rmse:146.244	test-rmse:392.708
[590]	train-rmse:144.414	test-rmse:392.558
[600]	train-rmse:142.793	test-rmse:392.346
[610]	train-rmse:141.159	test-rmse:392.221
[620]	train-rmse:139.573	test-rmse:392.242
[630]	train-rmse:137.801	test-rmse:391.949
[640]	train-rmse:136.481	test-rmse:391.755
[650]	train-rmse:135.016	test-rmse:391.716
[660]	train-rmse:133.591	test-rmse:391.653
[670]	train-rmse:132.253	test-rmse:391.595
[680]	train-rmse:130.581	test-rmse:391.45
[690]	train-rmse:129.122	test-rmse:391.377
[700]	train-rmse:127.406	test-rmse:391.231
[710]	train-rmse:125.843	test-rmse:391.197
[720]	train-rmse:124.169	test-rmse:391.076
[730]	train-rmse:122.812	test-rmse:390.916
[740]	train-rmse:121.445	test-rmse:390.738
[750]	train-rmse:120.218	test-rmse:390.629
[760]	train-rmse:118.785	test-rmse:390.472
[770]	train-rmse:117.515	test-rmse:390.392
[780]	train-rmse:116.078	test-rmse:390.263
[790]	train-rmse:114.58	test-rmse:390.244
[800]	train-rmse:113.371	test-rmse:390.223
[810]	train-rmse:112.093	test-rmse:390.055
[820]	train-rmse:111.031	test-rmse:389.944
[830]	train-rmse:109.83	test-rmse:389.873
[840]	train-rmse:108.553	test-rmse:389.81
[850]	train-rmse:107.387	test-rmse:389.7
[860]	train-rmse:106.009	test-rmse:389.608
[870]	train-rmse:104.726	test-rmse:389.643
[880]	train-rmse:103.416	test-rmse:389.538
[890]	train-rmse:102.122	test-rmse:389.445
[900]	train-rmse:101.149	test-rmse:389.32
[910]	train-rmse:100.118	test-rmse:389.291
[920]	train-rmse:98.8148	test-rmse:389.212
[930]	train-rmse:97.8013	test-rmse:389.174
[940]	train-rmse:96.5796	test-rmse:389.163
[950]	train-rmse:95.4383	test-rmse:389.065
[960]	train-rmse:94.0688	test-rmse:389.046
[970]	train-rmse:92.9625	test-rmse:388.943
[980]	train-rmse:91.9494	test-rmse:388.885
[990]	train-rmse:90.7649	test-rmse:388.783
[1000]	train-rmse:89.6393	test-rmse:388.686
[1010]	train-rmse:88.5355	test-rmse:388.64
[1020]	train-rmse:87.5531	test-rmse:388.55
[1030]	train-rmse:86.566	test-rmse:388.497
[1040]	train-rmse:85.6903	test-rmse:388.553
[1050]	train-rmse:84.956	test-rmse:388.543
[1060]	train-rmse:84.0227	test-rmse:388.563
[1070]	train-rmse:83.2278	test-rmse:388.513
[1080]	train-rmse:82.3431	test-rmse:388.376
[1090]	train-rmse:81.4568	test-rmse:388.325
[1100]	train-rmse:80.7043	test-rmse:388.274
[1110]	train-rmse:79.8138	test-rmse:388.274
[1120]	train-rmse:78.883	test-rmse:388.24
[1130]	train-rmse:78.048	test-rmse:388.184
[1140]	train-rmse:77.1145	test-rmse:388.141
[1150]	train-rmse:76.1787	test-rmse:388.127
[1160]	train-rmse:75.2773	test-rmse:388.061
[1170]	train-rmse:74.4183	test-rmse:388.007
[1180]	train-rmse:73.5512	test-rmse:387.967
[1190]	train-rmse:72.8744	test-rmse:387.946
[1200]	train-rmse:71.9695	test-rmse:387.943
[1210]	train-rmse:71.1813	test-rmse:387.921
[1220]	train-rmse:70.3811	test-rmse:387.921
[1230]	train-rmse:69.446	test-rmse:387.914
[1240]	train-rmse:68.742	test-rmse:387.871
[1250]	train-rmse:67.9126	test-rmse:387.875
[1260]	train-rmse:67.1885	test-rmse:387.866
[1270]	train-rmse:66.3248	test-rmse:387.797
[1280]	train-rmse:65.5527	test-rmse:387.82
[1290]	train-rmse:64.9109	test-rmse:387.835
[1300]	train-rmse:64.2182	test-rmse:387.818
[1310]	train-rmse:63.5509	test-rmse:387.816
[1320]	train-rmse:62.8854	test-rmse:387.833
Stopping. Best iteration:
[1271]	train-rmse:66.2682	test-rmse:387.789

[464.795746, 392.108948, 458.096497, 387.788696]
[0]	train-rmse:1221.15	test-rmse:1225.36
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[10]	train-rmse:1046.23	test-rmse:1059.1
[20]	train-rmse:909.255	test-rmse:932.358
[30]	train-rmse:796.644	test-rmse:830.71
[40]	train-rmse:709.562	test-rmse:753.73
[50]	train-rmse:631.758	test-rmse:687.52
[60]	train-rmse:566.227	test-rmse:633.227
[70]	train-rmse:513.958	test-rmse:591.905
[80]	train-rmse:470.709	test-rmse:559.334
[90]	train-rmse:435.308	test-rmse:534.852
[100]	train-rmse:405.9	test-rmse:515.801
[110]	train-rmse:381.474	test-rmse:500.603
[120]	train-rmse:360.628	test-rmse:488.473
[130]	train-rmse:342.549	test-rmse:478.527
[140]	train-rmse:327.34	test-rmse:470.917
[150]	train-rmse:313.456	test-rmse:464.625
[160]	train-rmse:301.69	test-rmse:459.043
[170]	train-rmse:290.907	test-rmse:454.031
[180]	train-rmse:281.503	test-rmse:450.305
[190]	train-rmse:273.171	test-rmse:447.128
[200]	train-rmse:265.023	test-rmse:444.16
[210]	train-rmse:258.369	test-rmse:441.286
[220]	train-rmse:252.278	test-rmse:438.798
[230]	train-rmse:246.289	test-rmse:437.148
[240]	train-rmse:241.179	test-rmse:435.617
[250]	train-rmse:236.111	test-rmse:433.828
[260]	train-rmse:231.427	test-rmse:432.544
[270]	train-rmse:226.371	test-rmse:431.304
[280]	train-rmse:222.458	test-rmse:430.494
[290]	train-rmse:218.459	test-rmse:429.391
[300]	train-rmse:214.982	test-rmse:428.777
[310]	train-rmse:211.64	test-rmse:428.008
[320]	train-rmse:208.688	test-rmse:427.205
[330]	train-rmse:204.892	test-rmse:426.691
[340]	train-rmse:201.804	test-rmse:425.988
[350]	train-rmse:198.957	test-rmse:425.368
[360]	train-rmse:195.831	test-rmse:424.881
[370]	train-rmse:193.364	test-rmse:424.409
[380]	train-rmse:190.676	test-rmse:424.043
[390]	train-rmse:187.68	test-rmse:423.325
[400]	train-rmse:185.063	test-rmse:422.635
[410]	train-rmse:182.73	test-rmse:422.312
[420]	train-rmse:180.392	test-rmse:422.018
[430]	train-rmse:178.501	test-rmse:421.73
[440]	train-rmse:176.105	test-rmse:421.321
[450]	train-rmse:174.012	test-rmse:421.074
[460]	train-rmse:171.855	test-rmse:420.698
[470]	train-rmse:169.538	test-rmse:420.376
[480]	train-rmse:167.562	test-rmse:420.128
[490]	train-rmse:165.288	test-rmse:419.785
[500]	train-rmse:163.176	test-rmse:419.514
[510]	train-rmse:160.741	test-rmse:419.561
[520]	train-rmse:158.643	test-rmse:419.32
[530]	train-rmse:156.653	test-rmse:419.002
[540]	train-rmse:154.982	test-rmse:418.918
[550]	train-rmse:153.057	test-rmse:418.648
[560]	train-rmse:151.464	test-rmse:418.518
[570]	train-rmse:149.608	test-rmse:418.41
[580]	train-rmse:147.87	test-rmse:418.283
[590]	train-rmse:145.95	test-rmse:418.035
[600]	train-rmse:143.971	test-rmse:417.908
[610]	train-rmse:142.006	test-rmse:417.822
[620]	train-rmse:140.3	test-rmse:417.718
[630]	train-rmse:138.546	test-rmse:417.638
[640]	train-rmse:136.933	test-rmse:417.53
[650]	train-rmse:135.498	test-rmse:417.477
[660]	train-rmse:134.179	test-rmse:417.365
[670]	train-rmse:132.792	test-rmse:417.185
[680]	train-rmse:131.473	test-rmse:417.176
[690]	train-rmse:130.137	test-rmse:416.94
[700]	train-rmse:128.552	test-rmse:416.828
[710]	train-rmse:126.961	test-rmse:416.728
[720]	train-rmse:125.362	test-rmse:416.553
[730]	train-rmse:123.958	test-rmse:416.396
[740]	train-rmse:122.8	test-rmse:416.248
[750]	train-rmse:121.305	test-rmse:416.163
[760]	train-rmse:119.872	test-rmse:416.037
[770]	train-rmse:118.398	test-rmse:415.873
[780]	train-rmse:116.968	test-rmse:415.819
[790]	train-rmse:115.604	test-rmse:415.65
[800]	train-rmse:114.396	test-rmse:415.593
[810]	train-rmse:113.247	test-rmse:415.53
[820]	train-rmse:111.902	test-rmse:415.36
[830]	train-rmse:110.428	test-rmse:415.241
[840]	train-rmse:109.174	test-rmse:415.21
[850]	train-rmse:107.772	test-rmse:415.058
[860]	train-rmse:106.779	test-rmse:414.987
[870]	train-rmse:105.543	test-rmse:414.898
[880]	train-rmse:104.4	test-rmse:414.798
[890]	train-rmse:103.181	test-rmse:414.685
[900]	train-rmse:101.849	test-rmse:414.546
[910]	train-rmse:100.702	test-rmse:414.414
[920]	train-rmse:99.9064	test-rmse:414.44
[930]	train-rmse:98.7296	test-rmse:414.372
[940]	train-rmse:97.7568	test-rmse:414.304
[950]	train-rmse:96.6581	test-rmse:414.18
[960]	train-rmse:95.7884	test-rmse:414.13
[970]	train-rmse:94.7286	test-rmse:413.962
[980]	train-rmse:93.7414	test-rmse:413.823
[990]	train-rmse:92.6218	test-rmse:413.659
[1000]	train-rmse:91.6478	test-rmse:413.603
[1010]	train-rmse:90.5217	test-rmse:413.557
[1020]	train-rmse:89.3576	test-rmse:413.539
[1030]	train-rmse:88.539	test-rmse:413.44
[1040]	train-rmse:87.5043	test-rmse:413.367
[1050]	train-rmse:86.5545	test-rmse:413.315
[1060]	train-rmse:85.5875	test-rmse:413.322
[1070]	train-rmse:84.511	test-rmse:413.266
[1080]	train-rmse:83.6725	test-rmse:413.278
[1090]	train-rmse:82.7241	test-rmse:413.255
[1100]	train-rmse:81.8277	test-rmse:413.168
[1110]	train-rmse:80.9218	test-rmse:413.114
[1120]	train-rmse:80.068	test-rmse:413.062
[1130]	train-rmse:79.1454	test-rmse:412.943
[1140]	train-rmse:78.2343	test-rmse:412.908
[1150]	train-rmse:77.4423	test-rmse:412.883
[1160]	train-rmse:76.5839	test-rmse:412.856
[1170]	train-rmse:75.6677	test-rmse:412.824
[1180]	train-rmse:74.8021	test-rmse:412.774
[1190]	train-rmse:73.9893	test-rmse:412.751
[1200]	train-rmse:73.1994	test-rmse:412.679
[1210]	train-rmse:72.3661	test-rmse:412.58
[1220]	train-rmse:71.4705	test-rmse:412.52
[1230]	train-rmse:70.6592	test-rmse:412.479
[1240]	train-rmse:69.8624	test-rmse:412.479
[1250]	train-rmse:69.1206	test-rmse:412.423
[1260]	train-rmse:68.2792	test-rmse:412.397
[1270]	train-rmse:67.5481	test-rmse:412.414
[1280]	train-rmse:66.822	test-rmse:412.372
[1290]	train-rmse:66.1103	test-rmse:412.334
[1300]	train-rmse:65.3961	test-rmse:412.294
[1310]	train-rmse:64.7491	test-rmse:412.23
[1320]	train-rmse:64.0865	test-rmse:412.195
[1330]	train-rmse:63.452	test-rmse:412.186
[1340]	train-rmse:62.8637	test-rmse:412.152
[1350]	train-rmse:62.1453	test-rmse:412.081
[1360]	train-rmse:61.4667	test-rmse:412.092
[1370]	train-rmse:60.7208	test-rmse:412.042
[1380]	train-rmse:60.1003	test-rmse:412.003
[1390]	train-rmse:59.3859	test-rmse:411.976
[1400]	train-rmse:58.6591	test-rmse:411.918
[1410]	train-rmse:58.0907	test-rmse:411.912
[1420]	train-rmse:57.3702	test-rmse:411.87
[1430]	train-rmse:56.8044	test-rmse:411.851
[1440]	train-rmse:56.2166	test-rmse:411.816
[1450]	train-rmse:55.5042	test-rmse:411.803
[1460]	train-rmse:54.9606	test-rmse:411.782
[1470]	train-rmse:54.3784	test-rmse:411.767
[1480]	train-rmse:53.8607	test-rmse:411.755
[1490]	train-rmse:53.3006	test-rmse:411.717
[1500]	train-rmse:52.7615	test-rmse:411.723
[1510]	train-rmse:52.2512	test-rmse:411.695
[1520]	train-rmse:51.725	test-rmse:411.646
[1530]	train-rmse:51.0558	test-rmse:411.611
[1540]	train-rmse:50.5893	test-rmse:411.584
[1550]	train-rmse:50.0364	test-rmse:411.569
[1560]	train-rmse:49.4643	test-rmse:411.505
[1570]	train-rmse:48.9073	test-rmse:411.45
[1580]	train-rmse:48.2431	test-rmse:411.394
[1590]	train-rmse:47.685	test-rmse:411.41
[1600]	train-rmse:47.1061	test-rmse:411.403
[1610]	train-rmse:46.5904	test-rmse:411.418
[1620]	train-rmse:46.1401	test-rmse:411.413
[1630]	train-rmse:45.6702	test-rmse:411.386
Stopping. Best iteration:
[1582]	train-rmse:48.1432	test-rmse:411.383

[464.795746, 392.108948, 458.096497, 387.788696, 411.382812]

In [13]:
rv1_subset = rv1[1][rv1[1].interest_cat == 1]
rmse_subset = np.sqrt(sklearn.metrics.mean_squared_error(rv1_subset.predicted_price, rv1_subset.price))
mae_subset = sklearn.metrics.mean_absolute_error(rv1_subset.predicted_price, rv1_subset.price)

mae_full = sklearn.metrics.mean_absolute_error(rv1[1].predicted_price, rv1[1].price)

rmse_full = np.sqrt(sklearn.metrics.mean_squared_error(rv1[1].predicted_price, rv1[1].price))

ldiff = np.log(rv1[1].predicted_price) - np.log(rv1[1].price)

print(ldiff.mean(), rv1[1].interest_tgt.corr(ldiff), mae_subset, mae_full, rmse_full)


-0.05851528659725915 0.342664562213 249.479908003 501.178934954 1004.80374477

In [14]:
#rv1[1].columns = ['predicted_price', 'listing_id', 'interest_tgt']

df_testpreds = pd.DataFrame(rv1[2].mean(axis=0))
df_testpreds.columns = ['predicted_price']
df_testpreds['listing_id'] = cv_test[0].listing_id
df_allpreds = pd.concat([rv1[1][['predicted_price', 'listing_id']], df_testpreds])

df_allpreds.sort_values('listing_id', inplace=True)
df_allpreds.set_index('listing_id', inplace=True)

df_allpreds.to_pickle('fin-medium-price-r2.pkl')

In [15]:
df_allprices = pd.concat([train_df[['listing_id', 'price']], test_df[['listing_id', 'price']]]).copy()

df_allprices.set_index('listing_id', inplace=True)
df_allprices.sort_index(inplace=True)

df_allpreds_logdiff = df_allpreds.copy()
df_allpreds_logdiff['logdiff'] = np.log(df_allpreds_logdiff.predicted_price) - np.log(df_allprices.price)
df_allpreds_logdiff.drop('predicted_price', axis=1, inplace=True)

#df_allpreds_logdiff.to_pickle('bag-submodel-medium-logdiff-r1.pkl')

df_fold = []
for f in range(5):
    df_fold.append(pd.DataFrame(rv1[2][f]))
    df_fold[-1].columns = ['predicted_price']
    df_fold[-1]['logdiff'] = np.log(df_fold[-1].predicted_price) - np.log(cv_test[0].price)
    df_fold[-1].drop('predicted_price', axis=1, inplace=True)
    
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_allpreds_logdiff, df_fold), open('model-medium-logdiff-r2.pkl', 'wb'))

In [16]:
df_tp_cmp = df_testpreds.copy()
df_tp_cmp['price'] = cv_test[0]['price']

In [17]:
df_tp_cmp_cut = df_tp_cmp.copy()
df_tp_cmp_cut.price = df_tp_cmp_cut.price.clip(0, 13000)

In [18]:
np.sqrt(sklearn.metrics.mean_squared_error(df_tp_cmp_cut.price, df_tp_cmp_cut.predicted_price))


Out[18]:
992.68426190494222

In [ ]:
# XXX update for final package

In [19]:
df_allprices = pd.concat([train_df[['listing_id', 'price']], test_df[['listing_id', 'price']]]).copy()

df_allprices.set_index('listing_id', inplace=True)
df_allprices.sort_index(inplace=True)

In [20]:
df_allpreds_logdiff = df_allpreds.copy()
df_allpreds_logdiff['logdiff'] = np.log(df_allpreds_logdiff.predicted_price) - np.log(df_allprices.price)
df_allpreds_logdiff.drop('predicted_price', axis=1, inplace=True)
df_allpreds_logdiff.to_pickle('fin-submodel-medium-logdiff.pkl')