Using this just for RandomForestClassifier - there's also ExtraTrees and RandomForestRegressors, but they didn't help my OOB, so they're not getting run here...


In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle
import time

import sklearn.cluster

import Levenshtein

from multiprocessing import Pool

import lightgbm as lgbm

In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))

In [3]:
medium_price = pd.read_pickle('fin-medium-price.pkl')

train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)

In [ ]:


In [4]:
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)

In [5]:
# fill in the NaN's.

for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
#        nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))#, nacount_test / len(test_df))
        
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)


price_group 0.0488733992543
price_ratio 0.0488733992543
manager_shortdesc_rate 0.0688725887502
manager_building0_rate 0.0688725887502
manager_0feature_rate 0.0688725887502
manager_median_price 0.0688725887502
manager_lazy_rate 0.0688725887502

In [6]:
class MeansProcessor:
    def __init__(self, key, outkey = None, tgt = 'interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        
        self.tgt = tgt
        
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']
        
    def fit(self, df):
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()
            
        for k in df.groupby(self.key, sort=False):
            
            self.count[k[0]] = len(k[1])

            if len(k[1]) < 0:
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])
            
    def predict(self, df, nans = False):
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]
            
        df[self.outkey + '_count'] = 0
            
        for k in df.groupby(self.key, sort=False):
            if k[0] == 0:
                continue
            
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]
        
        return df
    
    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']

# i kept the same index randomization (with fixed seed) so I could validate this code against
# the original...

target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()
    
    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

#with Pool(5) as pool:
#    rv = pool.map(proc_fold, folds)

import pickle

try:
    rv = pickle.load(open('0422-model-groupfeatures_nonan.pkl', 'rb'))
except:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

        pickle.dump(rv, open('0422-model-groupfeatures_nonan.pkl', 'wb'))

# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')

group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

#cv_test = [r[2] for r in rv]
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

cv_allvalid = pd.concat([r[1] for r in rv])

train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)

In [7]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]

In [8]:
for df in [train_df] + cv_test:
    df['price_t'] = df['price_t'].clip(0, 13000)
    df['price_per_room'] = df['price_per_room'].clip(0, 13000)
    #df['density_lin005'] = df['density_lin005'].clip(-50, 50)
    df['predicted_price_ratio'] = df['predicted_price_ratio'].clip(-50, 50)

In [9]:
train_df_normalized = train_df.copy()
cvtest_normalized = [df.copy() for df in cv_test]

train_df_normalized['listing_id_norm'] = train_df_normalized['listing_id']
for df in cvtest_normalized:
    df['listing_id_norm'] = df['listing_id']

normalized_keys = []

scaler = {}
for f in train_df.keys():
    if f[0:2] == 'f_' or f[0:3] == 'fm_':
        train_df_normalized[f] = train_df_normalized[f].clip(0, 1)
        for df in cvtest_normalized:
            df[f] = df[f].clip(0, 1)
    elif 'interest' in f or f == 'listing_id' or f == 'index':
        continue
    elif f == 'created' or train_df[f].dtype == 'O':
        train_df_normalized.drop(f, axis=1, inplace=True)
        for df in cvtest_normalized:
            df.drop(f, axis=1, inplace=True)
        continue
    else:
        #print(f, train_df[f].min(), train_df[f].max(), test_df[f].min(), test_df[f].max())
        scaler[f] = sklearn.preprocessing.StandardScaler()
        train_df_normalized[f] = scaler[f].fit_transform(train_df_normalized[f].values.reshape(-1,1))[:,0]
        for df in cvtest_normalized:
            df[f] = scaler[f].transform(df[f].values.reshape(-1,1))[:,0]
        
    normalized_keys.append(f)


/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py:431: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py:431: DataConversionWarning: Data with input dtype uint8 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)

models begin here


In [10]:
fl = normalized_keys.copy() + m_build.get_features() + m_mgr.get_features() 

#for f in ['density_exp01', 'density_exp005', 'density_lin005', 'density_gaussian001', 'density_gaussian', 'density_gaussian01', 'density_gaussian02', 'density_gaussian04']:
#    fl.remove(f)
    
#fl.append('density_gaussian02')
#fl.append('density_exp01')


fl.remove('predicted_price_ratio')
fl.remove('manager_building0_rate')
fl.remove('manager_shortdesc_rate')
fl.remove('manager_0feature_rate')
#fl.append('manager_sort_count')

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import randint as sp_randint

In [12]:
# RandomForestClassifier

start = time.time()

# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    models.append(sklearn.ensemble.RandomForestClassifier(n_estimators=10, min_samples_split=8, min_samples_leaf=4, 
                                                          n_jobs=-1, class_weight=None, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
        preds = models[-1].predict_proba(cv_valid[f][fl].values)
        score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
        
        print(nest, score)

        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    #models[-1].set_params(n_estimators = best[1])
    
    #print('done training')
    
    cv_preds.append(models[-1].predict_proba(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)

tgts = ['low', 'medium', 'high']

print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end  - start)


20 0.583326595675
30 0.564583867318
40 0.55640282163
50 0.55297015512
60 0.5522085791
70 0.551036046135
80 0.548456980448
90 0.548120494467
100 0.547398887508
110 0.544103625174
120 0.543885010018
130 0.544056941458
140 0.54390063626
150 0.543760753053
160 0.543396538985
170 0.543209339341
180 0.543164062785
190 0.542978549216
200 0.542855772845
210 0.543000699285
220 0.542942881345
230 0.542866927348
240 0.542786932211
250 0.542523391965
260 0.542565326172
270 0.542638907563
280 0.54240001092
290 0.542447548681
300 0.542322008059
310 0.542182701203
320 0.542270802642
330 0.542336732247
340 0.542359290234
350 0.542300408339
360 0.54224689899
0 0.54225151966
20 0.562783574225
30 0.548940895073
40 0.54181185985
50 0.541158801244
60 0.536130767212
70 0.535276797363
80 0.534634191309
90 0.533877558573
100 0.533935561793
110 0.533680906437
120 0.533434391824
130 0.533563194397
140 0.533453484687
150 0.533840530709
160 0.533867656229
170 0.534021518884
1 0.534020801947
20 0.581065021749
30 0.567104196892
40 0.561966914781
50 0.560780961077
60 0.559883134769
70 0.559798202296
80 0.559824806949
90 0.556573024527
100 0.556363028813
110 0.555279292596
120 0.555371860471
130 0.554880342154
140 0.554479990089
150 0.554159784638
160 0.553850745848
170 0.553841967563
180 0.553713775747
190 0.553578098739
200 0.55374881907
210 0.553489736269
220 0.55326695587
230 0.552938222335
240 0.553020986018
250 0.553026068486
260 0.552959724985
270 0.553111571617
280 0.553161002829
2 0.553161002829
20 0.578229223158
30 0.555197172787
40 0.544756684931
50 0.542575196754
60 0.537883203802
70 0.537039498183
80 0.537024055324
90 0.53650875323
100 0.53613324296
110 0.535639719436
120 0.535070599762
130 0.535057540713
140 0.53486864125
150 0.534871004532
160 0.53469699692
170 0.534779934021
180 0.534756825351
190 0.53488130408
200 0.534973703974
210 0.534663816394
220 0.53461253022
230 0.534636041631
240 0.534600009185
250 0.534468238843
260 0.534389680192
270 0.53435420039
280 0.534360791867
290 0.534124690113
300 0.533993792513
310 0.53392397759
320 0.533825480635
330 0.533920006369
340 0.533957016486
350 0.533963494517
360 0.534028083067
370 0.534104233505
3 0.534104465691
20 0.561290957294
30 0.552537609844
40 0.550273801026
50 0.546055169757
60 0.545341194622
70 0.544762377792
80 0.545337594656
90 0.54530081966
100 0.54499010043
110 0.544512647803
120 0.544640565763
130 0.544987138375
140 0.544987325486
150 0.544647371472
160 0.544413032983
170 0.544429358433
180 0.544313543677
190 0.544202514081
200 0.544090518446
210 0.544068915668
220 0.543878450701
230 0.543670319656
240 0.543678987666
250 0.543644955547
260 0.543527044093
270 0.543448000535
280 0.543397094819
290 0.543317170962
300 0.543130129912
310 0.543179524876
320 0.543183938801
330 0.543302750131
340 0.543164052025
350 0.543040585011
360 0.542905006679
370 0.543029878136
380 0.543009262076
390 0.542911897401
400 0.542816423949
410 0.542783278916
420 0.542763489198
430 0.542770826272
440 0.542859408146
450 0.542828698009
460 0.542808545945
470 0.542842700015
4 0.542845939345
combined:  0.541276650507
106.8114025592804

In [15]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict_proba(cvtest_normalized[i][fl]))
    
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)

In [16]:
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('model-output-rf.pkl', 'wb'))

In [17]:
# RandomForestRegressor

start = time.time()

# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    models.append(sklearn.ensemble.RandomForestRegressor(n_estimators=10, min_samples_split=8, min_samples_leaf=4, 
                                                          n_jobs=-1, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
    
        preds = models[-1].predict(cv_valid[f][fl].values)
        #score = sklearn.metrics.log_loss(cv_valid[f].interest, preds)
        score = np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds))
        
        print(nest, score)

        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    #models[-1].set_params(n_estimators = best[1])
    
    #print('done training')
    
    cv_preds.append(models[-1].predict(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f) #, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds)))

df_cvpreds = pd.concat(df_cvpreds)

tgts = ['low', 'medium', 'high']

print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction))) #sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end  - start)


20 0.292096527436
30 0.27158341902
40 0.264031400668
50 0.260278668073
60 0.258340367339
70 0.257256241264
80 0.256278995325
90 0.255620284908
100 0.255145913323
110 0.254906806619
120 0.254613388379
130 0.254556479169
140 0.254476706799
150 0.254357550832
160 0.254319593032
170 0.254194507073
180 0.254108090616
190 0.253921341687
200 0.253864447126
210 0.253780455416
220 0.253819769413
230 0.253767379979
240 0.253785431681
250 0.253761105406
260 0.253740080722
270 0.253761059263
280 0.25375415227
290 0.253710537549
300 0.253675982788
310 0.253636129461
320 0.253587363658
330 0.253578664685
340 0.253577816128
350 0.253545655204
360 0.253587681629
370 0.253598081195
380 0.253641764732
390 0.253668734815
400 0.253672682618
0
20 0.288663671028
30 0.269145129711
40 0.261705426042
50 0.257644651819
60 0.255716143771
70 0.254316551319
80 0.253379513614
90 0.252792486215
100 0.252399287617
110 0.252110819435
120 0.25183553323
130 0.251595262932
140 0.2514430371
150 0.251363101497
160 0.251331927216
170 0.251231358171
180 0.251229292031
190 0.251189188387
200 0.251157898452
210 0.251102108538
220 0.251070116779
230 0.251012631391
240 0.250991117624
250 0.250915933825
260 0.250904907014
270 0.250883708858
280 0.250830066884
290 0.250738903935
300 0.25075216601
310 0.250735592478
320 0.250724483593
330 0.250712536064
340 0.250716903534
350 0.250737307891
360 0.250769685447
370 0.250739842698
380 0.250699324154
390 0.250678117982
400 0.250654393346
410 0.250638374331
420 0.250602163605
430 0.250606822425
440 0.250613238692
450 0.250632343474
460 0.250633719966
470 0.250651342121
1
20 0.290728218853
30 0.270090446308
40 0.26200313198
50 0.258193734273
60 0.255989014562
70 0.254811633243
80 0.253681111082
90 0.253203097308
100 0.252681203667
110 0.25253607403
120 0.25221657199
130 0.25220297189
140 0.251956246097
150 0.251761179596
160 0.251665499285
170 0.251544226367
180 0.251417933513
190 0.251379921165
200 0.251300335463
210 0.251230261546
220 0.251092773244
230 0.25103111302
240 0.251027699221
250 0.250911459031
260 0.250812126883
270 0.250813353346
280 0.250776996921
290 0.250767935637
300 0.250799775671
310 0.250786670838
320 0.250745889336
330 0.250706696559
340 0.250660843042
350 0.250619661697
360 0.25058727034
370 0.250588085369
380 0.250561649487
390 0.250566989561
400 0.250555367458
410 0.250546774583
420 0.250537658781
430 0.250506466141
440 0.250523005416
450 0.250552987965
460 0.250554599387
470 0.250495331231
480 0.250505600383
490 0.250516069046
500 0.250506750367
510 0.250515921321
520 0.25050412628
2
20 0.292519255183
30 0.272364145534
40 0.265123020429
50 0.261619978587
60 0.2594816275
70 0.258091070286
80 0.257361566746
90 0.256906541566
100 0.256119764913
110 0.255910837938
120 0.255515574327
130 0.255303915242
140 0.255277545892
150 0.255069583023
160 0.254869947336
170 0.254802933986
180 0.254608872088
190 0.254545208054
200 0.254507767058
210 0.254505530924
220 0.254519274059
230 0.254442178245
240 0.25440531127
250 0.25435110239
260 0.254318631332
270 0.254302263911
280 0.254268191967
290 0.254222208893
300 0.254222212093
310 0.25418427338
320 0.254143289261
330 0.254106367775
340 0.254040016274
350 0.253976828892
360 0.253918581181
370 0.253912962479
380 0.253884793225
390 0.25388197996
400 0.253853957961
410 0.253860413858
420 0.253839425352
430 0.253843977321
440 0.253801808635
450 0.253840304854
460 0.253834744361
470 0.253796474512
480 0.253781248444
490 0.253802767695
500 0.253786743513
510 0.253781921602
520 0.253750070407
530 0.253744075553
540 0.253722467468
550 0.253715523742
560 0.253696303681
570 0.25367324204
580 0.253682224606
590 0.253726785829
600 0.253705875935
610 0.253710557395
620 0.253715211205
3
20 0.285993527193
30 0.266573793439
40 0.25879002171
50 0.255533693649
60 0.253832140916
70 0.252539048909
80 0.251929711247
90 0.251491405077
100 0.251021055009
110 0.250746407271
120 0.250581776695
130 0.25041612223
140 0.250263525502
150 0.250110520749
160 0.250152574155
170 0.24998957347
180 0.249872093185
190 0.249812304586
200 0.249795661493
210 0.249745699932
220 0.249789447723
230 0.249795194914
240 0.249724962986
250 0.249694975462
260 0.249701397706
270 0.249702820665
280 0.249690487538
290 0.249698008351
300 0.249674060573
310 0.249679327351
320 0.249693493052
330 0.2496972194
340 0.249652750728
350 0.249668294997
360 0.249675548951
370 0.249684540692
380 0.249620026422
390 0.249601759359
400 0.249569850334
410 0.24952814133
420 0.249520565015
430 0.249538822391
440 0.249559005194
450 0.249579253835
460 0.249531499628
470 0.249539053219
4
combined:  0.251622570092
733.7799310684204

In [18]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cvtest_normalized[i][fl]))

In [19]:
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)

df_output.to_pickle('bag-model-rfr-v1.pkl')

df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('bagta-model-rfr-v1.pkl', 'wb'))

In [23]:
# ExtraTreesRegressor 

start = time.time()

# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    models.append(sklearn.ensemble.ExtraTreesRegressor(n_estimators=10, min_samples_split=8, min_samples_leaf=4, 
                                                          n_jobs=-1, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
    
        preds = models[-1].predict(cv_valid[f][fl].values)
        #score = sklearn.metrics.log_loss(cv_valid[f].interest, preds)
        score = np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds))
        
        print(nest, score)

        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    cv_preds.append(models[-1].predict(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f) #, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds)))

df_cvpreds = pd.concat(df_cvpreds)

tgts = ['low', 'medium', 'high']

print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction))) #sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end  - start)


20 0.284186195488
30 0.265958821954
40 0.259247013231
50 0.255700746787
60 0.254181923445
70 0.253214218752
80 0.252695012866
90 0.252027572135
100 0.251631224067
110 0.251455173568
120 0.251306111599
130 0.251134836482
140 0.251014448712
150 0.250942882723
160 0.250903977167
170 0.250743492843
180 0.250684503264
190 0.250643587619
200 0.250566978491
210 0.25058266343
220 0.25055057091
230 0.250530526918
240 0.250527243495
250 0.250551237175
260 0.250560912678
270 0.250485336496
280 0.250482299271
290 0.250537845928
300 0.250539261569
310 0.250481284099
320 0.250469261469
330 0.250471813276
340 0.250442490443
350 0.250460725332
360 0.250453082106
370 0.250426486544
380 0.250404520127
390 0.250404315517
400 0.250401116492
410 0.250393721715
420 0.250360003323
430 0.250321644975
440 0.250371296554
450 0.250378230768
460 0.250378768944
470 0.250390197725
480 0.250379463062
0
20 0.277909206138
30 0.260961212184
40 0.254957303532
50 0.252181429124
60 0.25054498715
70 0.249493731192
80 0.248997085677
90 0.248600347234
100 0.248320109586
110 0.248004225282
120 0.247835044231
130 0.24782488548
140 0.247692259476
150 0.247625051149
160 0.247644663031
170 0.247641842889
180 0.247599835406
190 0.24755873364
200 0.247516698521
210 0.247611074128
220 0.247589095626
230 0.247585329012
240 0.247555983698
250 0.247578555147
1
20 0.279437848693
30 0.262597021505
40 0.256085189488
50 0.252885016383
60 0.251324123849
70 0.250177741214
80 0.249569798608
90 0.24907969855
100 0.248879226936
110 0.248659361916
120 0.248385725573
130 0.248327560327
140 0.248172488983
150 0.248142117227
160 0.248005135104
170 0.247932404141
180 0.247799044292
190 0.247677460945
200 0.247713640133
210 0.247727829863
220 0.247767096162
230 0.247744626499
240 0.247760678558
2
20 0.286311060095
30 0.267857892859
40 0.261177427306
50 0.258316203734
60 0.256563555497
70 0.255549429323
80 0.254903169957
90 0.254342115395
100 0.254152715173
110 0.25382730177
120 0.253439904966
130 0.253307999208
140 0.253122704541
150 0.25302293974
160 0.252954508119
170 0.252931143902
180 0.25297215171
190 0.252880828962
200 0.252814203959
210 0.252785305549
220 0.252713438838
230 0.252728024135
240 0.252654002036
250 0.252671182776
260 0.252676015832
270 0.252710006793
280 0.252658088195
290 0.252630598003
300 0.252597595952
310 0.252607986677
320 0.252603701801
330 0.252584892507
340 0.25257929267
350 0.25255261029
360 0.2525640472
370 0.252548823099
380 0.252545793087
390 0.252516762755
400 0.252511256166
410 0.25250520446
420 0.252489598999
430 0.252481725793
440 0.252477014857
450 0.25249748863
460 0.252462062143
470 0.252454949075
480 0.252453407567
490 0.252436185586
500 0.252438507632
510 0.252436307846
520 0.252460117115
530 0.25250837519
540 0.252502643942
3
20 0.279757520208
30 0.262314917827
40 0.255272604591
50 0.252270190604
60 0.250580830838
70 0.249558192435
80 0.24878338172
90 0.248221700707
100 0.247886811001
110 0.247661978421
120 0.247519596268
130 0.247443389607
140 0.247308986651
150 0.247295058239
160 0.247277008966
170 0.247183701174
180 0.247058857609
190 0.247099345959
200 0.247019333104
210 0.24694931582
220 0.246895420869
230 0.246870235678
240 0.246845654676
250 0.246870588151
260 0.246904954677
270 0.246960761029
280 0.246956366469
290 0.246925393441
4
combined:  0.249038188226
335.64626002311707

In [24]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cvtest_normalized[i][fl]))
    
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)

#df_output.to_pickle('0423-model-etr-v1.pkl')

df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('bagta-model-etr-v1.pkl', 'wb'))

In [103]:
df_output


Out[103]:
prediction
listing_id
6811957 0.410999
6811958 0.609036
6811960 0.409615
6811964 0.538739
6811965 0.683138
6811966 0.615311
6811971 0.345358
6811973 0.376863
6811974 0.515847
6811975 0.392642
6811983 0.234846
6811984 0.271377
6811985 0.243789
6811988 0.571760
6811990 0.002913
6811992 0.048332
6811995 0.001468
6811997 0.000419
6812000 0.024324
6812002 0.010578
6812004 0.000000
6812005 0.367943
6812009 0.002533
6812012 0.000309
6812016 0.000634
6812025 0.187664
6812032 0.177738
6812033 0.000000
6812035 0.040983
6812041 0.002123
... ...
7714406 0.040422
7714408 0.035560
7714547 0.323051
7724353 0.498246
7724437 0.141485
7724549 0.287647
7724798 0.633203
7724814 0.149701
7724905 0.685805
7731142 0.296909
7731327 0.073033
7731330 0.063305
7742631 0.021842
7742634 0.015231
7742636 0.019382
7742642 0.033682
7742644 0.009324
7742670 0.448826
7742794 0.015639
7742803 0.007231
7742859 0.423839
7742861 0.014970
7748247 0.120063
7748250 0.117711
7748251 0.113714
7748271 0.266247
7748273 0.610056
7753784 0.217309
7754429 0.247798
7761779 0.267889

124011 rows × 1 columns


In [79]:
fl = ['price', 'manager_sort_level', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'density_exp005', 'predicted_price_diff', 'created_hour']


# ET

# RandomForestClassifier

start = time.time()

# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    models.append(sklearn.ensemble.ExtraTreesClassifier(n_estimators=10, max_features=len(fl),
                                                          n_jobs=-1, class_weight=None, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
    best = None
    
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start = True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
    
        preds = models[-1].predict_proba(cv_valid[f][fl].values)
        score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
        
        print(nest, score)

        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
            
    #models[-1].set_params(n_estimators = best[1])
    
    #print('done training')
    
    cv_preds.append(models[-1].predict_proba(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)

tgts = ['low', 'medium', 'high']

print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end  - start)


20 1.12296132847
30 0.942927949061
40 0.844550481435
50 0.797762267704
60 0.781118544176
70 0.74291014053
80 0.735813953207
90 0.725778708524
100 0.70352403112
110 0.691069629282
120 0.678172029392
130 0.67497619837
140 0.674740438006
150 0.671048130923
160 0.6705096715
170 0.670215898684
180 0.669729628895
190 0.666112938059
200 0.663259300476
210 0.659661933232
220 0.650185280136
230 0.649839284451
240 0.643923314802
250 0.641127095932
260 0.641433678261
270 0.641525789111
280 0.641486492967
290 0.638562517295
300 0.638519512836
310 0.635716730599
320 0.632999902856
330 0.629937480721
340 0.626842628871
350 0.623850249736
360 0.62094618567
370 0.620845163869
380 0.621167801268
390 0.621128292064
400 0.621091840209
410 0.620958795269
420 0.620816925182
430 0.6179475613
440 0.617827747385
450 0.617820263081
460 0.617769334832
470 0.6176277653
480 0.617659207137
490 0.617575733492
500 0.617419231484
510 0.617287101022
520 0.617173388505
530 0.617194686212
540 0.61432017067
550 0.611588672624
560 0.611509047252
570 0.611427517184
580 0.611334120612
590 0.611373150902
600 0.611306176219
610 0.611092536397
620 0.611180251979
630 0.61106415847
640 0.610959944597
650 0.610938318333
660 0.610844003849
670 0.610756027815
680 0.610747982488
690 0.610665601763
700 0.607833521745
710 0.607776406854
720 0.607757565377
730 0.607818510202
740 0.607888076719
750 0.607843050913
760 0.607741772671
770 0.607750527192
780 0.607788381863
790 0.607772626284
800 0.60770873325
810 0.607763485246
820 0.607754458216
830 0.607787353974
840 0.607797565373
850 0.607825986548
0 0.607828544787
20 1.10185707068
30 0.940357615011
40 0.853368044373
50 0.777234776211
60 0.747959302898
70 0.720702818221
80 0.703610418258
90 0.690133832248
100 0.678175103677
110 0.669014515232
120 0.656960532383
130 0.64487290796
140 0.641455351607
150 0.638787917673
160 0.638440354228
170 0.634663974477
180 0.631481926523
190 0.631158908466
200 0.631167727761
210 0.628357396904
220 0.628679823583
230 0.628503569146
240 0.628559145908
250 0.628334177603
260 0.628607419494
270 0.622853172476
280 0.622757646894
290 0.622827772797
300 0.622503811004
310 0.622510135138
320 0.622457266134
330 0.619686554758
340 0.619839250155
350 0.619980193387
360 0.619694766805
370 0.616822527337
380 0.616659973874
390 0.616512255606
400 0.6165203991
410 0.616422420893
420 0.616267921785
430 0.616100675155
440 0.616145195551
450 0.616161321951
460 0.615974765542
470 0.615899567967
480 0.616023513858
490 0.615948775805
500 0.61598523593
510 0.613021142595
520 0.613113456034
530 0.613246089813
540 0.613058815803
550 0.610188614129
560 0.610232955589
570 0.610096941896
580 0.610074178384
590 0.610003131693
600 0.609978260475
610 0.607291324069
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-79-1e556fc0ec01> in <module>()
     36         models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
     37 
---> 38         preds = models[-1].predict_proba(cv_valid[f][fl].values)
     39         score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
     40 

/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict_proba(self, X)
    584         Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")(
    585             delayed(accumulate_prediction)(e.predict_proba, X, all_proba)
--> 586             for e in self.estimators_)
    587 
    588         for proba in all_proba:

/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time

/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in retrieve(self)
    697             try:
    698                 if getattr(self._backend, 'supports_timeout', False):
--> 699                     self._output.extend(job.get(timeout=self.timeout))
    700                 else:
    701                     self._output.extend(job.get())

/opt/conda/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    600 
    601     def get(self, timeout=None):
--> 602         self.wait(timeout)
    603         if not self.ready():
    604             raise TimeoutError

/opt/conda/lib/python3.6/multiprocessing/pool.py in wait(self, timeout)
    597 
    598     def wait(self, timeout=None):
--> 599         self._event.wait(timeout)
    600 
    601     def get(self, timeout=None):

/opt/conda/lib/python3.6/threading.py in wait(self, timeout)
    549             signaled = self._flag
    550             if not signaled:
--> 551                 signaled = self._cond.wait(timeout)
    552             return signaled
    553 

/opt/conda/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:

KeyboardInterrupt: 

In [ ]:


In [34]:
train_df['desc_xp_count'] = train_df.description.apply(lambda x: x.count('!'))
train_df['desc_xp_ratio'] = train_df.description.apply(lambda x: (x.count('!') / len(x)) if len(x) else 0)
train_df['desc_xp_first'] = train_df.description.apply(lambda x: x.find('!') if len(x) else 0)
train_df['desc_xp_inv_first'] = train_df.description.apply(lambda x: (len(x) - x.find('!')) if len(x) else 0)
train_df['desc_xp_inv_mult'] = train_df.desc_xp_count * train_df.desc_xp_inv_first

In [30]:
# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df.loc[tr_index])
        cv_valid.append(train_df.loc[val_index])

In [74]:
t4_params = {
    'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
    'num_leaves': 2**5, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['multi_logloss'],
    'max_bin': 255, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
    'min_split_gain': 0.25, 'min_child_weight': .5, 'min_child_samples': 20, 'scale_pos_weight': 1}

lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 3

In [78]:
fl = ['price', 'manager_sort_level', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'density_exp005', 'predicted_price_diff', 'created_hour']

start = time.time()

# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []

for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))

    #print('done training')
    
    cv_preds.append(models[-1].predict(cv_valid[f][fl]))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id

    df_cvpreds[f].set_index('listing_id', inplace=True)
    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)

tgts = ['low', 'medium', 'high']

print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end  - start)


0 0.552516080553
1 0.557726118266
2 0.561741087415
3 0.550305671681
4 0.556885845
combined:  0.555834896702
30.58432674407959

In [77]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))
    


f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)

df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)

df_output.to_pickle('0423-minimod-lgbm-lf-v1.pkl')

In [23]:
corrs = []
for k in fl:
    corrs.append((k, train_df[k].corr(train_df.interest)))

In [24]:
sorted(corrs, key=operator.itemgetter(1))


Out[24]:
[('time_stamp', -0.18019039565076073),
 ('display_address', -0.10638217137972825),
 ('address_ratio', -0.08435685614698217),
 ('half_bathroom', -0.082952509296588234),
 ('bathrooms', -0.08290129995713677),
 ('f_doorman', -0.080408870770706103),
 ('f_dogs_allowed', -0.069394270715112102),
 ('f_cats_allowed', -0.063240578842069212),
 ('f_garage', -0.061826833663457474),
 ('f_fitness_center', -0.041625294080062145),
 ('created_day', -0.035318942170163382),
 ('f_hardwood', -0.034737405598447645),
 ('f_fireplace', -0.028046029385565645),
 ('listing_id', -0.027602467182586288),
 ('density', -0.025715142282351982),
 ('price', -0.024422071937294336),
 ('f_gym_fitness', -0.022982979101316876),
 ('f_childrens_playroom', -0.022148888512394029),
 ('created_dayofyear', -0.01735538096319977),
 ('f_garden', -0.015741001576469776),
 ('f_bike_room', -0.01528765527419634),
 ('f_indoor_pool', -0.015284345235786326),
 ('f_brownstone', -0.014610002072087057),
 ('f_in_unit_washer_dryer', -0.01431467498798949),
 ('f_basement_storage', -0.012890619050425546),
 ('f_full_service_garage', -0.012212815755274534),
 ('f_attended_lobby', -0.011686935531385678),
 ('f_elevator', -0.011476862630146167),
 ('f_flex_2', -0.010534141055214741),
 ('f_deck', -0.0092224583955734189),
 ('f_children', -0.0086410666422274945),
 ('f_gym', -0.0082083729374437427),
 ('f_community_recreation_facilities', -0.0081500757799385544),
 ('f_green_building', -0.0071012278061486928),
 ('f_gym_in_building', -0.0065206020145217045),
 ('created_month', -0.0061852653157780805),
 ('f_common_parking_garage', -0.005147245710204181),
 ('longitude', -0.0050569641324356438),
 ('f_common_roof_deck', -0.0048727963704918538),
 ('f_courtyard', -0.0045883216661881969),
 ('f_billiards_room', -0.0038024320121321656),
 ('f_assigned_parking_space', -0.0033295860094368574),
 ('f__elev_lndry_bldg_', -0.0033295860094368461),
 ('f_building_common_outdoor_space', -0.0031056379002125881),
 ('f_business_center', -0.0029117495448706594),
 ('f_health_club', -0.0024321602612660054),
 ('f__exposed_brick_', -0.002322360662596785),
 ('f__housekeeping', -0.0021853517141903547),
 ('f_eat_in_kitchen', -0.0020386978680090261),
 ('f_common_terrace', -0.0018935202527496405),
 ('f_guarantors_accepted', -0.00094177186030862294),
 ('f_dry_cleaning_service', -0.00083470955370943886),
 ('f_common_garden', 1.3353799342668742e-05),
 ('f_backyard', 1.3353799342670887e-05),
 ('manager_id', 0.00056176989826598595),
 ('f__massive_1br_home_', 0.00067699465665070028),
 ('f_exposed_brick', 0.00074941493255863178),
 ('f__steps_to_the_park_', 0.00085359314689171237),
 ('f__cook', 0.00085359314689171357),
 ('f__2_blks_to_bedford_l_stop_', 0.00085359314689171465),
 ('f__elev_bldg_', 0.0011043518255861612),
 ('f_air_conditioning', 0.0016366854073997566),
 ('f__new_', 0.001828861177199033),
 ('location_cluster', 0.0018840451304264888),
 ('f__all_modern_', 0.0039695255502763191),
 ('f__gut_renovated_', 0.0039695255502763399),
 ('latitude', 0.0049073440510841951),
 ('f_cable_satellite_tv', 0.0080760774246337371),
 ('f_bike_storage', 0.0093330695720866969),
 ('f_common_backyard', 0.0096461282863857922),
 ('f__eat_in_kitchen_', 0.0096666060280858352),
 ('f__roomy_closets_', 0.010510177682981531),
 ('f_duplex', 0.011471394007603922),
 ('f_all_utilities_included', 0.013041615555849661),
 ('f_exclusive', 0.013136518550611027),
 ('f_gut_renovated', 0.01613081467205774),
 ('street_address', 0.019052348109443833),
 ('f_high_ceilings', 0.027371224128742402),
 ('f_common_outdoor_space', 0.028384918338367097),
 ('f_garden_patio', 0.028765444492280207),
 ('bedrooms', 0.03226416360187126),
 ('price_t', nan),
 ('price_per_room', nan),
 ('f__lndry_bldg_', 0.0018288611771990336),
 ('f__pets_ok_', 0.0036234866535139426),
 ('num_photos', 0.032443526361860056),
 ('num_features', 0.036203321687355482),
 ('f_dining_room', 0.042353666277858075),
 ('f_high_speed_internet', 0.043987016691473119),
 ('f_granite_kitchen', 0.045707149180638937),
 ('f_furnished', 0.05899236336182722),
 ('num_description_words', 0.059600303599038874),
 ('created_year', nan),
 ('density_exp01', -0.12664096547567719),
 ('density_gaussian02', -0.12655465248169487),
 ('f_laundry_room', -0.064347961338260956),
 ('f_pre_war', -0.061531609886662321),
 ('f_simplex', -0.056956086267372315),
 ('f_lowrise', -0.051771134728901054),
 ('f_publicoutdoor', -0.047563284386792269),
 ('f_pool', -0.041122569264336349),
 ('f_laundry', -0.025308403625298671),
 ('f_on_site_garage', -0.023947414414147546),
 ('fm_highrise', -0.023744291510852444),
 ('f_wifi_access', -0.023445677854650314),
 ('fm_laundry_in_unit', -0.023208695244561435),
 ('f_residents_lounge', -0.02296087881928718),
 ('f_patio', -0.021322315830169768),
 ('f_residents_garden', -0.019574778601242959),
 ('f_storage', -0.019218575986104974),
 ('f_valet', -0.01902145478783817),
 ('f_view', -0.01659705254991202),
 ('f_valet_parking', -0.01522521911415551),
 ('fm_concierge', -0.013053920330976699),
 ('f_live_in_superintendent', -0.012833359087058079),
 ('manager_sort_count', -0.012737214053333432),
 ('f_washer_dryer', -0.012383657755988382),
 ('f_outdoor_entertainment_space', -0.012212815755274534),
 ('f_washer_dryer_in_building', -0.012212815755274534),
 ('f_post_war', -0.011651821900543313),
 ('f_luxury_building', -0.010758356836323512),
 ('f_storage_room', -0.010369949748277719),
 ('f_mail_room', -0.0096898040282843729),
 ('f_outdoor_areas', -0.0095976459986946618),
 ('f_s_playroom', -0.0094450920737276112),
 ('f_virtual_doorman', -0.0088199779675000346),
 ('f_playroom_nursery', -0.00863453952189671),
 ('f_lounge_room', -0.0084396988819418275),
 ('f_lounge', -0.0079623892141454881),
 ('f_outdoor_pool', -0.0076393841256326882),
 ('f_rooftop_terrace', -0.006433744403303604),
 ('f_pets_allowed', -0.005890294507310768),
 ('f_live_work', -0.0056822860208247975),
 ('f_on_site_parking_lot', -0.0054402381060455049),
 ('f_media_room', -0.0052242965752724172),
 ('f_midrise', -0.0050523024457746479),
 ('f_parking', -0.0040952497852532813),
 ('f_private_parking', -0.0033295860094368574),
 ('f_on_site_parking_available', -0.0021853517141903547),
 ('f_private_laundry_room_on_every_floor', -0.0021853517141903547),
 ('f_video_intercom', -0.001742441328140536),
 ('f_on_site_parking', -0.0010789368190528032),
 ('f_playroom', -0.00048741135479725227),
 ('f_swimming_pool', -1.5576570199936427e-06),
 ('f_pets', 5.7442859210958739e-05),
 ('f_skylight', 6.1704612679377492e-05),
 ('f_private_backyard', 0.0010294415377170136),
 ('f_pet_friendly', 0.0012430227863272339),
 ('f_on_site_super', 0.0014427686355460187),
 ('f_private_terrace', 0.0017721051988860825),
 ('f_multi_level', 0.0020037309586999241),
 ('f_laundry_on_floor', 0.0020104606859344437),
 ('f_sundeck', 0.0023777623698929113),
 ('fm_central_air', 0.0026890996432358152),
 ('f_private_deck', 0.0028419835492644765),
 ('fm_roofdeck', 0.0041152504973862043),
 ('f_private_balcony', 0.0042523875576649106),
 ('f_shared_backyard', 0.0044205608484328227),
 ('f__gourmet_kitchen_', 0.0047942713626554275),
 ('f_package_room', 0.0048943939695505825),
 ('f_tenant_lounge', 0.0052429360945830516),
 ('f_marble_bathroom', 0.0055612887306597499),
 ('f_screening_room', 0.0055946887588022298),
 ('f__mr_clean_approved_', 0.0058894197300211433),
 ('f_s_kitchen_', 0.0067399181056956442),
 ('f_sublet', 0.0076622977446006981),
 ('f_washer_', 0.0077820041915908321),
 ('f__dryer', 0.0088266594544249288),
 ('f__walls_of_windows_', 0.0097443889947863935),
 ('f_party_room', 0.01098340935049859),
 ('f_private_roof_deck', 0.011321069890618856),
 ('f_wheelchair_access', 0.01166586068667011),
 ('f_microwave', 0.013644952320824692),
 ('f_pets_on_approval', 0.01457350539438552),
 ('manager_lazy_rate', 0.017201954295038208),
 ('f_on_site_laundry', 0.017211331947220233),
 ('f_new_construction', 0.017295140971784848),
 ('f_loft', 0.019483178894036963),
 ('f_no_pets', 0.023172154895398464),
 ('f_terrace', 0.024553590925121126),
 ('f_large_living_room', 0.024578224958028968),
 ('f_stainless_steel_appliances', 0.024920531211593405),
 ('f_balcony', 0.025347065565347576),
 ('f_laundry_in_unit', 0.026173174323366123),
 ('f_walk_in_closet', 0.026246890597488821),
 ('f_newly_renovated', 0.027189781168521433),
 ('f_shares_ok', 0.027281116820508038),
 ('f_parking_space', 0.029896300824096392),
 ('f_sauna', 0.032461086485931843),
 ('f__photos', 0.033161748690333184),
 ('f_actual_apt', 0.033161748690333184),
 ('f_short_term_allowed', 0.0419340840154293),
 ('f_private_outdoor_space', 0.044363533155687439),
 ('f_marble_bath', 0.044398264957522415),
 ('f_light', 0.045461906059264832),
 ('f_renovated', 0.052988112481286084),
 ('f_subway', 0.056982902177297019),
 ('f_outdoor_space', 0.058042658065696244),
 ('f_dishwasher', 0.060757389192004081),
 ('f_laundry_in_building', 0.090721668431859762),
 ('f_reduced_fee', 0.1009788272834992),
 ('f_hardwood_floors', 0.12875781444326573),
 ('building_id', 0.12909021814854379),
 ('f_no_fee', 0.14051729344363473),
 ('created_hour', 0.1637370746023887),
 ('predicted_price_diff', 0.34323636451459627)]

In [25]:
pd.Series.corr?

In [38]:
train_df[~(train_df.f_doorman == 1)].price.mean()


Out[38]:
3184.9364928241475