NN model


In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle
import time

import sklearn.cluster

#import Levenshtein

from multiprocessing import Pool

#import lightgbm as lgbm

In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))

In [3]:
medium_price = pd.read_pickle('fin-medium-price.pkl')

train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)

In [4]:
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
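
For intuition (toy numbers, not taken from the data): with predicted_price = 3000 and price = 2400, predicted_price_diff = log(3000) - log(2400) ≈ 0.223 and predicted_price_ratio = log(3000) / log(2400) ≈ 8.006 / 7.783 ≈ 1.029. Both features sit above their neutral values (0 and 1, respectively) when a listing is priced below what the price model predicts for it.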

In [5]:
# Fill in the NaNs; first report the per-column NaN rate on train.

for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
#        nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))#, nacount_test / len(test_df))
        
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)


price_group 0.0488733992543
price_ratio 0.0488733992543
manager_shortdesc_rate 0.0688725887502
manager_building0_rate 0.0688725887502
manager_0feature_rate 0.0688725887502
manager_median_price 0.0688725887502
manager_lazy_rate 0.0688725887502

In [6]:
# Group-level target ("mean") encoder: for each level of `key`, record the count,
# mean and std of the numeric target column, then write those statistics back out.
class MeansProcessor:
    def __init__(self, key, outkey = None, tgt = 'interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        
        self.tgt = tgt
        
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']
        
    def fit(self, df):
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()
            
        for k in df.groupby(self.key, sort=False):
            
            self.count[k[0]] = len(k[1])

            # note: this guard never triggers (a group length can't be < 0),
            # so every group gets a mean/std regardless of its size
            if len(k[1]) < 0:
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])
            
    def predict(self, df, nans = False):
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]
            
        df[self.outkey + '_count'] = 0
            
        for k in df.groupby(self.key, sort=False):
            # group 0 is a placeholder/fill value; leave those rows at the global means
            if k[0] == 0:
                continue
            
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]
        
        return df
    
    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']

# I kept the same index randomization (with a fixed seed) so I could validate this code
# against the original.

target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()
    
    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

#with Pool(5) as pool:
#    rv = pool.map(proc_fold, folds)

# Cache the per-fold group features; recompute them (in parallel) only if the cache is missing.
try:
    rv = pickle.load(open('bag-model-groupfeatures_nonan.pkl', 'rb'))
except FileNotFoundError:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

        pickle.dump(rv, open('bag-model-groupfeatures_nonan.pkl', 'wb'))

# dummy instances, used only to get the generated feature names
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')

group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

#cv_test = [r[2] for r in rv]
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

cv_allvalid = pd.concat([r[1] for r in rv])

train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
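
To make the group-encoding step concrete, here is a toy sketch of MeansProcessor (illustrative data only; column names follow the class defaults): for each level of the key it records the count, mean, and std of the target and writes them back as outkey_level, outkey_level_std, and outkey_count.

toy = pd.DataFrame({'manager_id': ['a', 'a', 'b', 'b', 'b'],
                    'interest':   [0,   1,   2,   2,   1]})

enc = MeansProcessor('manager_id', 'manager_sort')
enc.fit(toy)                                   # per-manager mean/std of 'interest'
out = enc.predict(toy[['manager_id']].copy())  # adds manager_sort_level / _level_std / _count
print(out)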

In [7]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]

In [8]:
for df in [train_df] + cv_test:
    df['price_t'] = df['price_t'].clip(0, 13000)
    df['price_per_room'] = df['price_per_room'].clip(0, 13000)
#    df['density_lin005'] = df['density_lin005'].clip(-50, 50)
    df['predicted_price_ratio'] = df['predicted_price_ratio'].clip(-50, 50)

In [9]:
train_df.pos.dtype == 'O'


Out[9]:
True

In [10]:
train_df_normalized = train_df.copy()
cvtest_normalized = [df.copy() for df in cv_test]

train_df_normalized['listing_id_norm'] = train_df_normalized['listing_id']
for df in cvtest_normalized:
    df['listing_id_norm'] = df['listing_id']

normalized_keys = []

scaler = {}
for f in train_df.keys():
    if f[0:2] == 'f_' or f[0:3] == 'fm_':
        # indicator-style feature columns: just clip into [0, 1]
        train_df_normalized[f] = train_df_normalized[f].clip(0, 1)
        for df in cvtest_normalized:
            df[f] = df[f].clip(0, 1)
    elif 'interest' in f or f == 'listing_id' or f == 'index':
        # targets and identifiers: leave untouched and keep out of the input feature list
        continue
    elif f == 'created' or train_df[f].dtype == 'O':
        # dates and object-typed columns can't be fed to the net; drop them
        train_df_normalized.drop(f, axis=1, inplace=True)
        for df in cvtest_normalized:
            df.drop(f, axis=1, inplace=True)
        continue
    else:
        # everything else: standardize with statistics fit on the training frame only
        #print(f, train_df[f].min(), train_df[f].max(), test_df[f].min(), test_df[f].max())
        scaler[f] = sklearn.preprocessing.StandardScaler()
        train_df_normalized[f] = scaler[f].fit_transform(train_df_normalized[f].values.reshape(-1,1))[:,0]
        for df in cvtest_normalized:
            df[f] = scaler[f].transform(df[f].values.reshape(-1,1))[:,0]
        
    normalized_keys.append(f)


/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype uint8 was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
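
The two DataConversionWarning messages above are benign: StandardScaler upcasts int64 and uint8 columns to float64. If the noise is unwanted, an explicit cast before scaling avoids it; a minimal standalone sketch (toy column, optional variation on the loop above):

import numpy as np
from sklearn.preprocessing import StandardScaler

col = np.array([1, 2, 3, 4], dtype=np.int64)   # toy integer column
scaled = StandardScaler().fit_transform(col.astype(np.float64).reshape(-1, 1))[:, 0]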

Models begin here


In [11]:
# prep CV

cv_train = []
cv_valid = []

for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
        cv_train.append(train_df_normalized.loc[tr_index])
        cv_valid.append(train_df_normalized.loc[val_index])

In [12]:
fl = normalized_keys.copy() # + m_build.get_features() + m_mgr.get_features() 

#for f in ['density_exp01', 'density_exp005', 'density_lin005', 'density_gaussian001', 'density_gaussian', 'density_gaussian01', 'density_gaussian02', 'density_gaussian04']:
#    fl.remove(f)
    
#fl.append('density_gaussian02')
#fl.append('density_exp01')


fl.remove('predicted_price_ratio')
fl.remove('manager_building0_rate')
fl.remove('manager_shortdesc_rate')
fl.remove('manager_0feature_rate')
#fl.append('manager_sort_count')

In [13]:
len(fl)


Out[13]:
238

In [14]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, GlobalMaxPooling2D, MaxPooling1D
from keras.layers import Reshape
import keras

from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers import Input


#from keras.layers.recurrent import GRU
from keras.layers import Flatten


Using TensorFlow backend.

In [15]:
#cv_train[0].interest

def buildmodel(num_inputs, shape=[(32, .1), (16, .1)]):
    # simple MLP: for each (units, dropout) pair stack Dense(relu) + Dropout,
    # finishing with a 3-way softmax over low/medium/high
    layers = [Input(shape=(num_inputs,))]

    for s in shape:
        layers.append(Dense(s[0], activation='relu')(layers[-1]))
        layers.append(Dropout(s[1])(layers[-1]))

    output = Dense(3, activation='softmax', name='output')(layers[-1])

    model = Model(inputs=layers[0], outputs=output)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam')

    return model
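
For reference, the configuration used in the training loop below, shape=[(64, .2), (32, .1)], unrolls to Input(238) -> Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) -> Dropout(0.1) -> Dense(3, softmax). An equivalent Sequential version, purely as an illustrative sketch of the same stack (same Keras API and hyperparameters, not used below):

from keras.models import Sequential
from keras.layers import Dense, Dropout

def buildmodel_sequential(num_inputs, shape=[(32, .1), (16, .1)]):
    model = Sequential()
    units, drop = shape[0]
    model.add(Dense(units, activation='relu', input_shape=(num_inputs,)))
    model.add(Dropout(drop))
    for units, drop in shape[1:]:
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(drop))
    model.add(Dense(3, activation='softmax', name='output'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model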

In [16]:
cv_train[0][fl].values.shape[1]


Out[16]:
238

In [17]:
m = buildmodel(num_inputs=cv_train[0][fl].values.shape[1])

In [18]:
# Libraries provide one-hot encoding, but it's simple enough to do by hand.
def oneheat(y):
    rv = np.zeros((len(y), 3))

    for i in [0, 1, 2]:
        rv[:,i] = (y == i)

    return rv
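
The hand-rolled oneheat matches Keras' own helper for integer labels in {0, 1, 2}; either encoding can feed categorical_crossentropy (the import path may be keras.utils.np_utils on older Keras versions):

import numpy as np
from keras.utils import to_categorical

y = np.array([0, 2, 1, 1])
assert np.array_equal(oneheat(y), to_categorical(y, num_classes=3))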

In [19]:
models = []
df_folds = []
test_preds = []

for fold in range(5):
    m = buildmodel(num_inputs=cv_train[fold][fl].values.shape[1], shape=[(64, .2), (32, .1)])

    bst_model_path = 'tmpnny.h5'

    # stop when val_loss hasn't improved for 10 epochs; the checkpoint keeps the best weights
    ES = keras.callbacks.EarlyStopping(patience=10)
    MC = keras.callbacks.ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

    tmp_train_x = cv_train[fold][fl].values
    tmp_train_y = oneheat(cv_train[fold].interest_cat)

    tmp_valid_x = cv_valid[fold][fl].values
    tmp_valid_y = oneheat(cv_valid[fold].interest_cat)

    test_x = cvtest_normalized[fold][fl].values
    
    history = m.fit(tmp_train_x, tmp_train_y, batch_size=256, epochs=120, verbose=2, validation_data=(tmp_valid_x, tmp_valid_y), callbacks=[MC, ES])
    
    m.load_weights(bst_model_path)

    tpreds = m.predict(tmp_valid_x)

    df_tmp = pd.DataFrame(tpreds)
    df_tmp.set_index(cv_valid[fold].listing_id, inplace=True)

    df_tmp.columns = ['low', 'medium', 'high']
#    df_tmp['listing_id'] = cv_valid[fold].listing_id
    df_tmp['interest_cat'] = cv_valid[fold].interest_cat.values
    #break

    #print(log_loss(self.train_info.iloc[valid_idx].interest_level, df_tmp[self.tgts]))

    df_folds.append(df_tmp)

    test_preds.append(m.predict(test_x))

    models.append(m)


Train on 39481 samples, validate on 9871 samples
Epoch 1/120
2s - loss: 0.6571 - val_loss: 0.5736
Epoch 2/120
1s - loss: 0.5670 - val_loss: 0.5530
Epoch 3/120
1s - loss: 0.5525 - val_loss: 0.5451
Epoch 4/120
1s - loss: 0.5436 - val_loss: 0.5424
Epoch 5/120
1s - loss: 0.5386 - val_loss: 0.5404
Epoch 6/120
1s - loss: 0.5351 - val_loss: 0.5373
Epoch 7/120
1s - loss: 0.5296 - val_loss: 0.5371
Epoch 8/120
1s - loss: 0.5258 - val_loss: 0.5352
Epoch 9/120
1s - loss: 0.5246 - val_loss: 0.5358
Epoch 10/120
1s - loss: 0.5217 - val_loss: 0.5348
Epoch 11/120
1s - loss: 0.5197 - val_loss: 0.5348
Epoch 12/120
1s - loss: 0.5159 - val_loss: 0.5346
Epoch 13/120
1s - loss: 0.5159 - val_loss: 0.5350
Epoch 14/120
1s - loss: 0.5117 - val_loss: 0.5347
Epoch 15/120
1s - loss: 0.5098 - val_loss: 0.5342
Epoch 16/120
1s - loss: 0.5078 - val_loss: 0.5339
Epoch 17/120
1s - loss: 0.5077 - val_loss: 0.5340
Epoch 18/120
1s - loss: 0.5044 - val_loss: 0.5357
Epoch 19/120
1s - loss: 0.5039 - val_loss: 0.5366
Epoch 20/120
1s - loss: 0.5014 - val_loss: 0.5352
Epoch 21/120
1s - loss: 0.5013 - val_loss: 0.5363
Epoch 22/120
1s - loss: 0.4992 - val_loss: 0.5357
Epoch 23/120
1s - loss: 0.4986 - val_loss: 0.5358
Epoch 24/120
1s - loss: 0.4978 - val_loss: 0.5365
Epoch 25/120
1s - loss: 0.4949 - val_loss: 0.5364
Epoch 26/120
1s - loss: 0.4949 - val_loss: 0.5378
Epoch 27/120
1s - loss: 0.4948 - val_loss: 0.5373
Train on 39481 samples, validate on 9871 samples
Epoch 1/120
1s - loss: 0.6627 - val_loss: 0.5573
Epoch 2/120
0s - loss: 0.5746 - val_loss: 0.5379
Epoch 3/120
1s - loss: 0.5579 - val_loss: 0.5310
Epoch 4/120
0s - loss: 0.5489 - val_loss: 0.5283
Epoch 5/120
1s - loss: 0.5441 - val_loss: 0.5257
Epoch 6/120
1s - loss: 0.5394 - val_loss: 0.5238
Epoch 7/120
1s - loss: 0.5345 - val_loss: 0.5220
Epoch 8/120
0s - loss: 0.5327 - val_loss: 0.5209
Epoch 9/120
1s - loss: 0.5292 - val_loss: 0.5205
Epoch 10/120
0s - loss: 0.5265 - val_loss: 0.5205
Epoch 11/120
0s - loss: 0.5248 - val_loss: 0.5199
Epoch 12/120
0s - loss: 0.5215 - val_loss: 0.5195
Epoch 13/120
0s - loss: 0.5198 - val_loss: 0.5203
Epoch 14/120
1s - loss: 0.5172 - val_loss: 0.5184
Epoch 15/120
0s - loss: 0.5130 - val_loss: 0.5187
Epoch 16/120
0s - loss: 0.5111 - val_loss: 0.5181
Epoch 17/120
0s - loss: 0.5093 - val_loss: 0.5206
Epoch 18/120
0s - loss: 0.5081 - val_loss: 0.5190
Epoch 19/120
1s - loss: 0.5072 - val_loss: 0.5199
Epoch 20/120
0s - loss: 0.5049 - val_loss: 0.5195
Epoch 21/120
0s - loss: 0.5031 - val_loss: 0.5190
Epoch 22/120
1s - loss: 0.5045 - val_loss: 0.5192
Epoch 23/120
1s - loss: 0.5008 - val_loss: 0.5209
Epoch 24/120
1s - loss: 0.5002 - val_loss: 0.5214
Epoch 25/120
1s - loss: 0.4975 - val_loss: 0.5220
Epoch 26/120
1s - loss: 0.4966 - val_loss: 0.5260
Epoch 27/120
1s - loss: 0.4951 - val_loss: 0.5239
Train on 39481 samples, validate on 9871 samples
Epoch 1/120
1s - loss: 0.6570 - val_loss: 0.5810
Epoch 2/120
0s - loss: 0.5691 - val_loss: 0.5611
Epoch 3/120
0s - loss: 0.5537 - val_loss: 0.5554
Epoch 4/120
1s - loss: 0.5443 - val_loss: 0.5520
Epoch 5/120
1s - loss: 0.5377 - val_loss: 0.5489
Epoch 6/120
1s - loss: 0.5322 - val_loss: 0.5455
Epoch 7/120
1s - loss: 0.5312 - val_loss: 0.5446
Epoch 8/120
0s - loss: 0.5272 - val_loss: 0.5441
Epoch 9/120
0s - loss: 0.5220 - val_loss: 0.5431
Epoch 10/120
0s - loss: 0.5208 - val_loss: 0.5444
Epoch 11/120
0s - loss: 0.5180 - val_loss: 0.5458
Epoch 12/120
1s - loss: 0.5173 - val_loss: 0.5440
Epoch 13/120
0s - loss: 0.5142 - val_loss: 0.5409
Epoch 14/120
0s - loss: 0.5109 - val_loss: 0.5415
Epoch 15/120
0s - loss: 0.5108 - val_loss: 0.5413
Epoch 16/120
1s - loss: 0.5099 - val_loss: 0.5403
Epoch 17/120
0s - loss: 0.5077 - val_loss: 0.5434
Epoch 18/120
0s - loss: 0.5059 - val_loss: 0.5415
Epoch 19/120
0s - loss: 0.5036 - val_loss: 0.5413
Epoch 20/120
0s - loss: 0.5023 - val_loss: 0.5427
Epoch 21/120
0s - loss: 0.5001 - val_loss: 0.5458
Epoch 22/120
0s - loss: 0.4998 - val_loss: 0.5439
Epoch 23/120
0s - loss: 0.4974 - val_loss: 0.5432
Epoch 24/120
0s - loss: 0.4966 - val_loss: 0.5431
Epoch 25/120
0s - loss: 0.4955 - val_loss: 0.5444
Epoch 26/120
0s - loss: 0.4932 - val_loss: 0.5451
Epoch 27/120
1s - loss: 0.4913 - val_loss: 0.5456
Train on 39481 samples, validate on 9871 samples
Epoch 1/120
1s - loss: 0.6385 - val_loss: 0.5600
Epoch 2/120
1s - loss: 0.5715 - val_loss: 0.5442
Epoch 3/120
1s - loss: 0.5549 - val_loss: 0.5385
Epoch 4/120
0s - loss: 0.5459 - val_loss: 0.5364
Epoch 5/120
0s - loss: 0.5402 - val_loss: 0.5331
Epoch 6/120
0s - loss: 0.5364 - val_loss: 0.5316
Epoch 7/120
1s - loss: 0.5318 - val_loss: 0.5311
Epoch 8/120
1s - loss: 0.5290 - val_loss: 0.5290
Epoch 9/120
0s - loss: 0.5268 - val_loss: 0.5296
Epoch 10/120
0s - loss: 0.5230 - val_loss: 0.5271
Epoch 11/120
1s - loss: 0.5208 - val_loss: 0.5277
Epoch 12/120
1s - loss: 0.5185 - val_loss: 0.5268
Epoch 13/120
1s - loss: 0.5158 - val_loss: 0.5264
Epoch 14/120
0s - loss: 0.5146 - val_loss: 0.5264
Epoch 15/120
0s - loss: 0.5105 - val_loss: 0.5272
Epoch 16/120
0s - loss: 0.5102 - val_loss: 0.5285
Epoch 17/120
0s - loss: 0.5071 - val_loss: 0.5285
Epoch 18/120
0s - loss: 0.5055 - val_loss: 0.5275
Epoch 19/120
1s - loss: 0.5053 - val_loss: 0.5287
Epoch 20/120
0s - loss: 0.5021 - val_loss: 0.5274
Epoch 21/120
0s - loss: 0.5012 - val_loss: 0.5268
Epoch 22/120
0s - loss: 0.4993 - val_loss: 0.5289
Epoch 23/120
0s - loss: 0.4983 - val_loss: 0.5279
Epoch 24/120
1s - loss: 0.4948 - val_loss: 0.5287
Train on 39484 samples, validate on 9868 samples
Epoch 1/120
1s - loss: 0.6558 - val_loss: 0.5653
Epoch 2/120
0s - loss: 0.5682 - val_loss: 0.5501
Epoch 3/120
0s - loss: 0.5525 - val_loss: 0.5442
Epoch 4/120
0s - loss: 0.5443 - val_loss: 0.5397
Epoch 5/120
0s - loss: 0.5408 - val_loss: 0.5383
Epoch 6/120
0s - loss: 0.5360 - val_loss: 0.5360
Epoch 7/120
0s - loss: 0.5317 - val_loss: 0.5352
Epoch 8/120
1s - loss: 0.5278 - val_loss: 0.5354
Epoch 9/120
0s - loss: 0.5260 - val_loss: 0.5346
Epoch 10/120
0s - loss: 0.5235 - val_loss: 0.5355
Epoch 11/120
0s - loss: 0.5218 - val_loss: 0.5355
Epoch 12/120
1s - loss: 0.5178 - val_loss: 0.5339
Epoch 13/120
1s - loss: 0.5168 - val_loss: 0.5341
Epoch 14/120
1s - loss: 0.5132 - val_loss: 0.5336
Epoch 15/120
0s - loss: 0.5108 - val_loss: 0.5340
Epoch 16/120
1s - loss: 0.5111 - val_loss: 0.5335
Epoch 17/120
0s - loss: 0.5081 - val_loss: 0.5328
Epoch 18/120
1s - loss: 0.5055 - val_loss: 0.5344
Epoch 19/120
1s - loss: 0.5032 - val_loss: 0.5357
Epoch 20/120
0s - loss: 0.5034 - val_loss: 0.5375
Epoch 21/120
1s - loss: 0.5015 - val_loss: 0.5351
Epoch 22/120
1s - loss: 0.4997 - val_loss: 0.5365
Epoch 23/120
0s - loss: 0.4991 - val_loss: 0.5364
Epoch 24/120
0s - loss: 0.4969 - val_loss: 0.5373
Epoch 25/120
0s - loss: 0.4950 - val_loss: 0.5387
Epoch 26/120
0s - loss: 0.4932 - val_loss: 0.5378
Epoch 27/120
0s - loss: 0.4943 - val_loss: 0.5382
Epoch 28/120
1s - loss: 0.4924 - val_loss: 0.5390

In [20]:
# Stack the out-of-fold validation predictions and score them.
df_cv = pd.concat(df_folds).sort_index()

print(log_loss(df_cv.interest_cat, df_cv[['low', 'medium', 'high']]))

# Average the five per-fold test predictions, then combine the OOF train predictions and
# the averaged test predictions into one frame indexed by listing_id.
testarray = np.array(test_preds)

tgts = ['low', 'medium', 'high']

df_test = pd.DataFrame(testarray.mean(axis=0))
df_test.columns = tgts
df_test['listing_id'] = test_df.listing_id
df_test.set_index('listing_id', inplace=True)

df_output = pd.concat([df_cv[tgts], df_test])
df_output.sort_index(inplace=True)

df_output.to_pickle('bag-model-nn-v1.pkl')


0.531062637395

In [21]:
# Also keep each fold's individual test predictions (indexed by listing_id) for downstream use.
df_fold = []
for f in range(testarray.shape[0]):
    df_fold.append(pd.DataFrame(testarray[f]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)

pickle.dump((df_output, df_fold), open('model-nn.pkl', 'wb'))
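
For downstream use, the artifacts written above can be reloaded like this (a minimal sketch; the ensembling notebook that consumes them is not part of this file):

import pickle
import pandas as pd

# combined OOF-train + averaged-test frame, plus the per-fold test predictions
df_output, df_fold = pickle.load(open('model-nn.pkl', 'rb'))
df_all = pd.read_pickle('bag-model-nn-v1.pkl')
print(df_all.shape, len(df_fold))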