NN model
In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import time
import sklearn.cluster
#import Levenshtein
from multiprocessing import Pool
#import lightgbm as lgbm
In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))
In [3]:
medium_price = pd.read_pickle('fin-medium-price.pkl')
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
In [4]:
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
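As a quick sanity check (numbers are made up, not from the data): a listing priced at 1800 with a predicted price of 2000 is priced below what the model expects, so the diff is positive and the ratio sits just above 1.

np.log(2000) - np.log(1800)   # ~0.105 -> listed roughly 10% below the predicted price
np.log(2000) / np.log(1800)   # ~1.014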
In [5]:
# fill in the NaN's.
for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
        # nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))  # , nacount_test / len(test_df))
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)
In [6]:
class MeansProcessor:
    # per-group target statistics: mean/std/count of `tgt` for each value of `key`,
    # with the global mean/std as the fallback for unseen groups
    def __init__(self, key, outkey=None, tgt='interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        self.tgt = tgt
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()
        for k in df.groupby(self.key, sort=False):
            self.count[k[0]] = len(k[1])
            if len(k[1]) < 0:  # never true as written, so every group gets a real mean/std
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])

    def predict(self, df, nans=False):
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]
        df[self.outkey + '_count'] = 0
        for k in df.groupby(self.key, sort=False):
            if k[0] == 0:  # skip the 0 key (the fillna placeholder)
                continue
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]
        return df

    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']
# I kept the same index randomization (with a fixed seed) so I could validate this code against
# the original...
target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()

    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]
#with Pool(5) as pool:
# rv = pool.map(proc_fold, folds)
import pickle
try:
    rv = pickle.load(open('bag-model-groupfeatures_nonan.pkl', 'rb'))
except:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)
    pickle.dump(rv, open('bag-model-groupfeatures_nonan.pkl', 'wb'))
# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()
#cv_test = [r[2] for r in rv]
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
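For reference, a minimal sketch of what MeansProcessor produces, on a made-up two-manager frame (toy data, not part of the pipeline): per-group mean/std/count of the target learned from the fit frame, with unseen groups falling back to the global mean/std and a count of 0.

toy_train = pd.DataFrame({'manager_id': ['a', 'a', 'b'], 'interest': [0, 1, 2]})
toy_valid = pd.DataFrame({'manager_id': ['a', 'c']})
mp = MeansProcessor('manager_id', 'mgr_sort')
mp.fit(toy_train)
mp.predict(toy_valid)[mp.get_features()]
# manager 'a' -> level 0.5, level_std 0.5, count 2; unseen 'c' -> global mean/std, count 0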
In [7]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]
In [8]:
for df in [train_df] + cv_test:
    df['price_t'] = df['price_t'].clip(0, 13000)
    df['price_per_room'] = df['price_per_room'].clip(0, 13000)
    # df['density_lin005'] = df['density_lin005'].clip(-50, 50)
    df['predicted_price_ratio'] = df['predicted_price_ratio'].clip(-50, 50)
In [9]:
train_df.pos.dtype == 'O'
Out[9]:
In [10]:
train_df_normalized = train_df.copy()
cvtest_normalized = [df.copy() for df in cv_test]
train_df_normalized['listing_id_norm'] = train_df_normalized['listing_id']
for df in cvtest_normalized:
    df['listing_id_norm'] = df['listing_id']
normalized_keys = []
scaler = {}
for f in train_df.keys():
    if f[0:2] == 'f_' or f[0:3] == 'fm_':
        train_df_normalized[f] = train_df_normalized[f].clip(0, 1)
        for df in cvtest_normalized:
            df[f] = df[f].clip(0, 1)
    elif 'interest' in f or f == 'listing_id' or f == 'index':
        continue
    elif f == 'created' or train_df[f].dtype == 'O':
        train_df_normalized.drop(f, axis=1, inplace=True)
        for df in cvtest_normalized:
            df.drop(f, axis=1, inplace=True)
        continue
    else:
        #print(f, train_df[f].min(), train_df[f].max(), test_df[f].min(), test_df[f].max())
        scaler[f] = sklearn.preprocessing.StandardScaler()
        train_df_normalized[f] = scaler[f].fit_transform(train_df_normalized[f].values.reshape(-1,1))[:,0]
        for df in cvtest_normalized:
            df[f] = scaler[f].transform(df[f].values.reshape(-1,1))[:,0]
    normalized_keys.append(f)
Models begin here
In [11]:
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df_normalized.loc[tr_index])
    cv_valid.append(train_df_normalized.loc[val_index])
In [12]:
fl = normalized_keys.copy() # + m_build.get_features() + m_mgr.get_features()
#for f in ['density_exp01', 'density_exp005', 'density_lin005', 'density_gaussian001', 'density_gaussian', 'density_gaussian01', 'density_gaussian02', 'density_gaussian04']:
# fl.remove(f)
#fl.append('density_gaussian02')
#fl.append('density_exp01')
fl.remove('predicted_price_ratio')
fl.remove('manager_building0_rate')
fl.remove('manager_shortdesc_rate')
fl.remove('manager_0feature_rate')
#fl.append('manager_sort_count')
In [13]:
len(fl)
Out[13]:
In [14]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, GlobalMaxPooling2D, MaxPooling1D
from keras.layers import Reshape
import keras
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers import Input
#from keras.layers.recurrent import GRU
from keras.layers import Flatten
In [15]:
#cv_train[0].interest
def buildmodel(num_inputs, shape=[(32, .1), (16, .1)]):
    layers = [Input(shape=(num_inputs,))]
    for s in shape:
        layers.append(Dense(s[0], activation='relu')(layers[-1]))
        layers.append(Dropout(s[1])(layers[-1]))
    output = Dense(3, activation='softmax', name='output')(layers[-1])
    model = Model(inputs=layers[0], outputs=output)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam')
    return model
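With the default shape this builds Input -> Dense(32, relu) -> Dropout(0.1) -> Dense(16, relu) -> Dropout(0.1) -> Dense(3, softmax), compiled with categorical cross-entropy and Adam. A quick way to confirm (the num_inputs value here is arbitrary):

buildmodel(num_inputs=10).summary()   # default shape=[(32, .1), (16, .1)]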
In [16]:
cv_train[0][fl].values.shape[1]
Out[16]:
In [17]:
m = buildmodel(num_inputs=cv_train[0][fl].values.shape[1])
In [18]:
# plenty of code to do this, but it's simple enough
def oneheat(y):
    rv = np.zeros((len(y), 3))
    for i in [0, 1, 2]:
        rv[:, i] = (y == i)
    return rv
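For example, assuming interest_cat uses the same 0/1/2 coding as target_num_map above:

oneheat(np.array([0, 2, 1]))
# array([[1., 0., 0.],
#        [0., 0., 1.],
#        [0., 1., 0.]])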
In [19]:
models = []
df_folds = []
test_preds = []
for fold in range(5):
    m = buildmodel(num_inputs=cv_train[fold][fl].values.shape[1], shape=[(64, .2), (32, .1)])

    # early stopping on validation loss, keeping the best weights via checkpointing
    bst_model_path = 'tmpnny.h5'
    ES = keras.callbacks.EarlyStopping(patience=10)
    MC = keras.callbacks.ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

    tmp_train_x = cv_train[fold][fl].values
    tmp_train_y = oneheat(cv_train[fold].interest_cat)
    tmp_valid_x = cv_valid[fold][fl].values
    tmp_valid_y = oneheat(cv_valid[fold].interest_cat)
    test_x = cvtest_normalized[fold][fl].values

    history = m.fit(tmp_train_x, tmp_train_y, batch_size=256, epochs=120, verbose=2,
                    validation_data=(tmp_valid_x, tmp_valid_y), callbacks=[MC, ES])
    m.load_weights(bst_model_path)

    # out-of-fold predictions for the validation slice, indexed by listing_id
    tpreds = m.predict(tmp_valid_x)
    df_tmp = pd.DataFrame(tpreds)
    df_tmp.set_index(cv_valid[fold].listing_id, inplace=True)
    df_tmp.columns = ['low', 'medium', 'high']
    # df_tmp['listing_id'] = cv_valid[fold].listing_id
    df_tmp['interest_cat'] = cv_valid[fold].interest_cat.values
    #break
    #print(log_loss(self.train_info.iloc[valid_idx].interest_level, df_tmp[self.tgts]))
    df_folds.append(df_tmp)
    test_preds.append(m.predict(test_x))
    models.append(m)
In [20]:
df_cv = pd.concat(df_folds).sort_index()
print(log_loss(df_cv.interest_cat, df_cv[['low', 'medium', 'high']]))
testarray = np.array(test_preds.copy())
tgts = ['low', 'medium', 'high']
df_test = pd.DataFrame(testarray.mean(axis=0))
df_test.columns = tgts
df_test['listing_id'] = test_df.listing_id
df_test.set_index('listing_id', inplace=True)
df_output = pd.concat([df_cv[tgts], df_test])
df_output.sort_index(inplace=True)
df_output.to_pickle('bag-model-nn-v1.pkl')
In [21]:
df_fold = []
for f in range(testarray.shape[0]):
    df_fold.append(pd.DataFrame(testarray[f]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('model-nn.pkl', 'wb'))