Using this just for RandomForestClassifier. ExtraTrees and RandomForestRegressor variants appear further down, but they didn't help my OOB score, so they aren't part of the final run (a quick OOB-check sketch follows the first RandomForestClassifier cell below)...
In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import time
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
import lightgbm as lgbm
In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))
In [3]:
medium_price = pd.read_pickle('fin-medium-price.pkl')
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
In [4]:
for df in [train_df, test_df]:
    df['predicted_price_diff'] = np.log(df.predicted_price) - np.log(df.price)
    df['predicted_price_ratio'] = np.log(df.predicted_price) / np.log(df.price)
In [5]:
# fill in the NaN's.
for t in train_df.keys():
    nacount = train_df[t].isnull().sum()
    if nacount:
        # nacount_test = test_df[t].isnull().sum()
        print(t, nacount / len(train_df))  # , nacount_test / len(test_df))
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)
In [6]:
class MeansProcessor:
    def __init__(self, key, outkey=None, tgt='interest'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = {}
        self.tgt = tgt
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        self.global_means[self.outkey + '_level'] = df[self.tgt].mean()
        self.global_means[self.outkey + '_level_std'] = df[self.tgt].std()
        for k in df.groupby(self.key, sort=False):
            self.count[k[0]] = len(k[1])
            if len(k[1]) < 1:  # guard for degenerate groups (groupby never yields empty groups, so this is just a safety net)
                self.means[k[0]] = np.nan
                self.std[k[0]] = np.nan
            else:
                self.means[k[0]] = np.mean(k[1][self.tgt])
                self.std[k[0]] = np.std(k[1][self.tgt])

    def predict(self, df, nans=False):
        # start every row at the global mean/std (or NaN), then overwrite
        # with per-group statistics where the group was seen during fit
        for l in self.outkeys:
            df[l] = np.nan if nans else self.global_means[l]
        df[self.outkey + '_count'] = 0
        for k in df.groupby(self.key, sort=False):
            if k[0] == 0:
                continue
            if k[0] in self.means:
                df.loc[k[1].index, self.outkey + '_count'] = self.count[k[0]]
                df.loc[k[1].index, self.outkey + '_level'] = self.means[k[0]]
                df.loc[k[1].index, self.outkey + '_level_std'] = self.std[k[0]]
        return df

    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']
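# A minimal usage sketch of MeansProcessor (the toy frame and 'mgr_demo' outkey
# are hypothetical; 'interest' is the default target, as in __init__ above):
#   toy = pd.DataFrame({'manager_id': ['a', 'a', 'b'], 'interest': [0.0, 1.0, 1.0]})
#   mp = MeansProcessor('manager_id', 'mgr_demo')
#   mp.fit(toy)
#   toy = mp.predict(toy)  # adds mgr_demo_level, mgr_demo_level_std, mgr_demo_count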
# I kept the same index randomization (with fixed seed) so I could validate this code against
# the original...
target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
def proc_fold(fold):
    train_index = fold[0]
    test_index = fold[1]
    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()

    m_build = MeansProcessor('building_id', 'building_sort')
    m_build.fit(cv_train)
    cv_valid = m_build.predict(cv_valid)
    cv_test = m_build.predict(cv_test)

    m_mgr = MeansProcessor('manager_id', 'manager_sort')
    m_mgr.fit(cv_train)
    cv_valid = m_mgr.predict(cv_valid)
    cv_test = m_mgr.predict(cv_test)

    m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
    m_comb.fit(cv_train)
    cv_valid = m_comb.predict(cv_valid)
    cv_test = m_comb.predict(cv_test)

    return cv_train, cv_valid, cv_test
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]
#with Pool(5) as pool:
# rv = pool.map(proc_fold, folds)
import pickle
try:
    rv = pickle.load(open('0422-model-groupfeatures_nonan.pkl', 'rb'))
except:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)
    pickle.dump(rv, open('0422-model-groupfeatures_nonan.pkl', 'wb'))
# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()
#cv_test = [r[2] for r in rv]
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
In [7]:
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_df.interest_cat)]
In [8]:
for df in [train_df] + cv_test:
    df['price_t'] = df['price_t'].clip(0, 13000)
    df['price_per_room'] = df['price_per_room'].clip(0, 13000)
    #df['density_lin005'] = df['density_lin005'].clip(-50, 50)
    df['predicted_price_ratio'] = df['predicted_price_ratio'].clip(-50, 50)
In [9]:
train_df_normalized = train_df.copy()
cvtest_normalized = [df.copy() for df in cv_test]
train_df_normalized['listing_id_norm'] = train_df_normalized['listing_id']
for df in cvtest_normalized:
    df['listing_id_norm'] = df['listing_id']

normalized_keys = []
scaler = {}
for f in train_df.keys():
    if f[0:2] == 'f_' or f[0:3] == 'fm_':
        train_df_normalized[f] = train_df_normalized[f].clip(0, 1)
        for df in cvtest_normalized:
            df[f] = df[f].clip(0, 1)
    elif 'interest' in f or f == 'listing_id' or f == 'index':
        continue
    elif f == 'created' or train_df[f].dtype == 'O':
        train_df_normalized.drop(f, axis=1, inplace=True)
        for df in cvtest_normalized:
            df.drop(f, axis=1, inplace=True)
        continue
    else:
        #print(f, train_df[f].min(), train_df[f].max(), test_df[f].min(), test_df[f].max())
        scaler[f] = sklearn.preprocessing.StandardScaler()
        train_df_normalized[f] = scaler[f].fit_transform(train_df_normalized[f].values.reshape(-1, 1))[:, 0]
        for df in cvtest_normalized:
            df[f] = scaler[f].transform(df[f].values.reshape(-1, 1))[:, 0]
    normalized_keys.append(f)
models begin here
In [10]:
fl = normalized_keys.copy() + m_build.get_features() + m_mgr.get_features()
#for f in ['density_exp01', 'density_exp005', 'density_lin005', 'density_gaussian001', 'density_gaussian', 'density_gaussian01', 'density_gaussian02', 'density_gaussian04']:
# fl.remove(f)
#fl.append('density_gaussian02')
#fl.append('density_exp01')
fl.remove('predicted_price_ratio')
fl.remove('manager_building0_rate')
fl.remove('manager_shortdesc_rate')
fl.remove('manager_0feature_rate')
#fl.append('manager_sort_count')
In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint
In [12]:
# RandomForestClassifier
start = time.time()

# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df_normalized.loc[tr_index])
    cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    models.append(sklearn.ensemble.RandomForestClassifier(n_estimators=10, min_samples_split=8, min_samples_leaf=4,
                                                          n_jobs=-1, class_weight=None, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)

    # manual early stopping: keep growing the forest with warm_start and stop
    # once validation log loss hasn't improved for 50 added estimators
    best = None
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start=True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
        preds = models[-1].predict_proba(cv_valid[f][fl].values)
        score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
    #models[-1].set_params(n_estimators = best[1])
    #print('done training')
    cv_preds.append(models[-1].predict_proba(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end - start)
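As an aside, the OOB comparison mentioned at the top of the notebook looked roughly like this minimal sketch (it reuses fl and train_df_normalized from above; the n_estimators value here is illustrative, not tuned):

# oob_score=True scores each tree on its out-of-bag rows, giving a cheap
# validation estimate (mean accuracy) without a separate holdout set
rf_oob = sklearn.ensemble.RandomForestClassifier(n_estimators=500, oob_score=True,
                                                 n_jobs=-1, random_state=0)
rf_oob.fit(train_df_normalized[fl].values, train_df_normalized.interest_cat.values)
print(rf_oob.oob_score_)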
In [15]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict_proba(cvtest_normalized[i][fl]))
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)
In [16]:
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('model-output-rf.pkl', 'wb'))
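The tuple saved above can be pulled back in later for blending; a minimal sketch, assuming the same file name and (df_output, df_fold) structure as the dump above (the _rf variable names are hypothetical):

# reload the averaged CV/test output and the per-fold test predictions
df_output_rf, df_fold_rf = pickle.load(open('model-output-rf.pkl', 'rb'))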
In [17]:
# RandomForestRegressor
start = time.time()

# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df_normalized.loc[tr_index])
    cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    models.append(sklearn.ensemble.RandomForestRegressor(n_estimators=10, min_samples_split=8, min_samples_leaf=4,
                                                         n_jobs=-1, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)  # continuous interest target (interest_cat is for the classifiers)

    # manual early stopping via warm_start, scored by validation RMSE
    best = None
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start=True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
        preds = models[-1].predict(cv_valid[f][fl].values)
        #score = sklearn.metrics.log_loss(cv_valid[f].interest, preds)
        score = np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds))
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
    #models[-1].set_params(n_estimators = best[1])
    #print('done training')
    cv_preds.append(models[-1].predict(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f)  #, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds)))

df_cvpreds = pd.concat(df_cvpreds)
print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction)))

end = time.time()
print(end - start)
In [18]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cvtest_normalized[i][fl]))
In [19]:
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)
df_output.to_pickle('bag-model-rfr-v1.pkl')
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('bagta-model-rfr-v1.pkl', 'wb'))
In [23]:
# ExtraTreesRegressor
start = time.time()

# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df_normalized.loc[tr_index])
    cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    models.append(sklearn.ensemble.ExtraTreesRegressor(n_estimators=10, min_samples_split=8, min_samples_leaf=4,
                                                       n_jobs=-1, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)  # continuous interest target (interest_cat is for the classifiers)

    # manual early stopping via warm_start, scored by validation RMSE
    best = None
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start=True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest.values)
        preds = models[-1].predict(cv_valid[f][fl].values)
        #score = sklearn.metrics.log_loss(cv_valid[f].interest, preds)
        score = np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds))
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break

    cv_preds.append(models[-1].predict(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['prediction']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest'] = cv_valid[f].interest
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f)  #, np.sqrt(sklearn.metrics.mean_squared_error(cv_valid[f].interest, preds)))

df_cvpreds = pd.concat(df_cvpreds)
print('combined: ', np.sqrt(sklearn.metrics.mean_squared_error(df_cvpreds.interest, df_cvpreds.prediction)))

end = time.time()
print(end - start)
In [24]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cvtest_normalized[i][fl]))
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['prediction'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[['prediction']]])
df_output.sort_index(inplace=True)
#df_output.to_pickle('0423-model-etr-v1.pkl')
df_fold = []
for i in range(len(testpreds)):
    df_fold.append(pd.DataFrame(testpreds[i]))
    df_fold[-1]['listing_id'] = test_df.listing_id
    df_fold[-1].sort_values('listing_id', inplace=True)
    df_fold[-1].set_index('listing_id', inplace=True)
pickle.dump((df_output, df_fold), open('bagta-model-etr-v1.pkl', 'wb'))
In [103]:
df_output
Out[103]:
In [79]:
fl = ['price', 'manager_sort_level', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'density_exp005', 'predicted_price_diff', 'created_hour']
# ExtraTreesClassifier on a reduced feature list
start = time.time()

# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df_normalized.loc[tr_index])
    cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    #dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    #dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    #models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    models.append(sklearn.ensemble.ExtraTreesClassifier(n_estimators=10, max_features=len(fl),
                                                        n_jobs=-1, class_weight=None, random_state=0))
    models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)

    # manual early stopping: keep growing the forest with warm_start and stop
    # once validation log loss hasn't improved for 50 added estimators
    best = None
    for nest in range(20, 2000, 10):
        models[-1].set_params(warm_start=True, n_estimators=nest)
        models[-1].fit(cv_train[f][fl].values, cv_train[f].interest_cat.values)
        preds = models[-1].predict_proba(cv_valid[f][fl].values)
        score = sklearn.metrics.log_loss(cv_valid[f].interest_cat, preds)
        print(nest, score)
        if best is None or score < best[0]:
            best = (score, nest)
        elif nest - best[1] >= 50:
            break
    #models[-1].set_params(n_estimators = best[1])
    #print('done training')
    cv_preds.append(models[-1].predict_proba(cv_valid[f][fl].values))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end - start)
In [34]:
# exclamation-mark ('!') features from the listing description
train_df['desc_xp_count'] = train_df.description.apply(lambda x: x.count('!'))
train_df['desc_xp_ratio'] = train_df.description.apply(lambda x: (x.count('!') / len(x)) if len(x) else 0)
train_df['desc_xp_first'] = train_df.description.apply(lambda x: x.find('!') if len(x) else 0)
train_df['desc_xp_inv_first'] = train_df.description.apply(lambda x: (len(x) - x.find('!')) if len(x) else 0)
train_df['desc_xp_inv_mult'] = train_df.desc_xp_count * train_df.desc_xp_inv_first
In [30]:
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df.loc[tr_index])
    cv_valid.append(train_df.loc[val_index])
In [74]:
t4_params = {
    'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
    'num_leaves': 2**5, 'learning_rate': 0.02, 'max_depth': -1, 'metric': ['multi_logloss'],
    'max_bin': 255, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0,
    'min_split_gain': 0.25, 'min_child_weight': .5, 'min_child_samples': 20, 'scale_pos_weight': 1}
lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 3
In [78]:
fl = ['price', 'manager_sort_level', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'density_exp005', 'predicted_price_diff', 'created_hour']
start = time.time()
# prep CV
cv_train = []
cv_valid = []
for tr_index, val_index in kf.split(train_df.index, train_df.interest_cat):
    cv_train.append(train_df_normalized.loc[tr_index])
    cv_valid.append(train_df_normalized.loc[val_index])

models = []
cv_preds = []
df_cvpreds = []
for f in range(5):
    dset = lgbm.Dataset(cv_train[f][fl], cv_train[f].interest_cat, silent=True)
    dset_val = lgbm.Dataset(cv_valid[f][fl], cv_valid[f].interest_cat, silent=True)
    models.append(lgbm.train(lgbm_params, dset, early_stopping_rounds=100, verbose_eval=False, valid_sets=dset_val, num_boost_round=2000))
    #print('done training')
    cv_preds.append(models[-1].predict(cv_valid[f][fl]))

    df_cvpreds.append(pd.DataFrame(cv_preds[f], columns=['low', 'medium', 'high']))
    df_cvpreds[f].index = cv_valid[f].index
    df_cvpreds[f]['interest_cat'] = cv_valid[f].interest_cat
    df_cvpreds[f]['listing_id'] = cv_valid[f].listing_id
    df_cvpreds[f].set_index('listing_id', inplace=True)

    print(f, sklearn.metrics.log_loss(df_cvpreds[f].interest_cat, df_cvpreds[f][['low', 'medium', 'high']]))

df_cvpreds = pd.concat(df_cvpreds)
tgts = ['low', 'medium', 'high']
print('combined: ', sklearn.metrics.log_loss(df_cvpreds.interest_cat, df_cvpreds[tgts]))

end = time.time()
print(end - start)
In [77]:
testpreds = []
for i, m in enumerate(models):
    testpreds.append(m.predict(cv_test[i][fl], num_iteration=m.best_iteration))
f = np.array(testpreds).mean(axis=0)
df_testpreds = pd.DataFrame(f, columns=['low', 'medium', 'high'])
df_testpreds.index = test_df.index
df_testpreds['listing_id'] = test_df.listing_id
df_testpreds.set_index('listing_id', inplace=True)
df_output = pd.concat([df_testpreds, df_cvpreds[tgts]])
df_output.sort_index(inplace=True)
df_output.to_pickle('0423-minimod-lgbm-lf-v1.pkl')
In [23]:
corrs = []
for k in fl:
    corrs.append((k, train_df[k].corr(train_df.interest)))
In [24]:
sorted(corrs, key=operator.itemgetter(1))
Out[24]:
In [25]:
pd.Series.corr?
In [38]:
train_df[~(train_df.f_doorman == 1)].price.mean()
Out[38]: