In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import sklearn.cluster
import Levenshtein
from multiprocessing import Pool
In [2]:
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')
features_to_use = pickle.load(open('fin-dprep-flist.pkl', 'rb'))
medium_price = pd.read_pickle('fin-medium-price-r2.pkl')
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)
In [3]:
adams = pd.read_pickle('features-adams.pkl')
train_df = pd.merge(train_df, adams, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, adams, left_on='listing_id', right_index=True)
In [4]:
train_df["predicted_price_diff"] = np.log(train_df["price"]) - np.log(train_df["predicted_price"])
test_df["predicted_price_diff"] = np.log(test_df["price"]) - np.log(test_df["predicted_price"])
In [5]:
class MeansProcessor:
    """Per-group statistics of the target (count, mean, std), fit on training folds only."""
    def __init__(self, key, outkey=None, tgt='interest_cat'):
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.count = {}
        self.means = {}
        self.std = {}
        self.global_means = 0
        self.tgt = tgt
        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        self.global_means = df[self.tgt].mean()
        for name, group in df.groupby(self.key, sort=False):
            self.count[name] = len(group)
            if len(group) < 1:  # guard against empty groups
                self.means[name] = np.nan
                self.std[name] = np.nan
            else:
                self.means[name] = np.mean(group[self.tgt])
                self.std[name] = np.std(group[self.tgt])

    def predict(self, df):
        for l in self.outkeys:
            df[l] = np.nan  # groups unseen at fit time stay NaN
        df[self.outkey + '_count'] = 0
        for name, group in df.groupby(self.key, sort=False):
            if name == 0:  # skip the placeholder/unknown group
                continue
            if name in self.means:
                df.loc[group.index, self.outkey + '_count'] = self.count[name]
                df.loc[group.index, self.outkey + '_level'] = self.means[name]
                df.loc[group.index, self.outkey + '_level_std'] = self.std[name]
        return df

    def get_features(self):
        return self.outkeys.copy() + [self.outkey + '_count']
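# Minimal sketch on toy data (names here are illustrative, not competition data):
# fit() learns per-group count/mean/std of the target, and predict() writes them
# back as <outkey>_count, <outkey>_level and <outkey>_level_std columns.
_demo = pd.DataFrame({'g': ['a', 'a', 'b'], 'interest_cat': [0, 1, 2]})
_enc = MeansProcessor('g', 'g_sort', tgt='interest_cat')
_enc.fit(_demo)
_demo = _enc.predict(_demo)  # adds g_sort_count, g_sort_level, g_sort_level_std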
# I kept the same index randomization (with a fixed seed) so I could validate
# this code against the original.
target_num_map = {'low':0, 'medium':1, 'high':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
def proc_fold(fold):
train_index = fold[0]
test_index = fold[1]
cv_train = train_df.iloc[train_index]
cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
cv_test = test_df.copy()
m_build = MeansProcessor('building_id', 'building_sort')
m_build.fit(cv_train)
cv_valid = m_build.predict(cv_valid)
cv_test = m_build.predict(cv_test)
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_mgr.fit(cv_train)
cv_valid = m_mgr.predict(cv_valid)
cv_test = m_mgr.predict(cv_test)
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
m_comb.fit(cv_train)
cv_valid = m_comb.predict(cv_valid)
cv_test = m_comb.predict(cv_test)
return cv_train, cv_valid, cv_test
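# Each fold returns (the raw training slice, the held-out slice with group
# encodings fit on the training slice only, and an encoded copy of the test
# set); fitting per fold keeps each row's own target out of its encoding.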
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]
# cache the expensive group-feature computation; delete the pickle to recompute
try:
    rv = pickle.load(open('0420-model-groupfeatures.pkl', 'rb'))
except FileNotFoundError:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)
    pickle.dump(rv, open('0420-model-groupfeatures.pkl', 'wb'))
# dummy processors, used only to enumerate the generated feature names
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')
group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()
cv_test = []
for r in rv:
cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))
cv_allvalid = pd.concat([r[1] for r in rv])
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)
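# train_df now carries out-of-fold group encodings (each row was scored by
# processors fit on the other four folds), while cv_test holds one encoded copy
# of test_df per fold; the per-fold test predictions get averaged downstream.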
In [6]:
train_ids = []
val_ids = []
for dev_index, val_index in kf.split(range(train_df.shape[0]), train_df.interest_cat):
train_ids.append(train_df.iloc[dev_index].listing_id.values)
val_ids.append(train_df.iloc[val_index].listing_id.values)
In [7]:
adams_features = ['num_rot15_X', 'num_rot15_Y', 'num_rot30_X', 'num_rot30_Y', 'num_rot45_X', 'num_rot45_Y', 'num_rot60_X', 'num_rot60_Y', 'num_rho', 'num_phi', 'num_cap_share', 'num_nr_of_lines', 'num_redacted', 'num_email', 'num_phone_nr']
In [8]:
#fl = features_to_use + m_build.get_features() + m_mgr.get_features() + m_comb.get_features() + tfidf_fn
fl = features_to_use.copy() + group_features + adams_features.copy()
#fl.remove('price')
#fl.remove('price_t')
#fl.remove('price_per_room')
fl.append('predicted_price')
fl.append('predicted_price_diff')
fl.append('manager_lazy_rate')
fl.append('density_exp01')
In [15]:
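# run3_to_stackdf packs a classifier run into stacking inputs: element 0 stacks
# the out-of-fold train predictions with the fold-averaged test predictions,
# indexed by listing_id; element 1 keeps the raw per-fold test predictions.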
def run3_to_stackdf(run):
df_testpreds3 = pd.DataFrame(run[2].mean(axis=0))
df_testpreds3.columns = ['low', 'medium', 'high']
df_testpreds3['listing_id'] = test_df.listing_id
df_allpreds3 = pd.concat([run[1][['low', 'medium', 'high', 'listing_id']], df_testpreds3])
df_allpreds3.sort_values('listing_id', inplace=True)
df_allpreds3.set_index('listing_id', inplace=True)
df_fold = []
for f in range(run[2].shape[0]):
df_fold.append(pd.DataFrame(run[2][f]))
df_fold[-1]['listing_id'] = test_df.listing_id
df_fold[-1].sort_values('listing_id', inplace=True)
df_fold[-1].set_index('listing_id', inplace=True)
return (df_allpreds3, df_fold)
In [16]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
param = {}
param['objective'] = 'multi:softprob'
#param['tree_method'] = 'hist'
param['eta'] = 0.02
param['max_depth'] = 6
param['silent'] = 1
param['num_class'] = 3
param['eval_metric'] = "mlogloss"
param['min_child_weight'] = 1
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['seed'] = seed_val
#param['base_score'] = [np.mean(train_y == i) for i in [0, 1, 2]]
plst = list(param.items())
xgtrain = xgb.DMatrix(train_X, label=train_y)
if test_y is not None:
xgtest = xgb.DMatrix(test_X, label=test_y)
watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=10)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    # best_ntree_limit is only set when early stopping runs; 0 means "use all trees"
    pred_test_y = model.predict(xgtest, ntree_limit=getattr(model, 'best_ntree_limit', 0))
return pred_test_y, model
In [17]:
def run_cv(train_df, cv_test, kf, features_to_use):
train_X = train_df[features_to_use]
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
cv_preds = []
cv_scores = []
models = []
test_preds = []
fold = 0
for dev_index, val_index in kf.split(range(train_X.shape[0]), train_y):
dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
dev_y, val_y = train_y[dev_index], train_y[val_index]
preds, model = runXGB(dev_X, dev_y, val_X, val_y)
models.append(model)
cv_scores.append(log_loss(val_y, preds))
print(cv_scores)
cut_df = train_df.iloc[val_index]
out_df = pd.DataFrame(preds)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = cut_df.listing_id.values
interest = cut_df.interest_level.apply(lambda x: target_num_map[x])
out_df['interest_tgt'] = interest.values
cv_preds.append(out_df)
        xgtest = xgb.DMatrix(cv_test[fold][features_to_use])
        test_preds.append(model.predict(xgtest, ntree_limit=model.best_ntree_limit))
        fold += 1  # step to this fold's encoded copy of the test set
df_cv = pd.concat(cv_preds)
print(log_loss(df_cv.interest_tgt, df_cv[['low', 'medium', 'high']]))
apreds = np.array(test_preds)
return models, df_cv, apreds
In [18]:
rv3 = run_cv(train_df, cv_test, kf, fl)
In [19]:
dfs3 = run3_to_stackdf(rv3)
pickle.dump(dfs3, open('modeloutput-xgb-clf-r3.pkl', 'wb'))
In [9]:
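# run_to_stackdf does the same packing for the regression model below, which
# emits a single 'level' column instead of low/medium/high probabilities.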
def run_to_stackdf(run):
df_testpreds = pd.DataFrame(run[2].mean(axis=0))
df_testpreds.columns = ['level']
df_testpreds['listing_id'] = cv_test[0].listing_id
df_allpreds = pd.concat([run[1][['level', 'listing_id']], df_testpreds])
df_allpreds.sort_values('listing_id', inplace=True)
df_allpreds.set_index('listing_id', inplace=True)
df_fold = []
for f in range(run[2].shape[0]):
df_fold.append(pd.DataFrame(run[2][f]))
df_fold[-1]['listing_id'] = test_df.listing_id
df_fold[-1].sort_values('listing_id', inplace=True)
df_fold[-1].set_index('listing_id', inplace=True)
return (df_allpreds, df_fold)
In [10]:
def runXGB1(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
param = {}
param['objective'] = 'reg:logistic'
#param['tree_method'] = 'hist'
param['eta'] = 0.02
param['max_depth'] = 6
param['silent'] = 1
param['eval_metric'] = "rmse"
param['min_child_weight'] = 1
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['seed'] = seed_val
param['base_score'] = train_y.mean()
plst = list(param.items())
xgtrain = xgb.DMatrix(train_X, label=train_y)
if test_y is not None:
xgtest = xgb.DMatrix(test_X, label=test_y)
watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=10)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    # best_ntree_limit is only set when early stopping runs; 0 means "use all trees"
    pred_test_y = model.predict(xgtest, ntree_limit=getattr(model, 'best_ntree_limit', 0))
return pred_test_y, model
In [11]:
medium_regression_tgt = (.5 + (9/13)) / 2
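# This maps the three classes onto [0, 1] for the regression target:
# low -> 0, high -> 1, medium -> (0.5 + 9/13) / 2 ~= 0.596. The 9/13 appears to
# be a hand-tuned spacing for the medium class rather than a derived constant.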
def run_cv1(train_df, cv_test, kf, features_to_use):
train_X = train_df[features_to_use] #sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
train_y3 = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
train_y = np.zeros_like(train_y3, dtype=np.float32)
train_y[train_y3 == 1] = medium_regression_tgt
train_y[train_y3 == 2] = 1
cv_preds = []
cv_scores = []
models = []
test_preds = []
fold = 0
for dev_index, val_index in kf.split(range(train_X.shape[0]), train_y):
dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
dev_y, val_y = train_y[dev_index], train_y[val_index]
preds, model = runXGB1(dev_X, dev_y, val_X, val_y)
models.append(model)
cv_scores.append(model.best_score)
print(cv_scores)
cut_df = train_df.iloc[val_index]
out_df = pd.DataFrame(preds)
out_df.columns = ["level"]
out_df["listing_id"] = cut_df.listing_id.values
        out_df['interest_tgt'] = val_y
cv_preds.append(out_df)
        xgtest = xgb.DMatrix(cv_test[fold][features_to_use])
        test_preds.append(model.predict(xgtest, ntree_limit=model.best_ntree_limit))
        fold += 1  # step to this fold's encoded copy of the test set
df_cv = pd.concat(cv_preds)
    print(np.sqrt(mean_squared_error(df_cv.interest_tgt, df_cv.level)))
apreds = np.array(test_preds)
return models, df_cv, apreds
In [12]:
rv1 = run_cv1(train_df, cv_test, kf, fl)
In [13]:
dfs1 = run_to_stackdf(rv1)
pickle.dump(dfs1, open('modeloutput-xgb-reg-r3.pkl', 'wb'))