In [1]:
import gc
import pickle
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from multiprocessing import Pool
import matplotlib.pyplot as plt
from datetime import datetime
from numba import jit
from operator import itemgetter
import lightgbm as lgb
import xgboost
In [2]:
df_train_gt = pd.read_csv('train.csv', index_col='order_id')
In [3]:
'''
This kernel implements the O(n²) F1-score expectation-maximization algorithm presented in
"Ye, N., Chai, K., Lee, W., and Chieu, H. Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."

It solves argmax_{0 <= k <= n, [[None]]} E[F1(P, k, [[None]])],
where [[None]] is the indicator for additionally predicting the label "None",
given posteriors P = [p_1, p_2, ..., p_n] with p_1 > p_2 > ... > p_n,
under the assumption of label independence, via dynamic programming in O(n²).
'''
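# A quick hand-worked sanity check (mine, not from the original kernel): with
# independent labels and P = [0.9, 0.3], predicting only the top item gives
#   E[F1 | k=1] = 0.27 * (2/3) + 0.63 * 1.0 = 0.81,
# while predicting both items gives
#   E[F1 | k=2] = 0.27 * 1.0 + (0.63 + 0.03) * (2/3) = 0.71,
# so the expectation-maximizing choice here is the single most likely item.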
class F1Optimizer():
    def __init__(self):
        pass

    @staticmethod
    @jit
    def get_expectations(P, pNone=None):
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        # DP_C[i, j] = probability that exactly i of the j most likely labels are positive
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]
        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1.0 / i
            DP_SNone[i] = 1.0 / (i + 1)
        for k in range(n, -1, -1):
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        # row 0: E[F1] when "None" is also predicted; row 1: without "None"
        return np.array(expectations[::-1]).T

    @staticmethod
    @jit
    def maximize_expectation(P, pNone=None):
        expectations = F1Optimizer.get_expectations(P, pNone)
        ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
        max_f1 = expectations[ix_max]
        predNone = ix_max[0] == 0
        best_k = ix_max[1]
        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)
def print_best_prediction(P, pNone=None):
    print("Maximize F1-Expectation")
    print("=" * 23)
    P = np.sort(P)[::-1]
    n = P.shape[0]
    L = ['L{}'.format(i + 1) for i in range(n)]

    if pNone is None:
        print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
        pNone = (1.0 - P).prod()

    PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
    print("Posteriors: {} (n={})".format(PL, n))
    print("p(None|x)={}".format(pNone))

    opt = F1Optimizer.maximize_expectation(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += L[:opt[0]]
    f1_max = opt[2]

    print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))
def save_plot(P, filename='expected_f1.png'):
    E_F1 = pd.DataFrame(F1Optimizer.get_expectations(P).T, columns=["/w None", "/wo None"])
    best_k, _, max_f1 = F1Optimizer.maximize_expectation(P)

    plt.style.use('ggplot')
    plt.figure()
    E_F1.plot()
    plt.title('Expected F1-Score for \n {}'.format("P = [{}]".format(",".join(map(str, P)))), fontsize=12)
    plt.xlabel('k')
    plt.xticks(np.arange(0, len(P) + 1, 1.0))
    plt.ylabel('E[F1(P,k)]')
    plt.plot([best_k], [max_f1], 'o', color='#000000', markersize=4)
    plt.annotate('max E[F1(P,k)] = E[F1(P,{})] = {:.5f}'.format(best_k, max_f1), xy=(best_k, max_f1),
                 xytext=(best_k, max_f1 * 0.8), arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=7),
                 horizontalalignment='center', verticalalignment='top')
    plt.gcf().savefig(filename)
def f1proc(df):
    g = df.groupby('order_id', sort=False)
    done = []
    for i, (oid, subset) in enumerate(g):
        s = subset.sort_values('reordered_prob', ascending=False)
        if len(s) > 1:
            k, usenone, exp = F1Optimizer.maximize_expectation(s.reordered_prob.values)
        else:
            # single-candidate orders: fall back to a fixed probability cutoff
            k = 1 if s.reordered_prob.values[0] > .205 else 0
            usenone = s.reordered_prob.values[0] <= .205
        #raw.loc[s.iloc[0:k].index, 'reordered_f1o'] = 1
        # build the products string from a list so that 'None'-only predictions
        # do not pick up a trailing space
        parts = (['None'] if usenone else []) + [str(v) for v in s.iloc[0:k].product_id.values]
        done.append((oid, ' '.join(parts), len(s), k, usenone))
    return pd.DataFrame(done, columns=['order_id', 'products', 'possible', 'k', 'usenone']).set_index('order_id')
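A minimal sanity check of the optimizer and the per-order wrapper, using made-up posteriors (toy data, not part of the pipeline; np and pd are imported above):

probs = np.array([0.9, 0.3, 0.1])
best_k, pred_none, max_f1 = F1Optimizer.maximize_expectation(probs)
print(best_k, pred_none, max_f1)

toy = pd.DataFrame({'order_id': [1, 1, 2],
                    'product_id': ['10', '20', '30'],
                    'reordered_prob': [0.9, 0.3, 0.1]})
print(f1proc(toy))  # order 2 has a single low-probability candidate, so it becomes 'None'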
In [27]:
def raw_f1proc(raw):
    raw_orderids = raw.order_id.unique()
    folds = []
    # split the orders into 48 chunks and score them on 12 worker processes
    for s in np.array_split(raw_orderids, 48):
        folds.append(raw[raw.order_id.isin(s)])
    p = Pool(12)
    rv = p.map(f1proc, folds)
    df_trial = pd.concat(rv).sort_index()
    #df_trial.to_csv('0809-f1-local_1b.csv')
    return df_trial
In [5]:
def compare_results(df_gt, df_preds, return_scores=False, addnoneonone=False):
    df_gt_cut = df_gt.loc[df_preds.index]
    df_predsa = df_preds.copy().sort_index()
    f1 = []
    for gt, pred in zip(df_gt_cut.sort_index().products, df_preds.sort_index().products):
        lgt = gt.replace("None", "-1").split(' ')
        lpred = pred.replace("None", "-1").split(' ')
        if addnoneonone and len(lpred) == 1 and lpred[0] != "-1":
            lpred.append("-1")
        rr = np.intersect1d(lgt, lpred)
        # np.float is deprecated; Python 3 true division suffices
        precision = len(rr) / len(lpred)
        recall = len(rr) / len(lgt)
        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
    if return_scores:
        df_predsa['f1'] = f1
        df_predsa['products_gt'] = df_gt_cut.sort_index().products
        return np.mean(f1), df_predsa
    else:
        return np.mean(f1)
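To make the per-order metric concrete, a tiny hand-checkable example (hypothetical frames): order 1 scores precision 1 and recall 0.5, so F1 = 2/3; order 2 is an exact 'None' match:

gt = pd.DataFrame({'products': ['10 20', 'None']},
                  index=pd.Index([1, 2], name='order_id'))
pr = pd.DataFrame({'products': ['10', 'None']},
                  index=pd.Index([1, 2], name='order_id'))
print(compare_results(gt, pr))  # (2/3 + 1) / 2 ≈ 0.8333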
In [6]:
def xgb_cv(X_train, y_train, X_val, y_val, features_to_use, rounds=2500):
    #d_train = xgboost.DMatrix(X_train.drop(['user_id', 'product_id', 'order_id'], axis=1), y_train)
    #d_val = xgboost.DMatrix(X_val.drop(['user_id', 'product_id', 'order_id'], axis=1), y_val)
    d_train = xgboost.DMatrix(X_train[features_to_use], y_train)
    xgb_params = {
        "objective": "reg:logistic",
        "eval_metric": "logloss",
        "eta": 0.04,
        "max_depth": 8,
        "min_child_weight": 10,
        "gamma": 0.70,
        "subsample": 0.76,
        "colsample_bytree": 0.95,
        "alpha": 2e-05,
        "lambda": 10,
        "nthread": 12
    }
    if y_val is not None:
        d_val = xgboost.DMatrix(X_val[features_to_use], y_val)
        watchlist = [(d_train, "train"), (d_val, 'valid')]
        bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=rounds, evals=watchlist,
                            verbose_eval=10, early_stopping_rounds=30)
        preds = bst.predict(d_val, ntree_limit=bst.best_ntree_limit)
    else:
        d_val = xgboost.DMatrix(X_val[features_to_use])
        watchlist = [(d_train, "train")]
        bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=rounds, evals=watchlist,
                            verbose_eval=10)
        # without early stopping there is no best_ntree_limit; use all trees
        preds = bst.predict(d_val)
    return bst, preds
In [7]:
def raw_to_dfcv(df, lim=.21):
    g = df[df.reordered_prob > lim].groupby('order_id', sort=False)
    df_preds = g[['product_id']].agg(lambda x: ' '.join(set(x)))
    # complete (but empty) test df
    df_out = pd.DataFrame(index=df.order_id.unique())
    df_out.index.name = 'order_id'
    df_out['products'] = ['None'] * len(df_out)
    # combine empty output df with predictions
    df_out.loc[df_preds.index, 'products'] = df_preds.product_id
    df_out.sort_index(inplace=True)
    return df_out

def build_dfcv(X, preds, y=None, lim=.21):
    df = X[['user_id', 'product_id', 'order_id']].copy()
    if y is not None:
        df['reordered_gt'] = y
    df['reordered_prob'] = preds
    df['reordered'] = (preds > lim).astype(int)
    df.product_id = df.product_id.astype(str)
    g = df[df.reordered == 1].groupby('order_id', sort=False)
    df_preds = g[['product_id']].agg(lambda x: ' '.join(set(x)))
    # complete (but empty) test df
    df_out = pd.DataFrame(index=X.order_id.unique())
    df_out.index.name = 'order_id'
    df_out['products'] = ['None'] * len(df_out)
    # combine empty output df with predictions
    df_out.loc[df_preds.index, 'products'] = df_preds.product_id
    df_out.sort_index(inplace=True)
    return df, df_out
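Illustrative call (same toy frame as before, default lim=.21): candidates above the threshold are joined into the products string; orders with none keep the default 'None':

toy = pd.DataFrame({'order_id': [1, 1, 2],
                    'product_id': ['10', '20', '30'],
                    'reordered_prob': [0.9, 0.3, 0.1]})
print(raw_to_dfcv(toy))  # order 1 -> '10 20' (set order may vary), order 2 -> 'None'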
In [8]:
def runcv(func, nfolds=4, lim=.21, shortrun=False, droptrain=None):
    df_cvfolds = []
    df_cvraw = []
    models = []
    for fold in range(nfolds):
        train_subset = train[train.user_id % nfolds != fold]
        if droptrain is not None:
            # https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
            train_subset = train_subset[~train_subset.order_id.isin(droptrain)]
        valid_subset = train[train.user_id % nfolds == fold]
        X_train = train_subset.drop('reordered', axis=1)
        y_train = train_subset.reordered
        X_val = valid_subset.drop('reordered', axis=1)
        y_val = valid_subset.reordered
        mdl, rawpreds = func(X_train, y_train, X_val, y_val, features_to_use)
        models.append(mdl)
        df_raw, df_preds = build_dfcv(X_val, rawpreds, lim=lim, y=y_val)
        df_cvraw.append(df_raw)
        df_cvfolds.append(df_preds)
        print(fold, compare_results(df_train_gt, df_cvfolds[-1]))
        if shortrun:
            break
    return models, df_cvraw, pd.concat(df_cvfolds).sort_index()
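The user_id % nfolds split above partitions users rather than rows, so all of a user's candidate products stay in one fold and never leak between train and validation. A quick illustration with hypothetical ids:

uids = np.arange(1, 9)
print({fold: uids[uids % 4 == fold].tolist() for fold in range(4)})
# {0: [4, 8], 1: [1, 5], 2: [2, 6], 3: [3, 7]}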
In [9]:
# from https://www.kaggle.com/waxbabi/light-gbm-benchmark-0-3692
def lightgbm_cv(X_train, y_train, X_val, y_val, features_to_use, rounds=800):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'auc'},
        'num_leaves': 256,
        'min_sum_hessian_in_leaf': 20,
        'max_depth': -12,  # any value <= 0 disables the depth limit in LightGBM
        'learning_rate': 0.05,
        'feature_fraction': 0.6,
        # 'bagging_fraction': 0.9,
        # 'bagging_freq': 3,
        'verbose': 1
    }
    d_train = lgb.Dataset(X_train[features_to_use], label=y_train)  #, categorical_feature=['product_id', 'aisle_id', 'department_id'])
    if y_val is not None:
        d_val = lgb.Dataset(X_val[features_to_use], label=y_val)  #, categorical_feature=['product_id', 'aisle_id', 'department_id'])
        bst = lgb.train(params, d_train, rounds, early_stopping_rounds=50, verbose_eval=True, valid_sets=d_val)
    else:
        params['metric'] = 'binary_logloss'
        bst = lgb.train(params, d_train, rounds, verbose_eval=10)
    preds = bst.predict(X_val[features_to_use], num_iteration=bst.best_iteration)
    return bst, preds
In [10]:
data = pd.read_pickle('testdata.pkl')
print(len(data))
In [11]:
data_user = pd.read_pickle('testdata_user.pkl')
data = pd.merge(data, data_user, on='user_id')
data_prod = pd.read_pickle('testdata_prod.pkl')
data = pd.merge(data, data_prod, on='product_id')
data_tmp = pd.read_pickle('testdata_aisle.pkl')
data = pd.merge(data, data_tmp, on='aisle_id')
data_tmp = pd.read_pickle('testdata_dept.pkl')
data = pd.merge(data, data_tmp, on='department_id')
In [12]:
#'testdata-user_reordered.pkl'
data_userr = pd.read_pickle('testdata-user_reordered.pkl')
data = pd.merge(data, data_userr, on='user_id')
In [13]:
data_tmp = pd.read_pickle('testdata_ud.pkl')
data = pd.merge(data, data_tmp, on=['user_id', 'department_id'])
In [14]:
data_userd = pd.read_pickle('testdata_user_dept.pkl')
data = pd.merge(data, data_userd, on='user_id')
In [15]:
data.up_time_prev1 = data.up_time_prev1.fillna(9999)
data.up_time_prev2 = data.up_time_prev2.fillna(9999)  # was filling from up_time_prev1 (copy-paste bug)
data.ud_time_prev1 = data.ud_time_prev1.fillna(9999)
data.ud_time_prev2 = data.ud_time_prev2.fillna(9999)  # same fix for the department-level feature
In [ ]:
# Generated from Faron's streak code
_pd_streak = pd.read_csv('_pd_streak.csv')
data = pd.merge(data, _pd_streak, on=['user_id', 'product_id'])
In [17]:
data['order_hourp12'] = (data.order_hour_of_day + 12) % 24
In [18]:
data['u_ordersize_prev1_ratio'] = data.u_ordersize_prev1 / data.u_num_products
data['u_ordersize_prev2_ratio'] = data.u_ordersize_prev2 / data.u_num_products
data['u_ordersize_prev3_ratio'] = data.u_ordersize_prev3 / data.u_num_products
In [19]:
X_test = data.loc[data.eval_set == "test",:].copy()
X_test.drop(['eval_set'], axis=1, inplace=True)
train = data.loc[data.eval_set == "train",:].copy()
train.drop(['eval_set'], axis=1, inplace=True)
train.loc[:, 'reordered'] = train.reordered.fillna(0)
In [20]:
features_to_use = list(train.columns)
features_to_use.remove('reordered')
features_to_use.remove('user_id')
features_to_use.remove('product_id')
features_to_use.remove('order_id')
#features_to_use.remove('u_ordersize_prev2')
#features_to_use.remove('u_ordersize_prev3')
features_to_use.remove('ud_reordered')
#features_to_use.remove('product_name')
In [ ]:
models, raw, preds = runcv(xgb_cv, shortrun=True, lim=.205)
df_f1 = raw_f1proc(raw[0])
rv, rs = compare_results(df_train_gt, df_f1, True, False)
rv
In [22]:
rv
In [23]:
models[0].best_ntree_limit
In [28]:
pickle.dump((models, raw, preds), open('0810-f1.pkl', 'wb'))
In [74]:
# Feature importance code
In [24]:
m = models[0]
scores = {}
classes = ['weight', 'gain', 'cover']
for t in classes:
    scores[t] = m.get_score(importance_type=t)
scorel = []
for k in scores['gain'].keys():
    scorel.append((k, *[scores[t][k] for t in classes]))
df_imp = pd.DataFrame(scorel, columns=['key', *classes]).set_index('key')
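For reference: XGBoost's 'weight' counts how often a feature is used to split, 'gain' is the average loss reduction of those splits, and 'cover' is the average number of samples they touch. A quick chart of the top features by gain (illustrative, assumes df_imp from above):

df_imp.sort_values('gain').tail(20)['gain'].plot.barh(figsize=(6, 8))
plt.tight_layout()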
In [25]:
df_imp.sort_values('gain', ascending=False)
In [21]:
def run_sub(func):
    X_train = train.drop('reordered', axis=1)
    y_train = train.reordered
    mdl, rawpreds = func(X_train, y_train, X_test, None, features_to_use, rounds=600)
    return mdl, rawpreds

model_test, raw_test = run_sub(xgb_cv)
In [22]:
X_presub = X_test[['user_id', 'product_id', 'order_id']].copy()
X_presub['reordered_prob'] = raw_test
X_presub.to_pickle('0813m1-dusty-subraw.pkl')
In [23]:
df_f1 = raw_f1proc(X_presub)
In [24]:
df_f1[['products']].to_csv('sub-0813-dusty.csv')
In [25]:
len(features_to_use)
In [51]:
xpa = X_presub.copy()
xp_imba = pd.read_pickle('sub-0810-sh1ng-pre.pkl')
xpa.sort_values(['user_id', 'product_id'], inplace=True)
xp_imba.sort_values(['user_id', 'product_id'], inplace=True)
xpb = pd.merge(xpa, xp_imba, on=['product_id', 'user_id', 'order_id'])
# blend: simple average of this model's probabilities and the sh1ng model's
xpb['reordered_prob'] = (xpb.reordered_prob_x + xpb.reordered_prob_y) / 2
In [61]:
df_f1m = raw_f1proc(xpb)
df_f1m[['products']].to_csv('sub-0813-dusty_merge2.csv')
In [62]:
df_f1m