XGB model

This model produced .4026 on the private LB - half of my #100 solution.


In [1]:
import gc
import pickle
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from multiprocessing import Pool

import matplotlib.pylab as plt
from datetime import datetime
from numba import jit

from operator import itemgetter

import lightgbm as lgb

import xgboost

In [2]:
df_train_gt = pd.read_csv('train.csv', index_col='order_id')

Faron's F1 code


In [3]:
'''
This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in
"Ye, N., Chai, K., Lee, W., and Chieu, H.  Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."

It solves argmax_(0 <= k <= n,[[None]]) E[F1(P,k,[[None]])]
with [[None]] being the indicator for predicting label "None"
given posteriors P = [p_1, p_2, ... , p_n], where p_1 > p_2 > ... > p_n
under label independence assumption by means of dynamic programming in O(n²).
'''


class F1Optimizer():
    def __init__(self):
        pass

    @staticmethod
    @jit
    def get_expectations(P, pNone=None):
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1. / (1. * i)
            DP_SNone[i] = 1. / (1. * i + 1)
        for k in range(n + 1)[::-1]:
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        return np.array(expectations[::-1]).T

    @staticmethod
    @jit
    def maximize_expectation(P, pNone=None):
        expectations = F1Optimizer.get_expectations(P, pNone)

        ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
        max_f1 = expectations[ix_max]

        predNone = True if ix_max[0] == 0 else False
        best_k = ix_max[1]

        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)


def print_best_prediction(P, pNone=None):
    print("Maximize F1-Expectation")
    print("=" * 23)
    P = np.sort(P)[::-1]
    n = P.shape[0]
    L = ['L{}'.format(i + 1) for i in range(n)]

    if pNone is None:
        print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
        pNone = (1.0 - P).prod()

    PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
    print("Posteriors: {} (n={})".format(PL, n))
    print("p(None|x)={}".format(pNone))

    opt = F1Optimizer.maximize_expectation(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += (L[:opt[0]])
    f1_max = opt[2]

    print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))


def save_plot(P, filename='expected_f1.png'):
    E_F1 = pd.DataFrame(F1Optimizer.get_expectations(P).T, columns=["/w None", "/wo None"])
    best_k, _, max_f1 = F1Optimizer.maximize_expectation(P)

    plt.style.use('ggplot')
    plt.figure()
    E_F1.plot()
    plt.title('Expected F1-Score for \n {}'.format("P = [{}]".format(",".join(map(str, P)))), fontsize=12)
    plt.xlabel('k')
    plt.xticks(np.arange(0, len(P) + 1, 1.0))
    plt.ylabel('E[F1(P,k)]')
    plt.plot([best_k], [max_f1], 'o', color='#000000', markersize=4)
    plt.annotate('max E[F1(P,k)] = E[F1(P,{})] = {:.5f}'.format(best_k, max_f1), xy=(best_k, max_f1),
                 xytext=(best_k, max_f1 * 0.8), arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=7),
                 horizontalalignment='center', verticalalignment='top')
    plt.gcf().savefig(filename)
    
def f1proc(df):
    g = df.groupby('order_id', sort=False)

    done = []
    for i, (oid, subset) in enumerate(g):
        s = subset.sort_values('reordered_prob', ascending=False)

        if len(s) > 1:
            k, usenone, exp = F1Optimizer.maximize_expectation(s.reordered_prob.values)
        else:
            k = 1 if s.reordered_prob.values[0] > .205 else 0
            usenone = False if s.reordered_prob.values[0] > .205 else True

        #raw.loc[s.iloc[0:k].index, 'reordered_f1o'] = 1

        ostr = ''
        ostr = 'None ' if usenone else ''
        ostr += ' '.join([str(v) for v in s.iloc[0:k].product_id.values])

        done.append((oid, ostr, len(s), k, usenone))
        
    return pd.DataFrame(done, columns=['order_id', 'products', 'possible', 'k', 'usenone']).set_index('order_id')

In [27]:
def raw_f1proc(raw):
    raw_orderids = raw.order_id.unique()

    folds = []
    for s in np.array_split(raw_orderids, 48):
        folds.append(raw[raw.order_id.isin(s)])

    p = Pool(12)
    rv = p.map(f1proc, folds)
    df_trial = pd.concat(rv).sort_index()

    #df_trial.to_csv('0809-f1-local_1b.csv')
    return df_trial

In [5]:
def compare_results(df_gt, df_preds, return_scores = False, addnoneonone = False):
    
    df_gt_cut = df_gt.loc[df_preds.index]
    df_predsa = df_preds.copy().sort_index()
    
    f1 = []
    for gt, pred in zip(df_gt_cut.sort_index().products, df_preds.sort_index().products):
        lgt = gt.replace("None", "-1").split(' ')
        lpred = pred.replace("None", "-1").split(' ')
        
        if addnoneonone and len(lpred) == 1 and lpred[0] != "-1":
            lpred.append("-1")
            

        rr = (np.intersect1d(lgt, lpred))
        precision = np.float(len(rr)) / len(lpred)
        recall = np.float(len(rr)) / len(lgt)

        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

    if return_scores:
        df_predsa['f1'] = f1
        df_predsa['products_gt'] = df_gt_cut.sort_index().products
        
        return(np.mean(f1), df_predsa)
    else:
        return(np.mean(f1))

Model+CV functions


In [6]:
def xgb_cv(X_train, y_train, X_val, y_val, features_to_use, rounds=2500):
    #d_train = xgboost.DMatrix(X_train.drop(['user_id', 'product_id', 'order_id'], axis=1), y_train)
    #d_val = xgboost.DMatrix(X_val.drop(['user_id', 'product_id', 'order_id'], axis=1), y_val)

    d_train = xgboost.DMatrix(X_train[features_to_use], y_train)

    xgb_params = {
        "objective"         : "reg:logistic"
        ,"eval_metric"      : "logloss"
        ,"eta"              : 0.04
        ,"max_depth"        : 8
        ,"min_child_weight" :10
        ,"gamma"            :0.70
        ,"subsample"        :0.76
        ,"colsample_bytree" :0.95
        ,"alpha"            :2e-05
        ,"lambda"           :10
        ,"nthread": 12
    }

    if y_val is not None:
        d_val = xgboost.DMatrix(X_val[features_to_use], y_val)
        watchlist= [(d_train, "train"), (d_val, 'valid')]
        bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=rounds, evals=watchlist, verbose_eval=10, early_stopping_rounds=30)
        preds = bst.predict(d_val, ntree_limit=bst.best_ntree_limit)
    else:
        d_val = xgboost.DMatrix(X_val[features_to_use])
        watchlist= [(d_train, "train")]
        bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=rounds, evals=watchlist, verbose_eval=10)
        preds = bst.predict(d_val, ntree_limit=bst.best_ntree_limit)
    
    return bst, preds

In [7]:
def raw_to_dfcv(df, lim = .21):
    g = df[df.reordered_prob > lim].groupby('order_id', sort=False)
    df_preds = g[['product_id']].agg(lambda x: ' '.join(set(x)))

    # complete (but empty) test df
    df_out = pd.DataFrame(index=df.order_id.unique())
    df_out.index.name = 'order_id'
    df_out['products'] = ['None'] * len(df_out)

    # combine empty output df with predictions
    df_out.loc[df_preds.index, 'products'] = df_preds.product_id
    df_out.sort_index(inplace=True)
    
    return df_out

def build_dfcv(X, preds, y = None, lim = .21):
    df = X[['user_id', 'product_id', 'order_id']].copy()

    if y is not None:
        df['reordered_gt'] = y
    
    df['reordered_prob'] = preds
    df['reordered'] = (preds > lim).astype(int)
    df.product_id = df.product_id.astype(str)

    g = df[df.reordered == 1].groupby('order_id', sort=False)
    df_preds = g[['product_id']].agg(lambda x: ' '.join(set(x)))
    #df_testpreds.head()

    # complete (but empty) test df
    df_out = pd.DataFrame(index=X.order_id.unique())
    df_out.index.name = 'order_id'
    df_out['products'] = ['None'] * len(df_out)

    # combine empty output df with predictions
    df_out.loc[df_preds.index, 'products'] = df_preds.product_id
    df_out.sort_index(inplace=True)
    
    return df, df_out

In [8]:
def runcv(func, nfolds = 4, lim = .21, shortrun = False, droptrain = None):
    df_cvfolds = []
    df_cvraw = []
    models = []
    
    for fold in range(nfolds):
        train_subset = train[train.user_id % nfolds != fold]
        if droptrain is not None:
            # https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
            train_subset = train_subset[~train_subset.order_id.isin(droptrain)]
            
        valid_subset = train[train.user_id % nfolds == fold]

        X_train = train_subset.drop('reordered', axis=1)
        y_train = train_subset.reordered

        X_val = valid_subset.drop('reordered', axis=1)
        y_val = valid_subset.reordered

        mdl, rawpreds = func(X_train, y_train, X_val, y_val, features_to_use)
        models.append(mdl)

        df_raw, df_preds = build_dfcv(X_val, rawpreds, lim = lim, y = y_val)
        
        df_cvraw.append(df_raw)
        df_cvfolds.append(df_preds)
        
        print(fold, compare_results(df_train_gt, df_cvfolds[-1]))
        
        if shortrun:
            break

    return models, df_cvraw, pd.concat(df_cvfolds).sort_index()

In [9]:
# from https://www.kaggle.com/waxbabi/light-gbm-benchmark-0-3692

def lightgbm_cv(X_train, y_train, X_val, y_val, features_to_use, rounds=800):

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'auc'},
        'num_leaves': 256,
        'min_sum_hessian_in_leaf':20,
        'max_depth': -12,
        'learning_rate': 0.05,
        'feature_fraction': 0.6,
        # 'bagging_fraction': 0.9,
        # 'bagging_freq': 3,
        'verbose': 1
    }
    
    d_train = lgb.Dataset(X_train[features_to_use], label=y_train)#, categorical_feature=['product_id', 'aisle_id', 'department_id'])
    
    if y_val is not None:
        d_val = lgb.Dataset(X_val[features_to_use], label=y_val)#, categorical_feature=['product_id', 'aisle_id', 'department_id'])
        bst = lgb.train(params, d_train, rounds, early_stopping_rounds=50, verbose_eval=True, valid_sets=d_val)
    else:
        params['metric'] = 'binary_logloss'
        bst = lgb.train(params, d_train, rounds, verbose_eval=10)
    
    
    preds = bst.predict(X_val[features_to_use], num_iteration=bst.best_iteration)
    
    return bst, preds

Create Train / Test


In [ ]:


In [10]:
data = pd.read_pickle('testdata.pkl')
print(len(data))


13307953

In [11]:
data_user = pd.read_pickle('testdata_user.pkl')
data = pd.merge(data, data_user, on='user_id')

data_prod = pd.read_pickle('testdata_prod.pkl')
data = pd.merge(data, data_prod, on='product_id')

data_tmp = pd.read_pickle('testdata_aisle.pkl')
data = pd.merge(data, data_tmp, on='aisle_id')

data_tmp = pd.read_pickle('testdata_dept.pkl')
data = pd.merge(data, data_tmp, on='department_id')

In [12]:
#'testdata-user_reordered.pkl'
data_userr = pd.read_pickle('testdata-user_reordered.pkl')
data = pd.merge(data, data_userr, on='user_id')

In [13]:
data_tmp = pd.read_pickle('testdata_ud.pkl')
data = pd.merge(data, data_tmp, on=['user_id', 'department_id'])

In [14]:
data_userd = pd.read_pickle('testdata_user_dept.pkl')
data = pd.merge(data, data_userd, on='user_id')

In [15]:
data.up_time_prev1 = data.up_time_prev1.fillna(9999)
data.up_time_prev2 = data.up_time_prev1.fillna(9999)

data.ud_time_prev1 = data.ud_time_prev1.fillna(9999)
data.ud_time_prev2 = data.ud_time_prev1.fillna(9999)

In [ ]:
# Generated from Farons streak code
_pd_streak = pd.read_csv('_pd_streak.csv')
data = pd.merge(data, _pd_streak, on=['user_id', 'product_id'])

In [17]:
data['order_hourp12'] = (data.order_hour_of_day + 12) % 24

In [18]:
data['u_ordersize_prev1_ratio'] = data.u_ordersize_prev1 / data.u_num_products
data['u_ordersize_prev2_ratio'] = data.u_ordersize_prev2 / data.u_num_products
data['u_ordersize_prev3_ratio'] = data.u_ordersize_prev3 / data.u_num_products

In [19]:
X_test = data.loc[data.eval_set == "test",:].copy()
X_test.drop(['eval_set'], axis=1, inplace=True)

train = data.loc[data.eval_set == "train",:].copy()
train.drop(['eval_set'], axis=1, inplace=True)
train.loc[:, 'reordered'] = train.reordered.fillna(0)

In [20]:
features_to_use = list(train.columns)
features_to_use.remove('reordered')
features_to_use.remove('user_id')
features_to_use.remove('product_id')
features_to_use.remove('order_id')

#features_to_use.remove('u_ordersize_prev2')
#features_to_use.remove('u_ordersize_prev3')

features_to_use.remove('ud_reordered')

#features_to_use.remove('product_name')

Single fold test (with F1 computation)


In [ ]:
models, raw, preds = runcv(xgb_cv, shortrun=True, lim=.205)

df_f1 = raw_f1proc(raw[0])
rv, rs = compare_results(df_train_gt, df_f1, True, False)
rv


[0]	train-logloss:0.686049	valid-logloss:0.686046
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[10]	train-logloss:0.62218	valid-logloss:0.622145
[20]	train-logloss:0.569136	valid-logloss:0.569078
[30]	train-logloss:0.524593	valid-logloss:0.524518
[40]	train-logloss:0.486878	valid-logloss:0.486791
[50]	train-logloss:0.45471	valid-logloss:0.454612
[60]	train-logloss:0.427139	valid-logloss:0.427038
[70]	train-logloss:0.403396	valid-logloss:0.40329
[80]	train-logloss:0.382877	valid-logloss:0.382769
[90]	train-logloss:0.3651	valid-logloss:0.364991
[100]	train-logloss:0.349665	valid-logloss:0.349555
[110]	train-logloss:0.336238	valid-logloss:0.336129
[120]	train-logloss:0.324537	valid-logloss:0.324432
[130]	train-logloss:0.314326	valid-logloss:0.314226
[140]	train-logloss:0.305407	valid-logloss:0.305314
[150]	train-logloss:0.2976	valid-logloss:0.297515
[160]	train-logloss:0.290777	valid-logloss:0.290703
[170]	train-logloss:0.284801	valid-logloss:0.284737
[180]	train-logloss:0.279565	valid-logloss:0.27951
[190]	train-logloss:0.27499	valid-logloss:0.274945
[200]	train-logloss:0.270985	valid-logloss:0.270952
[210]	train-logloss:0.267482	valid-logloss:0.267464
[220]	train-logloss:0.264424	valid-logloss:0.264419
[230]	train-logloss:0.261741	valid-logloss:0.261749
[240]	train-logloss:0.259387	valid-logloss:0.259411
[250]	train-logloss:0.257328	valid-logloss:0.257369
[260]	train-logloss:0.255524	valid-logloss:0.255581
[270]	train-logloss:0.253943	valid-logloss:0.254019
[280]	train-logloss:0.252564	valid-logloss:0.252659
[290]	train-logloss:0.251353	valid-logloss:0.251466
[300]	train-logloss:0.250294	valid-logloss:0.250426
[310]	train-logloss:0.249364	valid-logloss:0.249514
[320]	train-logloss:0.248547	valid-logloss:0.248715
[330]	train-logloss:0.247833	valid-logloss:0.248022
[340]	train-logloss:0.247198	valid-logloss:0.247407
[350]	train-logloss:0.246643	valid-logloss:0.246873
[360]	train-logloss:0.246155	valid-logloss:0.246408
[370]	train-logloss:0.24572	valid-logloss:0.245995
[380]	train-logloss:0.245336	valid-logloss:0.245635
[390]	train-logloss:0.244993	valid-logloss:0.245316
[400]	train-logloss:0.244684	valid-logloss:0.24503
[410]	train-logloss:0.244414	valid-logloss:0.244782
[420]	train-logloss:0.244162	valid-logloss:0.244556
[430]	train-logloss:0.243939	valid-logloss:0.244356
[440]	train-logloss:0.243741	valid-logloss:0.244184
[450]	train-logloss:0.243561	valid-logloss:0.244028
[460]	train-logloss:0.243398	valid-logloss:0.24389
[470]	train-logloss:0.243246	valid-logloss:0.243764
[480]	train-logloss:0.243107	valid-logloss:0.24365
[490]	train-logloss:0.24298	valid-logloss:0.243549
[500]	train-logloss:0.242859	valid-logloss:0.243455
[510]	train-logloss:0.242746	valid-logloss:0.243371
[520]	train-logloss:0.242637	valid-logloss:0.243293
[530]	train-logloss:0.242542	valid-logloss:0.243226
[540]	train-logloss:0.242443	valid-logloss:0.24316
[550]	train-logloss:0.242352	valid-logloss:0.243105
[560]	train-logloss:0.242261	valid-logloss:0.24305
[570]	train-logloss:0.242185	valid-logloss:0.243002
[580]	train-logloss:0.242102	valid-logloss:0.242951
[590]	train-logloss:0.242025	valid-logloss:0.242906
[600]	train-logloss:0.241958	valid-logloss:0.242867
[610]	train-logloss:0.241886	valid-logloss:0.242825
[620]	train-logloss:0.241815	valid-logloss:0.242786
[630]	train-logloss:0.241742	valid-logloss:0.242748
[640]	train-logloss:0.24168	valid-logloss:0.242718
[650]	train-logloss:0.241613	valid-logloss:0.242687
[660]	train-logloss:0.241553	valid-logloss:0.242657
[670]	train-logloss:0.241492	valid-logloss:0.242631
[680]	train-logloss:0.241436	valid-logloss:0.242603
[690]	train-logloss:0.241376	valid-logloss:0.242577
[700]	train-logloss:0.241321	valid-logloss:0.242556
[710]	train-logloss:0.241265	valid-logloss:0.242533
[720]	train-logloss:0.24121	valid-logloss:0.24251
[730]	train-logloss:0.241152	valid-logloss:0.242488
[740]	train-logloss:0.241098	valid-logloss:0.242466
[750]	train-logloss:0.241041	valid-logloss:0.242446
[760]	train-logloss:0.240982	valid-logloss:0.242425
[770]	train-logloss:0.240935	valid-logloss:0.242409
[780]	train-logloss:0.240881	valid-logloss:0.242393
[790]	train-logloss:0.240828	valid-logloss:0.242374
[800]	train-logloss:0.240782	valid-logloss:0.242361
[810]	train-logloss:0.240734	valid-logloss:0.242345
[820]	train-logloss:0.240683	valid-logloss:0.242328
[830]	train-logloss:0.240637	valid-logloss:0.242314
[840]	train-logloss:0.240588	valid-logloss:0.242302
[850]	train-logloss:0.240541	valid-logloss:0.242288
[860]	train-logloss:0.240495	valid-logloss:0.242277
[870]	train-logloss:0.240448	valid-logloss:0.242267
[880]	train-logloss:0.240398	valid-logloss:0.242251
[890]	train-logloss:0.240354	valid-logloss:0.242235
[900]	train-logloss:0.240308	valid-logloss:0.242222
[910]	train-logloss:0.240264	valid-logloss:0.242211
[920]	train-logloss:0.240221	valid-logloss:0.242199
[930]	train-logloss:0.240176	valid-logloss:0.242183
[940]	train-logloss:0.240135	valid-logloss:0.242173
[950]	train-logloss:0.240095	valid-logloss:0.242163
[960]	train-logloss:0.240051	valid-logloss:0.24215
[970]	train-logloss:0.240009	valid-logloss:0.242137
[980]	train-logloss:0.239965	valid-logloss:0.242123
[990]	train-logloss:0.239926	valid-logloss:0.242112
[1000]	train-logloss:0.239883	valid-logloss:0.2421
[1010]	train-logloss:0.239842	valid-logloss:0.242092
[1020]	train-logloss:0.2398	valid-logloss:0.24208
[1030]	train-logloss:0.239759	valid-logloss:0.242074
[1040]	train-logloss:0.239715	valid-logloss:0.242065
[1050]	train-logloss:0.239675	valid-logloss:0.242054
[1060]	train-logloss:0.239636	valid-logloss:0.242046
[1070]	train-logloss:0.239599	valid-logloss:0.242036
[1080]	train-logloss:0.23956	valid-logloss:0.242024
[1090]	train-logloss:0.23952	valid-logloss:0.242017
[1100]	train-logloss:0.239488	valid-logloss:0.24201
[1110]	train-logloss:0.239442	valid-logloss:0.241998
[1120]	train-logloss:0.239404	valid-logloss:0.241989
[1130]	train-logloss:0.23937	valid-logloss:0.241985
[1140]	train-logloss:0.239332	valid-logloss:0.241977
[1150]	train-logloss:0.239292	valid-logloss:0.241967
[1160]	train-logloss:0.239256	valid-logloss:0.241959
[1170]	train-logloss:0.239218	valid-logloss:0.241951
[1180]	train-logloss:0.239178	valid-logloss:0.241941
[1190]	train-logloss:0.239137	valid-logloss:0.241931
[1200]	train-logloss:0.239104	valid-logloss:0.241925
[1210]	train-logloss:0.239063	valid-logloss:0.241917
[1220]	train-logloss:0.239026	valid-logloss:0.24191
[1230]	train-logloss:0.238989	valid-logloss:0.241903
[1240]	train-logloss:0.238953	valid-logloss:0.241897
[1250]	train-logloss:0.238918	valid-logloss:0.241889
[1260]	train-logloss:0.238881	valid-logloss:0.241883
[1270]	train-logloss:0.238846	valid-logloss:0.241877
[1280]	train-logloss:0.238806	valid-logloss:0.24187

In [22]:
rv


Out[22]:
0.39723506748617704

In [23]:
models[0].best_ntree_limit


Out[23]:
455

In [28]:
pickle.dump((models, raw, preds), open('0810-f1.pkl', 'wb'))

In [74]:
# Feature importance code

In [24]:
m = models[0]

scores = {}
classes = ['weight', 'gain', 'cover'] 
for t in classes:
    scores[t] = m.get_score(importance_type=t)


scorel = []
for k in scores['gain'].keys():
    scorel.append((k, *[scores[t][k] for t in classes]))

df_imp = pd.DataFrame(scorel, columns=['key', *classes]).set_index('key')

In [25]:
df_imp.sort_values('gain', ascending=False)


Out[25]:
weight gain cover
key
order_streak 570 903.165645 70766.567882
up_ordergap_last 2122 853.173197 60164.787710
up_time_prev2 106 795.906525 35385.652084
up_time_prev1 1988 776.270640 34004.973004
up_order_rate 2313 651.877906 40933.736176
up_count 1105 101.098170 12570.782727
p_reordered_product_ratio 3112 95.740717 24101.221318
up_reorder_rate 2158 93.910497 17342.130510
days_since_prior_order 2747 48.861033 15923.544574
up_time_last 3057 48.227632 21965.014597
up_time_last_ratio 2288 47.727425 15123.815600
p_order_rate 1689 40.227089 13962.034531
p_reorder_rate 3145 32.725488 13723.192132
ud_time_last 2064 27.007561 11798.646463
u_reordered_product_ratio 3220 25.577547 15131.339615
ud_order_rate 1764 20.322651 14509.132286
p_users 2928 19.594838 14939.751117
a_order_rate 1154 18.899960 13238.240047
up_first_order 1143 18.385544 16273.193212
up_add_to_cart_order_mean 1303 17.224047 13568.103469
u_dpo_ratio 2354 16.998077 10418.436886
u_ordersize_prev1_ratio 2796 16.851476 10664.778793
ud_time_last_ratio 1302 16.791139 10021.460291
up_add_to_cart_order_inverted_mean 966 16.433716 14072.023405
u_ordersize_prev2_ratio 2757 15.476788 9884.061238
up_time_first 1678 15.473418 14182.431012
u_reordered_prev1 2650 15.445868 9343.146446
a_reordered_product_ratio 1027 14.114135 13242.240795
u_reordered_prev2 2384 13.972288 7298.527682
u_reordered_prev3 2117 13.961878 10401.479675
u_ordersize_prev3_ratio 2534 13.751472 10365.937169
u_ordersize_prev1 2504 13.693969 11572.690805
up_last_order 857 13.407516 27787.674823
u_ordersize_prev3 2335 12.843689 8772.334148
user_max_order 991 12.780362 12075.830346
u_ordersize_prev2 2417 12.521399 10487.482707
ud_add_to_cart_order_inverted_mean 380 12.453500 25059.480218
ud_time_prev1 1830 12.253063 12107.651343
order_dow 1382 12.193706 16116.251265
ud_count 857 12.185520 6629.856211
d_order_rate 540 12.118112 13730.128513
u_num_departments 1298 12.011481 19787.877849
ud_time_prev2 121 11.705786 9535.800309
d_up_count 696 11.591067 15251.629754
ud_reorder_rate 1846 11.540409 7253.982125
ud_time_first 1534 11.464914 9391.940663
u_num_products 1874 11.166436 9821.331869
ud_last_order 611 11.135305 16090.513473
order_hour_of_day 1924 11.094476 9591.544841
d_reordered_product_ratio 478 11.063888 15035.328124
p_add_to_cart_order_mean 1784 11.005289 11425.994480
ud_ordergap_last 325 10.874170 35682.918418
department_id 870 10.751851 5736.175643
a_up_count 1325 10.739129 12210.006230
ud_add_to_cart_order_mean 1823 10.685070 6682.064843
u_reordered_department_ratio 2443 10.512853 6751.980585
order_hourp12 1583 10.289704 5128.105178
ud_first_order 432 8.819392 14819.641462
aisle_id 1226 8.713172 8764.242251

In [21]:
def run_sub(func):
    X_train = train.drop('reordered', axis=1)
    y_train = train.reordered

    mdl, rawpreds = func(X_train, y_train, X_test, None, features_to_use, rounds=600)

    return mdl, rawpreds

model_test, raw_test = run_sub(xgb_cv)


[0]	train-logloss:0.665192
[10]	train-logloss:0.474961
[20]	train-logloss:0.375752
[30]	train-logloss:0.320164
[40]	train-logloss:0.288069
[50]	train-logloss:0.269314
[60]	train-logloss:0.25836
[70]	train-logloss:0.251963
[80]	train-logloss:0.248194
[90]	train-logloss:0.24596
[100]	train-logloss:0.244592
[110]	train-logloss:0.243724
[120]	train-logloss:0.24313
[130]	train-logloss:0.242709
[140]	train-logloss:0.242351
[150]	train-logloss:0.242087
[160]	train-logloss:0.241829
[170]	train-logloss:0.241636
[180]	train-logloss:0.24144
[190]	train-logloss:0.241249
[200]	train-logloss:0.241074
[210]	train-logloss:0.240909
[220]	train-logloss:0.240748
[230]	train-logloss:0.240594
[240]	train-logloss:0.240459
[250]	train-logloss:0.240304
[260]	train-logloss:0.240176
[270]	train-logloss:0.240047
[280]	train-logloss:0.239911
[290]	train-logloss:0.239771
[300]	train-logloss:0.23964
[310]	train-logloss:0.239516
[320]	train-logloss:0.239394
[330]	train-logloss:0.239276
[340]	train-logloss:0.239153
[350]	train-logloss:0.239035
[360]	train-logloss:0.238922
[370]	train-logloss:0.238813
[380]	train-logloss:0.238703
[390]	train-logloss:0.238595
[400]	train-logloss:0.238488
[410]	train-logloss:0.238381
[420]	train-logloss:0.23828
[430]	train-logloss:0.238167
[440]	train-logloss:0.238063
[450]	train-logloss:0.237974
[460]	train-logloss:0.237867
[470]	train-logloss:0.237774
[480]	train-logloss:0.237672
[490]	train-logloss:0.237574
[500]	train-logloss:0.237483
[510]	train-logloss:0.237385
[520]	train-logloss:0.237299
[530]	train-logloss:0.237209
[540]	train-logloss:0.237107
[550]	train-logloss:0.237011
[560]	train-logloss:0.236919
[570]	train-logloss:0.236819
[580]	train-logloss:0.23672
[590]	train-logloss:0.236628
[599]	train-logloss:0.236548

In [22]:
X_presub = X_test[['user_id', 'product_id', 'order_id']].copy()

X_presub['reordered_prob'] = raw_test

X_presub.to_pickle('0813m1-dusty-subraw.pkl')

In [23]:
df_f1 = raw_f1proc(X_presub)

In [24]:
df_f1[['products']].to_csv('sub-0813-dusty.csv')

In [25]:
len(features_to_use)


Out[25]:
59

Load in sh1ng model results, blend, and make submission


In [51]:
xpa = X_presub.copy()

xp_imba = pd.read_pickle('sub-0810-sh1ng-pre.pkl')

xpa.sort_values(['user_id', 'product_id'], inplace=True)

xp_imba.sort_values(['user_id', 'product_id'], inplace=True)

xpb = pd.merge(xpa, xp_imba, on=['product_id', 'user_id', 'order_id'])

xpb['reordered_prob'] = (xpb.reordered_prob_x + xpb.reordered_prob_y) / 2

In [61]:
df_f1m = raw_f1proc(xpb)
df_f1m[['products']].to_csv('sub-0813-dusty_merge2.csv')

In [62]:
df_f1m


Out[62]:
products possible k usenone
order_id
17 13107 21463 38777 47766 26429 21 5 False
34 16083 47766 39475 43504 2596 21137 47792 44663... 78 10 False
137 24852 38689 23794 41787 25890 2326 5134 68 7 False
182 5479 9337 39275 13629 47672 11520 32109 41149 ... 100 10 False
257 24852 49235 37646 27966 27104 39475 29837 3023... 57 13 False
313 45007 12779 46906 21903 30391 13198 28535 25890 53 8 False
353 35561 40688 33000 21137 48183 12 5 False
386 24852 39180 47766 45066 38281 42265 15872 2147... 55 21 False
414 20392 20564 27845 21230 21709 33320 44292 4472... 73 15 False
418 47766 40268 30489 5262 1503 13702 41950 38694 71 8 False
437 13176 27966 4589 47209 365 10132 1463 16797 30... 104 13 False
452 13176 13166 36735 27966 36606 40386 5262 28666... 117 29 False
474 None 21 0 True
492 24852 44632 21137 43086 46667 26604 28918 1483... 192 25 False
497 27275 31964 1831 39947 27 4 False
513 None 32806 49328 22039 16023 16 4 True
517 24852 37646 21903 44560 9387 48679 25890 13198... 121 18 False
604 12099 24852 19660 13176 16797 24838 31720 65 7 False
680 7743 30441 14836 47977 30353 7693 27275 9 7 False
758 19660 651 2 2 False
759 5612 24852 39527 47766 16959 14651 6770 15269 ... 134 22 False
887 24852 28593 25647 39758 28204 41290 49683 2210... 24 14 False
1161 47626 22935 44142 26209 31506 35 5 False
1195 None 13870 2707 22275 16 3 True
1304 24852 14218 22035 23909 42585 42265 17122 4506... 44 10 False
1408 46676 16428 47912 10163 43352 1158 31915 2091 ... 70 11 False
1513 29373 5385 26250 16556 10070 47 5 False
1564 None 20 0 True
1727 12218 40310 28710 13 3 False
1789 None 17 0 True
... ... ... ... ...
3419522 None 148 0 True
3419568 None 196 6184 11266 31651 47402 30 5 True
3419574 35527 49235 103 2 False
3419600 13996 2086 6287 29103 43 4 False
3419623 None 4461 49236 23 2 True
3419726 39877 43961 49412 38944 46616 13176 20995 1905... 47 13 False
3419729 None 39581 21137 13884 40164 49683 34915 25 6 True
3419732 13176 24838 4957 47209 22035 25931 13045 37646... 247 28 False
3419743 45051 13176 6184 43154 38928 37710 15 6 False
3419832 5450 27966 24852 260 11712 27845 45747 38694 2... 102 14 False
3419878 7916 21137 35221 30391 21903 47766 4920 26209 ... 99 13 False
3419910 27966 27360 5385 37927 44683 36 5 False
3419934 27845 47626 25146 26209 21497 19173 12614 12889 30 8 False
3420147 15290 5025 25771 19030 41729 21 5 False
3420168 44625 1883 8021 24561 27845 14881 35921 38383 ... 62 14 False
3420237 13176 5451 3896 21137 41192 27423 19354 8193 3... 68 13 False
3420327 36988 1940 5077 34234 18362 10673 151 6 False
3420360 21903 47652 47144 13176 40545 48679 1774 9430 ... 101 15 False
3420390 48720 28924 9407 30949 13176 47209 16254 7905 ... 75 12 False
3420443 24852 20842 47144 5077 18434 41950 21982 8518 ... 85 15 False
3420449 None 37687 48 1 True
3420544 28577 25466 47141 23909 43631 5782 49605 46284... 128 16 False
3420569 47766 26384 6774 24852 6948 5450 46667 16759 1... 48 13 False
3420651 39928 22035 16797 13870 42701 49175 33 6 False
3420702 9018 24852 41885 5803 10305 30 5 False
3420740 39146 35951 8174 49005 19660 13176 34429 46252... 311 15 False
3420877 21137 13176 13646 49111 27966 47209 27845 5161... 187 21 False
3420888 35951 10880 44632 7963 2361 46906 22935 43961 ... 42 9 False
3420989 46676 47766 35004 42450 13517 47229 45866 4321... 34 11 False
3421054 31231 11123 13375 18426 58 4 False

75000 rows × 4 columns