In [1]:
# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import gc
from tqdm import tqdm, tqdm_notebook

tqdm.pandas(desc="")

%load_ext ipycache

IDIR = 'input/'


print('loading prior')
priors = pd.read_csv(
    IDIR + 'order_products__prior.csv', 
    dtype=dict(
        order_id=np.int32,
        add_to_cart_order=np.int16,
        reordered=np.int8,
        product_id=np.int32
    )
)
print('loading train')
op_train = pd.read_csv(
    IDIR + 'order_products__train.csv',
    dtype=dict(reordered=np.int8, add_to_cart_order=np.int16)
)
op_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

print('loading orders')
eval_sets = ["prior", "train", "test"]
orders = pd.read_csv(
    IDIR + 'orders.csv',
    dtype=dict(
        order_dow=np.int8,
        order_hour_of_day=np.int8,
        order_number=np.int16,
        order_id=np.int32,
        user_id=np.int32,
        days_since_prior_order=np.float32,
    )
)
orders["eval_set"] = orders["eval_set"].apply(eval_sets.index).astype(np.int8)
orders.set_index('order_id', inplace=True, drop=False)

print('loading products')
products = pd.read_csv(
    IDIR + 'products.csv',
    dtype=dict(
        aisle_id=np.int16,
        department_id=np.int16,
        product_id=np.int32
    )
)
products.drop(['product_name'], axis=1, inplace=True)

departments = pd.read_csv(IDIR + 'departments.csv')
aisles = pd.read_csv(IDIR + 'aisles.csv')

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(op_train.shape, ', '.join(op_train.columns)))
print('Total departments: {}'.format(departments.shape[0]))
print('Total aisles: {}'.format(aisles.shape[0]))


/home/ubuntu/.venv/local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/ubuntu/.venv/local/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated since IPython 4.0. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/ubuntu/.venv/local/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")
loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
Total departments: 21
Total aisles: 134

In [2]:
# Week No
o1_gr = orders.groupby("user_id").agg({"days_since_prior_order": "cumsum"})
orders["user_weekno"] = (o1_gr["days_since_prior_order"] / 7).round().fillna(0)

orders = orders.merge(
    orders.groupby("user_id").agg({"user_weekno": "max"}).rename(
        columns={"user_weekno": "user_weekno_max"}
    ).reset_index(),
    on="user_id",
    how="left"
)

orders["user_weekno_rev"] = abs(orders.user_weekno_max - orders.user_weekno).astype(np.int8)
orders = orders.drop("user_weekno_max", axis=1)
gc.collect()


Out[2]:
104

Features

https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/35468

Here are some feature ideas that can help new participants get started and may be you will find something you have missed:


In [3]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors = priors.join(products, on='product_id', rsuffix='_')
priors.drop(['product_id_', 'order_id_'], inplace=True, axis=1)

User

  • Products purchased (total_distinct_items)
  • Orders made (nb_orders)
  • frequency and recency of orders (average_days_between_orders)
  • Aisle purchased from
  • Department purchased from
  • frequency and recency of reorders
  • tenure
  • mean order size (average basket)
  • etc.

In [4]:
usr = pd.DataFrame()
o_grouped = orders.groupby('user_id')
p_grouped = priors.groupby('user_id')

usr['average_days_between_orders'] = o_grouped.days_since_prior_order.mean()
usr['max_days_between_orders'] = o_grouped.days_since_prior_order.max()
usr['min_days_between_orders'] = o_grouped.days_since_prior_order.min()
usr['std_days_between_orders'] = o_grouped.days_since_prior_order.std()

usr["period"] = o_grouped.days_since_prior_order.fillna(0).sum()
usr['nb_orders'] = o_grouped.size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = p_grouped.size().astype(np.int16)
users['all_products'] = p_grouped['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users['reorders'] = p_grouped["reordered"].sum()
users['reorder_rate'] = (users.reorders / usr.nb_orders)

users = users.join(usr)
del usr, o_grouped, p_grouped
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
gc.collect()
print('user f', users.shape)


Out[4]:
91
user f (206208, 12)

In [5]:
def merge_user_features(df):
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_max_days_between_orders'] = df.user_id.map(users.max_days_between_orders)
    df['user_min_days_between_orders'] = df.user_id.map(users.min_days_between_orders)
    df['user_std_days_between_orders'] = df.user_id.map(users.std_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    df['user_period'] =  df.user_id.map(users.period)
    return df

Product

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean/std add_to_cart_order
  • Срок годности/время перепокупки
  • etc.

In [6]:
prods = pd.DataFrame()
p_grouped = priors.groupby(priors.product_id)

prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique()))
prods['users'] = p_grouped.user_id.unique().apply(len)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)

prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods, p_grouped
gc.collect()


Out[6]:
321

In [7]:
def merge_product_features(df):
    df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
    df['product_users'] = df.product_id.map(products.users).astype(np.float32)
    df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
    df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate).astype(np.float32)
    df['product_add_to_cart_order_mean'] = df.product_id.map(products.add_to_cart_order_mean).astype(np.float32)
    df['product_add_to_cart_order_std'] = df.product_id.map(products.add_to_cart_order_std).astype(np.float32)
    return df

Aisle

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean/std add_to_cart_order
  • Срок годности/время перепокупки
  • etc.

In [8]:
prods = pd.DataFrame()
p_grouped = priors.groupby(priors.aisle_id)

prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)

prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

aisles.set_index('aisle_id', drop=False, inplace=True)
aisles = aisles.join(prods)

del prods, p_grouped

In [9]:
def merge_aisle_features(df):
    df['aisle_orders'] = df.aisle_id.map(aisles.orders)
    df['aisle_users'] = df.aisle_id.map(aisles.users)
    df['aisle_order_freq'] = df.aisle_id.map(aisles.order_freq)
    df['aisle_reorders'] = df.aisle_id.map(aisles.reorders)
    df['aisle_reorder_rate'] = df.aisle_id.map(aisles.reorder_rate)
    df['aisle_add_to_cart_order_mean'] = df.aisle_id.map(aisles.add_to_cart_order_mean)
    df['aisle_add_to_cart_order_std'] = df.aisle_id.map(aisles.add_to_cart_order_std)
    return df

Department

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean add_to_cart_order
  • Срок годности/время перепокупки
  • etc.

In [10]:
prods = pd.DataFrame()
p_grouped = priors.groupby(priors.department_id)

prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)

prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

departments.set_index('department_id', drop=False, inplace=True)
departments = departments.join(prods)

del prods, p_grouped

In [11]:
def merge_department_features(df):
    df['department_orders'] = df.department_id.map(departments.orders)
    df['department_users'] = df.department_id.map(departments.users)
    df['department_order_freq'] = df.department_id.map(departments.order_freq)
    df['department_reorders'] = df.department_id.map(departments.reorders)
    df['department_reorder_rate'] = df.department_id.map(departments.reorder_rate)
    df['department_add_to_cart_order_mean'] = df.department_id.map(departments.add_to_cart_order_mean)
    df['department_add_to_cart_order_std'] = df.department_id.map(departments.add_to_cart_order_std)
    return df

User Product Interaction (UP)

  • purchases (nb_orders)
  • purchases ratio
  • reorders
  • Average position in cart
  • day since last purchase
  • average/min/max days between purchase
  • day since last purchase - срок годности
  • order since last purchase (UP_orders_since_last)
  • Latest one/two/three/four week features
  • etc.

In [12]:
%%cache userXproduct.pkl userXproduct

priors['z'] = priors.product_id + priors.user_id * 100000

d = dict()
for row in tqdm_notebook(priors.itertuples(), total=len(priors)):
    z = row.z
    if z not in d:
        d[z] = (
            1,
            (row.order_number, row.order_id),
            row.add_to_cart_order,
            row.reordered
        )
    else:
        d[z] = (
            d[z][0] + 1,
            max(d[z][1], (row.order_number, row.order_id)),
            d[z][2] + row.add_to_cart_order,
            d[z][3] + row.reordered
        )

priors.drop(['z'], axis=1, inplace=True)

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
gc.collect()
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'reorders']

userXproduct.nb_orders = userXproduct.nb_orders.astype(np.int16)
userXproduct.last_order_id = userXproduct.last_order_id.map(lambda x: x[1]).astype(np.int32)
userXproduct.sum_pos_in_cart = userXproduct.sum_pos_in_cart.astype(np.int16)
userXproduct.reorders = userXproduct.reorders.astype(np.int16)

print('user X product f', len(userXproduct))


[Saved variables userXproduct to file '/home/ubuntu/kaggle/instacart/userXproduct.pkl'.]
25134338/|/ 77%|| 25134338/32434489 [04:10<01:12, 100389.58it/s]
to dataframe (less memory)
user X product f 29302795
7

In [13]:
def merge_user_X_product_features(df):
    df['z'] = df.product_id + df.user_id * 100000
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders)
    df['UP_reorders'] = df.z.map(userXproduct.reorders)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    #df['UP_days_since_last'] = 
#     df['UP_delta_hour_vs_last'] = abs(
#         df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)
#     ).map(lambda x: min(x, 24-x)).astype(np.int8)
    
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    return df

User aisle interaction (UA)

  • purchases
  • reorders
  • day since last purchase
  • average/min/max days between purchase
  • order since last purchase
  • etc.

User department interaction (UD)

  • purchases
  • reorders
  • day since last purchase
  • average days between purchase
  • order since last purchase
  • etc.

User time interaction (UT)

  • user preferred day of week
  • user preferred time of day
  • similar features for products and aisles

Combine


In [14]:
### build list of candidate products to reorder, with features ###
train_index = set(op_train.index)

def features(selected_orders, labels_given=False):
    order_list = []
    product_list = []
    labels = []
    for row in tqdm_notebook(
        selected_orders.itertuples(), 
        total=len(selected_orders)
    ):
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [
                (order_id, product) in train_index 
                for product in user_products
            ]
        
    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})

    df.order_id = df.order_id.astype(np.int32)
    df.product_id = df.product_id.astype(np.int32)

    del order_list
    del product_list

    df['user_id'] = df.order_id.map(orders.user_id)
    df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
    df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)

    labels = np.array(labels, dtype=np.int8)

    print('user related features')
    df = merge_user_features(df)

    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df = merge_product_features(df)

    print('aisle related features')
    df = merge_aisle_features(df)

    print('department related features')
    df = merge_department_features(df)

    print('user_X_product related features')
    df = merge_user_X_product_features(df)

    return (df, labels)

In [15]:
# %%cache dataset.pkl df_train df_test labels
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 2]
train_orders = orders[orders.eval_set == 1]

df_train, labels = features(train_orders, labels_given=True)
del train_orders
gc.collect()
df_test, _ = features(test_orders, labels_given=False)
del test_orders
gc.collect()


split orders : train, test
user related features
order related features
product related features
aisle related features
department related features
user_X_product related features
---------------------------------------------------------------------------
InvalidIndexError                         Traceback (most recent call last)
<ipython-input-15-b2faa0d45cd1> in <module>()
      5 train_orders = orders[orders.eval_set == 1]
      6 
----> 7 df_train, labels = features(train_orders, labels_given=True)
      8 del train_orders
      9 gc.collect()

<ipython-input-14-5b32ef7a5939> in features(selected_orders, labels_given)
     54 
     55     print('user_X_product related features')
---> 56     df = merge_user_X_product_features(df)
     57 
     58     return (df, labels)

<ipython-input-13-5bbc2f2d3988> in merge_user_X_product_features(df)
      1 def merge_user_X_product_features(df):
      2     df['z'] = df.product_id + df.user_id * 100000
----> 3     df['UP_orders'] = df.z.map(userXproduct.nb_orders)
      4     df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders)
      5     df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)

/home/ubuntu/.venv/local/lib/python2.7/site-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2152         if isinstance(arg, Series):
   2153             # arg is a Series
-> 2154             indexer = arg.index.get_indexer(values)
   2155             new_values = algorithms.take_1d(arg._values, indexer)
   2156         else:

/home/ubuntu/.venv/local/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_indexer(self, target, method, limit, tolerance)
   2578             target = target.astype(object)
   2579             return this.get_indexer(target, method=method, limit=limit,
-> 2580                                     tolerance=tolerance)
   2581 
   2582         if not self.is_unique:

/home/ubuntu/.venv/local/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_indexer(self, target, method, limit, tolerance)
   2581 
   2582         if not self.is_unique:
-> 2583             raise InvalidIndexError('Reindexing only valid with uniquely'
   2584                                     ' valued Index objects')
   2585 

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

Train


In [28]:
f_to_use = [
    'user_total_orders', 'user_total_items', 'user_total_distinct_items',
    'user_average_days_between_orders', 'user_average_basket',
    'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
    'aisle_id', 'department_id', 'product_orders', 'product_reorders',
    'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
    'UP_average_pos_in_cart', 'UP_reorders', 'UP_orders_since_last',
    'UP_delta_hour_vs_last'
]

def feature_select(df):
#     return df[f_to_use]
    return df.drop(
        ["user_id", "order_id", "product_id"], 
        axis=1, errors="ignore"
    )

In [24]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98

def train(traindf, y):
    d_train = lgb.Dataset(
        feature_select(traindf),
        label=y,
        categorical_feature=['aisle_id', 'department_id']
    )

    model = lgb.train(params, d_train, ROUNDS)
    return model

In [17]:
model = train(df_train, labels)

Predict


In [25]:
def predict(model, df_test, TRESHOLD=0.19):
    ### build candidates list for test ###

    df_test['pred'] = model.predict(feature_select(df_test))
    # Нужно добавить https://www.kaggle.com/mmueller/f1-score-expectation-maximization-in-o-n/code
    d = dict()
    for row in df_test.itertuples():
        # Вот тут можно отрезать не по threshold, а с помощью модели определять кол-во покупок
        if row.pred > TRESHOLD:
            try:
                d[row.order_id] += ' ' + str(row.product_id)
            except KeyError:
                d[row.order_id] = str(row.product_id)

    for order_id in df_test.order_id:
        if order_id not in d:
            d[order_id] = 'None'

    sub = pd.DataFrame.from_dict(d, orient='index')
    sub.reset_index(inplace=True)
    sub.columns = ['order_id', 'products']
    return sub

In [19]:
sub = predict(model, df_test)
sub.to_csv('sub.csv', index=False)

CV


In [ ]:
lgb.cv(params, d_train, ROUNDS, nfold=5, verbose_eval=10)

In [20]:
%%cache df_train_gt.pkl df_train_gt

from functools import partial

products_raw = pd.read_csv(IDIR + 'products.csv')
# combine aisles, departments and products (left joined to products)
goods = pd.merge(
    left=pd.merge(
        left=products_raw, 
        right=departments, how='left'
    ), 
    right=aisles, how='left'
)
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower() 

# retype goods to reduce memory usage
goods.product_id = goods.product_id.astype(np.int32)
goods.aisle_id = goods.aisle_id.astype(np.int16)
goods.department_id = goods.department_id.astype(np.int8)

# initialize it with train dataset
train_details = pd.merge(
                left=op_train,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

# add order hierarchy
train_details = pd.merge(
                left=train_details,
                right=goods[['product_id', 
                             'aisle_id', 
                             'department_id']].apply(partial(pd.to_numeric, 
                                                             errors='ignore', 
                                                             downcast='integer')),
                how='left',
                on='product_id'
)

train_gtl = []

for uid, subset in train_details.groupby('user_id'):
    subset1 = subset[subset.reordered == 1]
    oid = subset.order_id.values[0]

    if len(subset1) == 0:
        train_gtl.append((oid, 'None'))
        continue

    ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
    # .strip is needed because join can have a padding space at the end
    train_gtl.append((oid, ostr.strip()))

del train_details
del goods
del products_raw

gc.collect()

df_train_gt = pd.DataFrame(train_gtl)

df_train_gt.columns = ['order_id', 'products']
df_train_gt.set_index('order_id', inplace=True)
df_train_gt.sort_index(inplace=True)


[Skipped the cell's code and loaded variables df_train_gt from file '/home/ubuntu/kaggle/instacart/df_train_gt.pkl'.]
54

In [26]:
from sklearn.model_selection import GroupKFold

def f1_score(cvpred):
    joined = df_train_gt.join(cvpred, rsuffix="_cv", how="inner")
    lgts = joined.products.replace("None", "-1").apply(lambda x: x.split(" ")).values
    lpreds = joined.products_cv.replace("None", "-1").apply(lambda x: x.split(" ")).values
    f1 = []
    for lgt, lpred in zip(lgts, lpreds):
        rr = (np.intersect1d(lgt, lpred))
        precision = np.float(len(rr)) / len(lpred)
        recall = np.float(len(rr)) / len(lgt)

        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
    return np.mean(f1)

def cv(threshold=0.22):
    gkf = GroupKFold(n_splits=5)

    scores = []
    for train_idx, test_idx in gkf.split(df_train.index, groups=df_train.user_id):
        dftrain = df_train.iloc[train_idx]
        dftest = df_train.iloc[test_idx]
        y = labels[train_idx]
        model = train(dftrain, y)
        pred = predict(model, dftest, threshold).set_index("order_id")
        f1 = f1_score(pred)
        print(f1)
        scores.append(f1)
        del dftrain
        del dftest
        gc.collect()

    return np.mean(scores), np.std(scores)

In [29]:
cv()


/home/ubuntu/.venv/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
0.219581467551
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-29-312aae3f6475> in <module>()
----> 1 cv()

<ipython-input-26-14e2a9f4ace5> in cv(threshold)
     23         dftest = df_train.iloc[test_idx]
     24         y = labels[train_idx]
---> 25         model = train(dftrain, y)
     26         pred = predict(model, dftest, threshold).set_index("order_id")
     27         f1 = f1_score(pred)

<ipython-input-24-32f13b5c7596> in train(traindf, y)
     18     )
     19 
---> 20     model = lgb.train(params, d_train, ROUNDS)
     21     return model

/home/ubuntu/.venv/local/lib/python2.7/site-packages/lightgbm/engine.pyc in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, callbacks)
    178                                     evaluation_result_list=None))
    179 
--> 180         booster.update(fobj=fobj)
    181 
    182         evaluation_result_list = []

/home/ubuntu/.venv/local/lib/python2.7/site-packages/lightgbm/basic.pyc in update(self, train_set, fobj)
   1368             _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
   1369                 self.handle,
-> 1370                 ctypes.byref(is_finished)))
   1371             self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
   1372             return is_finished.value == 1

KeyboardInterrupt: 

Подбираем threshold


In [18]:
for th in np.arange(0.18, 0.22, 0.01):
    print th
    print cv(threshold=th)
    print


0.18
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)

0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)

0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)

0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)

0.22
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-18-6c718121d7c6> in <module>()
      1 for th in np.arange(0.18, 0.22, 0.01):
      2     print th
----> 3     print cv(threshold=th)
      4     print

<ipython-input-14-8a57b9000d45> in cv(threshold)
      8         dftest = df_train.iloc[test_idx]
      9         y = labels[train_idx]
---> 10         model = train(dftrain, y)
     11         pred = predict(model, dftest, threshold).set_index("order_id")
     12         f1 = f1_score(pred)

<ipython-input-10-32f13b5c7596> in train(traindf, y)
     18     )
     19 
---> 20     model = lgb.train(params, d_train, ROUNDS)
     21     return model

/usr/local/lib/python2.7/site-packages/lightgbm-0.2-py2.7.egg/lightgbm/engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, callbacks)
    178                                     evaluation_result_list=None))
    179 
--> 180         booster.update(fobj=fobj)
    181 
    182         evaluation_result_list = []

/usr/local/lib/python2.7/site-packages/lightgbm-0.2-py2.7.egg/lightgbm/basic.py in update(self, train_set, fobj)
   1368             _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
   1369                 self.handle,
-> 1370                 ctypes.byref(is_finished)))
   1371             self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
   1372             return is_finished.value == 1

KeyboardInterrupt: 

0.372658477911


In [17]:
0.18
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)

0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)

0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)


0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)


0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351

(0.37491390084571824, 0.001620734287706205)

0.374504880043
0.372459365153
0.374241429517
0.373332070018
0.370178093483
(0.37294316764289259, 0.0015591904647740879) 0.22
0.370290530162
0.369518178297
0.370515696117
0.369568282123
0.3673846793
(0.36945547319979183, 0.0011069090226251931) 0.24
0.363691285892
0.363725106289
0.363492700824
0.364412180878
0.363024994542
(0.36366925368510306, 0.00044761289123321511) 0.26


  File "<ipython-input-17-fd741fd5103b>", line 6
    (0.37491390084571824, 0.001620734287706205) 0.2
                                                  ^
SyntaxError: invalid syntax

Модель определения кол-ва покупок


In [3]:
prior_orders_count = priors[["order_id", "product_id"]].groupby("order_id").count()
prior_orders_count = prior_orders_count.rename(columns={"product_id": "product_counts"})

train_orders_count = op_train.drop(["product_id", "order_id"], axis=1, errors="ignore")
train_orders_count = train_orders_count.reset_index()[["order_id", "product_id"]].groupby("order_id").count()
train_orders_count = train_orders_count.rename(columns={"product_id": "product_counts"})

prior_orders_count = orders.join(prior_orders_count, how='inner')
train_orders_count = orders.join(train_orders_count, how='inner')
prior_orders_count.head(15)


Out[3]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order product_counts
order_id
2539329 2539329 1 prior 1 2 8 NaN 5
2398795 2398795 1 prior 2 3 7 15.0 6
473747 473747 1 prior 3 3 12 21.0 5
2254736 2254736 1 prior 4 4 7 29.0 5
431534 431534 1 prior 5 4 15 28.0 8
3367565 3367565 1 prior 6 2 7 19.0 4
550135 550135 1 prior 7 1 9 20.0 5
3108588 3108588 1 prior 8 1 14 14.0 6
2295261 2295261 1 prior 9 1 16 0.0 6
2550362 2550362 1 prior 10 4 8 30.0 9
2168274 2168274 2 prior 1 2 11 NaN 13
1501582 1501582 2 prior 2 5 10 10.0 6
1901567 1901567 2 prior 3 1 10 3.0 5
738281 738281 2 prior 4 2 10 8.0 13
1673511 1673511 2 prior 5 3 11 8.0 13

In [13]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error


def get_order_count(order, alpha=0.5):
    user_id = order["user_id"]
    df = prior_orders_count[prior_orders_count["user_id"] == user_id]
    feats = ["order_number", "order_dow", "order_hour_of_day", "days_since_prior_order"]
    X = df[feats].fillna(0).values
    y = df["product_counts"].values

    # create dataset for lightgbm
#     lgb_train = lgb.Dataset(X, y)
#     params = {
#         'task': 'train',
#         'boosting_type': 'gbdt',
#         'objective': 'regression',
#         'metric': {'rmse'},
#         'num_leaves': 100,
#         'learning_rate': 0.01,
#         'feature_fraction': 0.9,
#         'bagging_fraction': 0.8,
#         'bagging_freq': 5,
#         'verbose': 0,
#     }
#     clf = lgb.train(params,
#                     lgb_train,
#                     num_boost_round=40)

    xgb_params = {
        'max_depth': 5,
        'n_estimators': 200,
        'learning_rate': 0.05,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    dtrain_all = xgb.DMatrix(X, y)
    clf = xgb.train(xgb_params, dtrain_all, num_boost_round=400)

#     clf = Lasso(alpha=0.01)
#     clf.fit(X, y)

    Xpred = np.array([order[f] or 0 for f in feats]).reshape(1, -1)
    Xpred = np.nan_to_num(Xpred, 0)

    Xpred = xgb.DMatrix(Xpred)
    return clf.predict(Xpred)[0]

df = train_orders_count.head(10000)
df["pred_products_count"] = df.apply(get_order_count, axis=1)

print(mean_squared_error(
    df["product_counts"],
    df["pred_products_count"]
))


46.9945832987
/home/ubuntu/.venv/lib/python2.7/site-packages/ipykernel_launcher.py:51: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [ ]: