In [3]:
# Author : Paul-Antoine Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold

# some overhead because of kernel memory limits

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from datetime import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(desc="")

%load_ext ipycache
%load_ext cython

IDIR = 'input/'



In [ ]:
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv')
print('loading train')

op_train = pd.read_csv(
    IDIR + 'order_products__train.csv', 
    index_col=['order_id', 'product_id']
)
train_index = set(op_train.index)
# op_train is kept in memory: the None-handling and ground-truth cells below reuse it

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')
print('loading products')
products = pd.read_csv(IDIR + 'products.csv')

departments = pd.read_csv(IDIR + 'departments.csv', engine='c')
aisles = pd.read_csv(IDIR + 'aisles.csv', engine='c')

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
# print('train {}: {}'.format(op_train.shape, ', '.join(op_train.columns)))
print('Total departments: {}'.format(departments.shape[0]))
print('Total aisles: {}'.format(aisles.shape[0]))

In [2]:
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)
orders.set_index('order_id', inplace=True, drop=False)

products.drop(['product_name'], axis=1, inplace=True)
products.aisle_id = products.aisle_id.astype(np.int16)  # 134 aisles: int8 (max 127) would overflow
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)
products.set_index('product_id', drop=False, inplace=True)

# op_train.reordered = op_train.reordered.astype(np.int8)
# op_train.add_to_cart_order = op_train.add_to_cart_order.astype(np.int16)
# op_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)

Features

https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/35468

Here are some feature ideas that can help new participants get started; maybe you will find something you have missed:


In [3]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors = priors.join(products, on='product_id', rsuffix='_')
priors.drop(['product_id_', 'order_id_'], inplace=True, axis=1)

In [4]:
# Week number: cumulative days since the user's first order, bucketed into 7-day weeks
o1_gr = orders.sort_values(["user_id", "order_number"]).groupby("user_id").agg({"days_since_prior_order": "cumsum"})
orders["user_weekno"] = (o1_gr["days_since_prior_order"] / 7).round().fillna(0)
orders["user_days"] = o1_gr["days_since_prior_order"].fillna(0)

# orders = orders.merge(
#     orders.groupby("user_id").agg({
#         "user_weekno": "max",
#         "user_days": "max",
#     }).rename(
#         columns={
#             "user_weekno": "user_weekno_max",
#             "user_days": "user_days_max"
#         }
#     ).reset_index(),
#     on="user_id",
#     how="left"
# )

# orders["user_weekno_rev"] = abs(orders.user_weekno_max - orders.user_weekno).astype(np.int8)
# orders["user_days_rev"] = abs(orders.user_days_max - orders.user_days).astype(np.int16)
# orders = orders.drop(["user_weekno_max", "user_days_max"], axis=1)
del o1_gr
gc.collect()


Out[4]:
552

Product

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean/std add_to_cart_order
  • etc.

In [5]:
prods = pd.DataFrame()
p_grouped = priors.groupby("product_id")

prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = prods['orders'] / len(priors.order_id.unique())
prods['users'] = p_grouped.user_id.unique().apply(len)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean()
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std()

prods['reorders'] = p_grouped['reordered'].sum().astype(np.int32)
prods['reorders_max'] = p_grouped['reordered'].max().astype(np.int32)
prods['reorders_min'] = p_grouped['reordered'].min().astype(np.int32)
prods['reorders_mean'] = p_grouped['reordered'].mean().astype(np.float32)
prods['reorders_std'] = p_grouped['reordered'].std().astype(np.float32)

prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

products = products.join(prods)
del prods

User

  • Products purchased
  • Orders made
  • frequency and recency of orders
  • Aisle purchased from
  • Department purchased from
  • frequency and recency of reorders
  • tenure
  • mean order size
  • etc.

In [6]:
usr = pd.DataFrame()
o_grouped = orders.groupby('user_id')
p_grouped = priors.groupby('user_id')
usr['average_days_between_orders'] = o_grouped.days_since_prior_order.mean().astype(np.float32)
usr['max_days_between_orders'] = o_grouped.days_since_prior_order.max().astype(np.float32)
usr['min_days_between_orders'] = o_grouped.days_since_prior_order.min().astype(np.float32)
usr['std_days_between_orders'] = o_grouped.days_since_prior_order.std().astype(np.float32)

usr["period"] = o_grouped.days_since_prior_order.fillna(0).sum()
usr["weeks"] = o_grouped.user_weekno.fillna(0).max()
usr['nb_orders'] = o_grouped.size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = p_grouped.size().astype(np.int16)
users['all_products'] = p_grouped['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users['reorders'] = p_grouped["reordered"].sum().astype(np.int32)
users['reorders_max'] = p_grouped["reordered"].max().astype(np.int32)
users['reorders_min'] = p_grouped["reordered"].min().astype(np.int32)
users['reorders_mean'] = p_grouped["reordered"].mean().astype(np.float32)
users['reorders_std'] = p_grouped["reordered"].std().astype(np.float32)

users = users.join(usr)

users['reorder_rate'] = (users.reorders / users.nb_orders).astype(np.float32)
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
del usr
gc.collect()
print('user f', users.shape)


Out[6]:
91
user f (206209, 17)

Aisle

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean add_to_cart_order
  • etc.

In [7]:
prods = pd.DataFrame()
p_grouped = priors.groupby("aisle_id")

prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)

prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

aisles.set_index('aisle_id', drop=False, inplace=True)
aisles = aisles.join(prods)

del prods, p_grouped

Department

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean add_to_cart_order
  • etc.

In [8]:
prods = pd.DataFrame()
p_grouped = priors.groupby("department_id")

prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)

prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

departments.set_index('department_id', drop=False, inplace=True)
departments = departments.join(prods)

del prods, p_grouped

User Product Interaction (UP)

  • purchases
  • reorders
  • day since last purchase
  • order since last purchase
  • etc.

In [9]:
orders_last = orders[["order_id", "order_number", "user_id"]].rename(
    columns={"order_id": "last_order_id"}
)
orders_first = orders[["order_id", "order_number", "user_id"]].rename(
    columns={"order_id": "first_order_id"}
)

In [10]:
def flat_columns(df):
    # keep only the second level of the MultiIndex columns
    # produced by the dict-of-dicts .agg() calls below
    ind = pd.Index(["%s" % (e[1]) for e in df.columns.tolist()])
    df.columns = ind
    return df

In [11]:
priors['z'] = priors.product_id + priors.user_id * 100000  # product_id < 100000, so this (user, product) key is collision-free
userXproduct = priors.groupby(["z", "user_id"]).agg({
    "order_id": {"nb_orders": "count"},
    "order_number": {
        "last_order_number": "max", 
        "first_order_number": "min"
    },
    "add_to_cart_order": {
        "sum_add_to_cart_order": "sum",
        "min_add_to_cart_order": "min",
        "max_add_to_cart_order": "max",
        "mean_add_to_cart_order": "mean",
        "std_add_to_cart_order": "std"
    },
    "reordered": {
        "sum_reordered": "sum", 
        "mean_reordered": "mean", 
        "std_reordered": "std"
    }
})

userXproduct = flat_columns(userXproduct).reset_index()
userXproduct = userXproduct.merge(
    orders_last, 
    left_on=["user_id", "last_order_number"],
    right_on=["user_id", "order_number"]
).drop("order_number", axis=1)

userXproduct = userXproduct.merge(
    orders_first, 
    left_on=["user_id", "first_order_number"],
    right_on=["user_id", "order_number"]
).drop(["user_id", "order_number"], axis=1)
userXproduct.drop_duplicates(subset=["z"], inplace=True)
userXproduct.set_index("z", inplace=True)
# d = dict()
# for row in tqdm(priors.itertuples(), total=len(priors)):
#     z = row.z
#     if z not in d:
#         d[z] = (
#             1,
#             (row.order_number, row.order_id),
#             row.add_to_cart_order,
#             row.reordered
#         )
#     else:
#         d[z] = (
#             d[z][0] + 1,
#             max(d[z][1], (row.order_number, row.order_id)),
#             d[z][2] + row.add_to_cart_order,
#             d[z][3] + row.reordered
#         )

# # priors.drop(['z'], axis=1, inplace=True)

# print('to dataframe (less memory)')
# d = pd.DataFrame.from_dict(d, orient='index')
# d.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'reorders']
# d.nb_orders = d.nb_orders.astype(np.int16)
# d.last_order_id = d.last_order_id.map(lambda x: x[1]).astype(np.int32)
# d.sum_pos_in_cart = d.sum_pos_in_cart.astype(np.int16)
# d.reorders = d.reorders.astype(np.int16)
   
# userXproduct = d
gc.collect()
print('user X product f', len(userXproduct))


/home/ubuntu/.venv/local/lib/python2.7/site-packages/pandas/core/groupby.py:4036: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version
  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
Out[11]:
179
user X product f 13293564
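
The dict-of-dicts .agg() above is what triggers the FutureWarning in the output; on pandas >= 0.25 (not the Python 2 environment used here) the same table could be built with named aggregation. A sketch, not executed in this notebook:

In [ ]:
# Named-aggregation equivalent of the deprecated dict-of-dicts agg
# (requires pandas >= 0.25; a sketch, not run in this py2 environment)
userXproduct_alt = priors.groupby(["z", "user_id"]).agg(
    nb_orders=("order_id", "count"),
    last_order_number=("order_number", "max"),
    first_order_number=("order_number", "min"),
    sum_add_to_cart_order=("add_to_cart_order", "sum"),
    min_add_to_cart_order=("add_to_cart_order", "min"),
    max_add_to_cart_order=("add_to_cart_order", "max"),
    mean_add_to_cart_order=("add_to_cart_order", "mean"),
    std_add_to_cart_order=("add_to_cart_order", "std"),
    sum_reordered=("reordered", "sum"),
    mean_reordered=("reordered", "mean"),
    std_reordered=("reordered", "std"),
).reset_index()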

User aisle interaction (UA)

  • purchases
  • reorders
  • day since last purchase
  • order since last purchase
  • etc.

In [12]:
priors['z'] = priors.aisle_id + priors.user_id * 100000
userXaisle = priors.groupby(["z", "user_id"]).agg({
    "order_id": {"nb_orders": "count"},
    "order_number": {
        "last_order_number": "max", 
        "first_order_number": "min"
    },
    "add_to_cart_order": {
        "sum_add_to_cart_order": "sum",
        "min_add_to_cart_order": "min",
        "max_add_to_cart_order": "max",
        "mean_add_to_cart_order": "mean",
        "std_add_to_cart_order": "std"
    },
    "reordered": {
        "sum_reordered": "sum", 
        "mean_reordered": "mean", 
        "std_reordered": "std"
    }
})

userXaisle = flat_columns(userXaisle).reset_index()
userXaisle = userXaisle.merge(
    orders_last, 
    left_on=["user_id", "last_order_number"],
    right_on=["user_id", "order_number"]
).drop("order_number", axis=1)

userXaisle = userXaisle.merge(
    orders_first, 
    left_on=["user_id", "first_order_number"],
    right_on=["user_id", "order_number"]
).drop(["user_id", "order_number"], axis=1)
userXaisle.drop_duplicates(subset=["z"], inplace=True)
userXaisle.set_index("z", inplace=True)
gc.collect()
print('user X aisle f', len(userXaisle))


Out[12]:
90
user X aisle f 5729249

User department interaction (UD)

  • purchases
  • reorders
  • day since last purchase
  • order since last purchase
  • etc.

In [13]:
priors['z'] = priors.department_id + priors.user_id * 100000
userXdepartment = priors.groupby(["z", "user_id"]).agg({
    "order_id": {"nb_orders": "count"},
    "order_number": {
        "last_order_number": "max", 
        "first_order_number": "min"
    },
    "add_to_cart_order": {
        "sum_add_to_cart_order": "sum",
        "min_add_to_cart_order": "min",
        "max_add_to_cart_order": "max",
        "mean_add_to_cart_order": "mean",
        "std_add_to_cart_order": "std"
    },
    "reordered": {
        "sum_reordered": "sum", 
        "mean_reordered": "mean", 
        "std_reordered": "std"
    }
})

userXdepartment = flat_columns(userXdepartment).reset_index()
userXdepartment = userXdepartment.merge(
    orders_last, 
    left_on=["user_id", "last_order_number"],
    right_on=["user_id", "order_number"]
).drop("order_number", axis=1)

userXdepartment = userXdepartment.merge(
    orders_first, 
    left_on=["user_id", "first_order_number"],
    right_on=["user_id", "order_number"]
).drop(["user_id", "order_number"], axis=1)
userXdepartment.drop_duplicates(subset=["z"], inplace=True)
userXdepartment.set_index("z", inplace=True)
gc.collect()
print('user X department f', len(userXdepartment))


Out[13]:
225
user X department f 2232789

User time interaction (UT)

  • user preferred day of week
  • user preferred time of day
  • similar features for products and aisles (see the sketch below)
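
The UT features are not implemented elsewhere in this notebook; here is a minimal sketch of how the preferred day/hour could be derived from orders (the column names user_dow_mode and user_hour_mode are ad hoc):

In [ ]:
# Sketch: modal order day / hour per user; the names user_dow_mode and
# user_hour_mode are ad hoc, nothing downstream depends on them
ut = orders.groupby("user_id").agg({
    "order_dow": lambda s: s.value_counts().idxmax(),
    "order_hour_of_day": lambda s: s.value_counts().idxmax(),
}).rename(columns={"order_dow": "user_dow_mode",
                   "order_hour_of_day": "user_hour_mode"})
# users = users.join(ut)  # could then be merged into the user feature table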

Combine


In [5]:
### build list of candidate products to reorder, with features ###
train_index = set(op_train.index)

def features(selected_orders, labels_given=False):
    order_list = []
    product_list = []
    labels = []
    for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
        order_id = row.order_id
        user_id = row.user_id
        user_products = list(users.all_products[user_id])
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [
                (order_id, product) in train_index 
                for product in user_products
            ]
        
    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})
    df.order_id = df.order_id.astype(np.int32)
    df.product_id = df.product_id.astype(np.int32)
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int16)  # 134 aisles would overflow int8
    df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)

    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_max_days_between_orders'] = df.user_id.map(users.max_days_between_orders)
    df['user_min_days_between_orders'] = df.user_id.map(users.min_days_between_orders)
    df['user_std_days_between_orders'] = df.user_id.map(users.std_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)

    df['user_reorders'] =  df.user_id.map(users.reorders)
    df['user_reorders_max'] =  df.user_id.map(users.reorders_max)
    df['user_reorders_min'] =  df.user_id.map(users.reorders_min)
    df['user_reorders_mean'] =  df.user_id.map(users.reorders_mean)
    df['user_reorders_std'] =  df.user_id.map(users.reorders_std)
    df['user_reorder_rate'] =  df.user_id.map(users.reorder_rate)
    df['user_period'] =  df.user_id.map(users.period)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
    df['product_users'] = df.product_id.map(products.users).astype(np.float32)
    df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)

    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorders_max'] = df.product_id.map(products.reorders_max)
    df['product_reorders_min'] = df.product_id.map(products.reorders_min)
    df['product_reorders_mean'] = df.product_id.map(products.reorders_mean)
    df['product_reorders_std'] = df.product_id.map(products.reorders_std)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    df['product_add_to_cart_order_mean'] = df.product_id.map(products.add_to_cart_order_mean).astype(np.float32)
    df['product_add_to_cart_order_std'] = df.product_id.map(products.add_to_cart_order_std).astype(np.float32)

    print('aisle related features')
    df['aisle_orders'] = df.aisle_id.map(aisles.orders)
    df['aisle_users'] = df.aisle_id.map(aisles.users)
    df['aisle_order_freq'] = df.aisle_id.map(aisles.order_freq)
    df['aisle_reorders'] = df.aisle_id.map(aisles.reorders)
    df['aisle_reorder_rate'] = df.aisle_id.map(aisles.reorder_rate)
    df['aisle_add_to_cart_order_mean'] = df.aisle_id.map(aisles.add_to_cart_order_mean)
    df['aisle_add_to_cart_order_std'] = df.aisle_id.map(aisles.add_to_cart_order_std)
    
    print('department related features')
    df['department_orders'] = df.department_id.map(departments.orders)
    df['department_users'] = df.department_id.map(departments.users)
    df['department_order_freq'] = df.department_id.map(departments.order_freq)
    df['department_reorders'] = df.department_id.map(departments.reorders)
    df['department_reorder_rate'] = df.department_id.map(departments.reorder_rate)
    df['department_add_to_cart_order_mean'] = df.department_id.map(departments.add_to_cart_order_mean)
    df['department_add_to_cart_order_std'] = df.department_id.map(departments.add_to_cart_order_std)

    print('user_X_product related features')
    df['z'] = df.product_id + df.user_id * 100000
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_first_order_id'] = df.z.map(userXproduct.first_order_id)
    
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_add_to_cart_order) / df.UP_orders).astype(np.float32)
    df['UP_sum_add_to_cart_order'] = df.z.map(userXproduct.sum_add_to_cart_order)
    df['UP_min_add_to_cart_order'] = df.z.map(userXproduct.min_add_to_cart_order)
    df['UP_mean_add_to_cart_order'] = df.z.map(userXproduct.mean_add_to_cart_order)
    df['UP_max_add_to_cart_order'] = df.z.map(userXproduct.max_add_to_cart_order)
    df['UP_std_add_to_cart_order'] = df.z.map(userXproduct.std_add_to_cart_order)

    df['UP_sum_reordered'] = df.z.map(userXproduct.sum_reordered)
    df['UP_mean_reordered'] = df.z.map(userXproduct.mean_reordered)
    df['UP_std_reordered'] = df.z.map(userXproduct.std_reordered)
    df['UP_reorders_rate'] = (df.UP_sum_reordered / df.UP_orders).astype(np.float32)

    df['UP_last_order_number'] = df.UP_last_order_id.map(orders.order_number)
    df['UP_first_order_number'] = df.UP_first_order_id.map(orders.order_number)
    df['UP_last_order_number_prc'] = (df.UP_last_order_number / df.user_total_orders).astype(np.float32)
    df['UP_first_order_number_prc'] = (df.UP_first_order_number / df.user_total_orders).astype(np.float32)

    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_number
    df['UP_orders_rate_since_first_order'] = df.UP_orders / (df.user_total_orders - df.UP_first_order_number + 1)
    
    df['UP_weeks_sinse_last'] = df.UP_last_order_id.map(orders.user_weekno) - df.order_id.map(orders.user_weekno)
    df['UP_days_sinse_last'] = df.UP_last_order_id.map(orders.user_days) - df.order_id.map(orders.user_days)
    
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
                  df.UP_last_order_id.map(orders.order_hour_of_day)).map(
        lambda x: min(x, 24-x)
    ).astype(np.int8)

    print('user_X_aisle related features')
    df['z'] = df.aisle_id + df.user_id * 100000
    df['UA_orders'] = df.z.map(userXaisle.nb_orders)
    df['UA_orders_ratio'] = (df.UA_orders / df.user_total_orders).astype(np.float32)
    df['UA_last_order_id'] = df.z.map(userXaisle.last_order_id)
    df['UA_first_order_id'] = df.z.map(userXaisle.first_order_id)
    
    df['UA_average_pos_in_cart'] = (df.z.map(userXaisle.sum_add_to_cart_order) / df.UA_orders).astype(np.float32)
    df['UA_sum_add_to_cart_order'] = df.z.map(userXaisle.sum_add_to_cart_order)
    df['UA_min_add_to_cart_order'] = df.z.map(userXaisle.min_add_to_cart_order)
    df['UA_mean_add_to_cart_order'] = df.z.map(userXaisle.mean_add_to_cart_order)
    df['UA_max_add_to_cart_order'] = df.z.map(userXaisle.max_add_to_cart_order)
    df['UA_std_add_to_cart_order'] = df.z.map(userXaisle.std_add_to_cart_order)

    df['UA_sum_reordered'] = df.z.map(userXaisle.sum_reordered)
    df['UA_mean_reordered'] = df.z.map(userXaisle.mean_reordered)
    df['UA_std_reordered'] = df.z.map(userXaisle.std_reordered)
    df['UA_reorders_rate'] = (df.UA_sum_reordered / df.UA_orders).astype(np.float32)

    df['UA_last_order_number'] = df.UA_last_order_id.map(orders.order_number)
    df['UA_first_order_number'] = df.UA_first_order_id.map(orders.order_number)
    df['UA_last_order_number_prc'] = (df.UA_last_order_number / df.user_total_orders).astype(np.float32)
    df['UA_first_order_number_prc'] = (df.UA_first_order_number / df.user_total_orders).astype(np.float32)

    df['UA_orders_since_last'] = df.user_total_orders - df.UA_last_order_number
    df['UA_orders_rate_since_first_order'] = df.UA_orders / (df.user_total_orders - df.UA_first_order_number + 1)
    
    df['UA_weeks_sinse_last'] = df.UA_last_order_id.map(orders.user_weekno) - df.order_id.map(orders.user_weekno)
    df['UA_days_sinse_last'] = df.UA_last_order_id.map(orders.user_days) - df.order_id.map(orders.user_days)
    
    df['UA_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
                  df.UA_last_order_id.map(orders.order_hour_of_day)).map(
        lambda x: min(x, 24-x)
    ).astype(np.int8)

    print('user_X_department related features')
    df['z'] = df.department_id + df.user_id * 100000
    df['UD_orders'] = df.z.map(userXdepartment.nb_orders)
    df['UD_orders_ratio'] = (df.UD_orders / df.user_total_orders).astype(np.float32)
    df['UD_last_order_id'] = df.z.map(userXdepartment.last_order_id)
    df['UD_first_order_id'] = df.z.map(userXdepartment.first_order_id)
    
    df['UD_average_pos_in_cart'] = (df.z.map(userXdepartment.sum_add_to_cart_order) / df.UD_orders).astype(np.float32)
    df['UD_sum_add_to_cart_order'] = df.z.map(userXdepartment.sum_add_to_cart_order)
    df['UD_min_add_to_cart_order'] = df.z.map(userXdepartment.min_add_to_cart_order)
    df['UD_mean_add_to_cart_order'] = df.z.map(userXdepartment.mean_add_to_cart_order)
    df['UD_max_add_to_cart_order'] = df.z.map(userXdepartment.max_add_to_cart_order)
    df['UD_std_add_to_cart_order'] = df.z.map(userXdepartment.std_add_to_cart_order)

    df['UD_sum_reordered'] = df.z.map(userXdepartment.sum_reordered)
    df['UD_mean_reordered'] = df.z.map(userXdepartment.mean_reordered)
    df['UD_std_reordered'] = df.z.map(userXdepartment.std_reordered)
    df['UD_reorders_rate'] = (df.UD_sum_reordered / df.UD_orders).astype(np.float32)

    df['UD_last_order_number'] = df.UD_last_order_id.map(orders.order_number)
    df['UD_first_order_number'] = df.UD_first_order_id.map(orders.order_number)
    df['UD_last_order_number_prc'] = (df.UD_last_order_number / df.user_total_orders).astype(np.float32)
    df['UD_first_order_number_prc'] = (df.UD_first_order_number / df.user_total_orders).astype(np.float32)

    df['UD_orders_since_last'] = df.user_total_orders - df.UD_last_order_number
    df['UD_orders_rate_since_first_order'] = df.UD_orders / (df.user_total_orders - df.UD_first_order_number + 1)
    
    df['UD_weeks_sinse_last'] = df.UD_last_order_id.map(orders.user_weekno) - df.order_id.map(orders.user_weekno)
    df['UD_days_sinse_last'] = df.UD_last_order_id.map(orders.user_days) - df.order_id.map(orders.user_days)
    
    df['UD_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
                  df.UD_last_order_id.map(orders.order_hour_of_day)).map(
        lambda x: min(x, 24-x)
    ).astype(np.int8)

    df.drop([
        'UP_last_order_id', 'UP_first_order_id', 
        'UA_last_order_id', 'UA_first_order_id', 
        'UD_last_order_id', 'UD_first_order_id', 
        'z'], axis=1, inplace=True
    )

    gc.collect()
    return (df, labels)



In [15]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

df_train, labels = features(train_orders, labels_given=True)
df_test, _ = features(test_orders)

del test_orders, train_orders


split orders : train, test
100%|██████████| 131209/131209 [00:10<00:00, 12301.46it/s]
user related features
order related features
product related features
aisle related features
department related features
user_X_product related features
user_X_aisle related features
user_X_department related features
100%|██████████| 75000/75000 [00:02<00:00, 31517.44it/s]
user related features
order related features
product related features
aisle related features
department related features
user_X_product related features
user_X_aisle related features
user_X_department related features

None handling Model


In [13]:
### build list of candidate products to reorder, with features ###
df = op_train.groupby("order_id").agg({"reordered": "sum"})
df["reordered"] = df["reordered"].apply(lambda x: 1 if x == 0 else 0)
none_labels = df["reordered"].to_dict()

def none_features(selected_orders, labels_given=False):
    order_list = []
    labels = []
    for order_id in tqdm(selected_orders, total=len(selected_orders)):
        order_list += [order_id]
        if labels_given:
            labels += [none_labels[order_id]]
        
    df = pd.DataFrame({'order_id': order_list})
    df.order_id = df.order_id.astype(np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    df['user_period'] =  df.user_id.map(users.period)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    gc.collect()
    return (df, labels)


/home/ubuntu/.venv/lib/python2.7/site-packages/ipykernel_launcher.py:2: FutureWarning: 'order_id' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  

In [14]:
def none_train(traindf, y):
    none_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc'],
        'num_leaves': 96,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5
    }
    d_train = lgb.Dataset(
        feature_select(traindf),
        label=y,
        categorical_feature=['aisle_id', 'department_id']
    )

    model = lgb.train(none_params, d_train, ROUNDS)  # was `params`, which ignored none_params above
    return model

def none_cv(traindf, y):
    d_train = lgb.Dataset(
        feature_select(traindf),
        label=y,
        categorical_feature=['aisle_id', 'department_id']
    )

    return lgb.cv(params, d_train, ROUNDS)

In [15]:
def none_predict(model, df):
    return model.predict(feature_select(df))

Train


In [2]:
import pickle
df_train = pd.read_pickle("df_train.pkl")
df_test = pd.read_pickle("df_test.pkl")
labels = pickle.load(open("labels.pkl", "rb"))
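
Nothing in the visible cells writes these pickles; presumably the frames from the Combine step were saved beforehand with something like the following (the exact save step is an assumption):

In [ ]:
# Presumed save step (not shown in this notebook)
df_train.to_pickle("df_train.pkl")
df_test.to_pickle("df_test.pkl")
pickle.dump(labels, open("labels.pkl", "wb"))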

In [31]:
def feature_select(df):
    return df.drop(
        ["user_id", "product_id", "order_id", "pred_ext"],
        axis=1, errors="ignore"
    )

In [7]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98

def train(traindf, y):
#     none_df, none_labels = none_features(traindf["order_id"].unique(), True)
#     none_model = none_train(none_df, none_labels)

    d_train = lgb.Dataset(
        feature_select(traindf),
        label=y,
        categorical_feature=['aisle_id', 'department_id']
    )

    model = lgb.train(params, d_train, ROUNDS)
    return model, None

In [8]:
model, none_model = train(df_train, labels)

Predict

F1 Expectation

This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in "Ye, N., Chai, K., Lee, W., and Chieu, H. Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."

It solves argmax over 0 <= k <= n and the indicator [[None]] of E[F1(P, k, [[None]])], where [[None]] indicates additionally predicting the label "None", given posteriors P = [p_1, p_2, ..., p_n] with p_1 > p_2 > ... > p_n, under the label-independence assumption, by means of dynamic programming in O(n²).


In [9]:
def get_expectations(P, pNone=None):
    expectations = []
    P = np.sort(P)[::-1]

    n = np.array(P).shape[0]
    DP_C = np.zeros((n + 2, n + 1))
    if pNone is None:
        pNone = (1.0 - P).prod()

    DP_C[0][0] = 1.0
    for j in range(1, n):
        DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

    for i in range(1, n + 1):
        DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
        for j in range(i + 1, n + 1):
            DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

    DP_S = np.zeros((2 * n + 1,))
    DP_SNone = np.zeros((2 * n + 1,))
    for i in range(1, 2 * n + 1):
        DP_S[i] = 1. / (1. * i)
        DP_SNone[i] = 1. / (1. * i + 1)
    for k in range(n + 1)[::-1]:
        f1 = 0
        f1None = 0
        for k1 in range(n + 1):
            f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
            f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
        for i in range(1, 2 * k - 1):
            DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
            DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
        expectations.append([f1None + 2 * pNone / (2 + k), f1])

    return np.array(expectations[::-1]).T

def maximize_expectation(P, pNone=None):
    expectations = get_expectations(P, pNone)

    ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
    max_f1 = expectations[ix_max]

    predNone = True if ix_max[0] == 0 else False
    best_k = ix_max[1]

    return best_k, predNone, max_f1

def _F1(tp, fp, fn):
    return 2 * tp / (2 * tp + fp + fn)

def _Fbeta(tp, fp, fn, beta=1.0):
    beta_squared = beta ** 2
    return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)


def print_best_prediction(P, pNone=None):
    print("Maximize F1-Expectation")
    print("=" * 23)
    P = np.sort(P)[::-1]
    n = P.shape[0]
    L = ['L{}'.format(i + 1) for i in range(n)]

    if pNone is None:
        print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
        pNone = (1.0 - P).prod()

    PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
    print("Posteriors: {} (n={})".format(PL, n))
    print("p(None|x)={}".format(pNone))

    opt = maximize_expectation(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += (L[:opt[0]])
    f1_max = opt[2]

    print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))

In [10]:
%%cython
import numpy as np
def get_expectations_cyt(P, pNone=None):
    expectations = []
    P = np.sort(P)[::-1]

    n = np.array(P).shape[0]
    DP_C = np.zeros((n + 2, n + 1))
    if pNone is None:
        pNone = (1.0 - P).prod()

    DP_C[0][0] = 1.0
    for j in range(1, n):
        DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

    for i in range(1, n + 1):
        DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
        for j in range(i + 1, n + 1):
            DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

    DP_S = np.zeros((2 * n + 1,))
    DP_SNone = np.zeros((2 * n + 1,))
    for i in range(1, 2 * n + 1):
        DP_S[i] = 1. / (1. * i)
        DP_SNone[i] = 1. / (1. * i + 1)
    for k in range(n + 1)[::-1]:
        f1 = 0
        f1None = 0
        for k1 in range(n + 1):
            f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
            f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
        for i in range(1, 2 * k - 1):
            DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
            DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
        expectations.append([f1None + 2 * pNone / (2 + k), f1])

    return np.array(expectations[::-1]).T

def maximize_expectation_cyt(P, pNone=None):
    expectations = get_expectations_cyt(P, pNone)

    ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
    max_f1 = expectations[ix_max]

    predNone = True if ix_max[0] == 0 else False
    best_k = ix_max[1]

    return best_k, predNone, max_f1

def print_best_prediction_cyt(P, pNone=None):
    print("Maximize F1-Expectation")
    print("=" * 23)
    P = np.sort(P)[::-1]
    n = P.shape[0]
    L = ['L{}'.format(i + 1) for i in range(n)]

    if pNone is None:
        print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
        pNone = (1.0 - P).prod()

    PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
    print("Posteriors: {} (n={})".format(PL, n))
    print("p(None|x)={}".format(pNone))

    opt = maximize_expectation_cyt(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += (L[:opt[0]])
    f1_max = opt[2]

    print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))

Predict


In [41]:
def final_predict(df_test, none_model=None, TRESHOLD=0.5):
    d = dict()

    if none_model:
        none_df, _ = none_features(df_test["order_id"].unique(), False)
        none_df["pred"] = none_predict(none_model, none_df)
        none_model_res = none_df.set_index("order_id")["pred"].to_dict()

    # Here, instead of cutting by a fixed threshold, a model could determine the number of purchases
    current_order_id = None
    current_order_count = 0
    current_order_basket_size = 0
    for row in tqdm_notebook(df_test.sort_values(
        by=["order_id", "pred"], 
        ascending=[False, False]
    ).itertuples(), total=len(df_test)):
        order_id = row.order_id
        if order_id != current_order_id:
            current_order_id = order_id
            current_order_count = 0
            P = df_test[df_test.order_id == order_id].pred.values

#             if none_model and none_model_res[order_id] > TRESHOLD:
#                 current_order_basket_size = 0
#             else:
            best_k, predNone, max_f1 = maximize_expectation_cyt(P)
            current_order_basket_size = best_k
            if predNone:
                d[order_id] = 'None'
            
        if current_order_count >= current_order_basket_size:
            continue

        current_order_count += 1
        try:
            d[order_id] += ' ' + str(row.product_id)
        except KeyError:
            d[order_id] = str(row.product_id)

    for order_id in df_test.order_id:
        if order_id not in d:
            d[order_id] = 'None'

    sub = pd.DataFrame.from_dict(d, orient='index')
    sub.reset_index(inplace=True)
    sub.columns = ['order_id', 'products']
    return sub

def predict(model, df_test, none_model=None, TRESHOLD=0.5):
    ### build candidates list for test ###

    df_test['pred'] = model.predict(feature_select(df_test))
    if "pred_ext" in list(df_test.columns):
        df_test['pred'] = (
            df_test['pred'] * 0.2 + 
            df_test['pred_ext'] * 0.8
        )
        print("average pred and pred_ext")
    return final_predict(df_test, none_model=none_model, TRESHOLD=TRESHOLD)

In [37]:
# Load the external prediction
pred_ext = pd.read_csv("prediction_lgbm.csv").rename(
    columns={"prediction": "pred_ext"}
)
df_test = df_test.merge(pred_ext, on=["order_id", "product_id"])
df_test.head()


Out[37]:
order_id product_id user_id aisle_id department_id user_total_orders user_total_items user_total_distinct_items user_average_days_between_orders user_max_days_between_orders ... UD_last_order_number_prc UD_first_order_number_prc UD_orders_since_last UD_orders_rate_since_first_order UD_weeks_sinse_last UD_days_sinse_last UD_delta_hour_vs_last pred_ext_x pred pred_ext_y
0 2774568 17668 3 91 16 13 88 33 12.0 21.0 ... 0.923077 0.076923 1 1.615385 -2.0 -11.0 0 0.351492 0.347998 0.351492
1 2774568 39190 3 91 16 13 88 33 12.0 21.0 ... 0.923077 0.076923 1 1.615385 -2.0 -11.0 0 0.789540 0.782418 0.789540
2 2774568 44683 3 83 4 13 88 33 12.0 21.0 ... 0.923077 0.076923 1 2.923077 -2.0 -11.0 0 0.066456 0.067806 0.066456
3 2774568 21903 3 123 4 13 88 33 12.0 21.0 ... 0.923077 0.076923 1 2.923077 -2.0 -11.0 0 0.538955 0.554593 0.538955
4 2774568 14992 3 83 4 13 88 33 12.0 21.0 ... 0.923077 0.076923 1 2.923077 -2.0 -11.0 0 0.074164 0.075875 0.074164

5 rows × 115 columns


In [42]:
sub = final_predict(df_test)
sub.to_csv('sub2.csv', index=False)




In [43]:
sub = final_predict(pd.read_csv("prediction_lgbm.csv").rename(
    columns={"prediction": "pred"}
))
sub.to_csv('sub.csv', index=False)




In [35]:
sub = predict(model, df_test, none_model, TRESHOLD=0.8)
sub.to_csv('sub2.csv', index=False)


average pred and pred_ext

CV


In [ ]:
# assumes d_train was built beforehand, e.g. as inside train() above
lgb.cv(params, d_train, ROUNDS, nfold=5, verbose_eval=10)

In [21]:
%%cache df_train_gt.pkl df_train_gt

from functools import partial

products_raw = pd.read_csv(IDIR + 'products.csv')
# combine aisles, departments and products (left joined to products)
goods = pd.merge(left=pd.merge(left=products_raw, right=departments, how='left'), right=aisles, how='left')
# replace spaces with underscores and lowercase to make product names more uniform
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower() 

# retype goods to reduce memory usage
goods.product_id = goods.product_id.astype(np.int32)
goods.aisle_id = goods.aisle_id.astype(np.int16)
goods.department_id = goods.department_id.astype(np.int8)

# initialize it with train dataset
train_details = pd.merge(
                left=op_train,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

# add order hierarchy
train_details = pd.merge(
                left=train_details,
                right=goods[['product_id', 
                             'aisle_id', 
                             'department_id']].apply(partial(pd.to_numeric, 
                                                             errors='ignore', 
                                                             downcast='integer')),
                how='left',
                on='product_id'
)

train_gtl = []

for uid, subset in train_details.groupby('user_id'):
    subset1 = subset[subset.reordered == 1]
    oid = subset.order_id.values[0]

    if len(subset1) == 0:
        train_gtl.append((oid, 'None'))
        continue

    ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
    # .strip() is just a safeguard; ' '.join() itself adds no trailing space
    train_gtl.append((oid, ostr.strip()))

del train_details
del goods
del products_raw

gc.collect()

df_train_gt = pd.DataFrame(train_gtl)

df_train_gt.columns = ['order_id', 'products']
df_train_gt.set_index('order_id', inplace=True)
df_train_gt.sort_index(inplace=True)


[Skipped the cell's code and loaded variables df_train_gt from file '/home/ubuntu/kaggle/instacart/df_train_gt.pkl'.]
54

In [22]:
from sklearn.model_selection import GroupKFold

def f1_score(cvpred):
    joined = df_train_gt.join(cvpred, rsuffix="_cv", how="inner")
    lgts = joined.products.replace("None", "-1").apply(lambda x: x.split(" ")).values
    lpreds = joined.products_cv.replace("None", "-1").apply(lambda x: x.split(" ")).values
    f1 = []
    for lgt, lpred in zip(lgts, lpreds):
        rr = (np.intersect1d(lgt, lpred))
        precision = np.float(len(rr)) / len(lpred)
        recall = np.float(len(rr)) / len(lgt)

        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
    return np.mean(f1)

def cv(threshold=0.5, n=5):
    nsplits = n
    if n == 1:
        nsplits = 2
    gkf = GroupKFold(n_splits=nsplits)

    scores = []
    for train_idx, test_idx in gkf.split(df_train.index, groups=df_train.user_id):
        dftrain = df_train.iloc[train_idx]
        dftest = df_train.iloc[test_idx]
        y = labels[train_idx]
        model, none_model = train(dftrain, y)
        pred = predict(model, dftest, none_model, threshold)
        f1 = f1_score(pred.set_index("order_id"))
        print(f1)
        scores.append(f1)
        del dftrain
        del dftest
        gc.collect()
        if n == 1:
            break

    return np.mean(scores), np.std(scores)
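
As a sanity check of f1_score, scoring the ground truth against itself should give exactly 1.0:

In [ ]:
# df_train_gt joined with itself: predictions match ground truth, so mean F1 is 1.0
f1_score(df_train_gt)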

In [23]:
cv(n=1)


100%|██████████| 65486/65486 [00:00<00:00, 876772.24it/s]
user related features
order related features
/home/ubuntu/.venv/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
100%|██████████| 65723/65723 [00:00<00:00, 208003.96it/s]
user related features
order related features
0.230950656711
Out[23]:
(0.23095065671059606, 0.0)

In [37]:
for th in [0.8, 0.7, 0.6, 0.5, 0.4]:
    print(th)
    print("\t", cv(threshold=th, n=1))
    print()


0.8
100%|██████████| 65604/65604 [00:00<00:00, 967082.86it/s]
user related features
order related features
/home/ubuntu/.venv/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
100%|██████████| 65605/65605 [00:00<00:00, 1830799.37it/s]
user related features
order related features
0.386930142523
	 (0.3869301425230352, 0.0)

0.7
100%|██████████| 65604/65604 [00:00<00:00, 977034.20it/s]
user related features
order related features
100%|██████████| 65605/65605 [00:00<00:00, 1803772.60it/s]
user related features
order related features
0.38692823718
	 (0.3869282371804546, 0.0)

0.6
100%|██████████| 65604/65604 [00:00<00:00, 1002751.81it/s]
user related features
order related features
100%|██████████| 65605/65605 [00:00<00:00, 1994255.07it/s]
user related features
order related features
0.386927269387
	 (0.3869272693873978, 0.0)

0.5
100%|██████████| 65604/65604 [00:00<00:00, 878798.13it/s]
user related features
order related features
100%|██████████| 65605/65605 [00:00<00:00, 2172933.92it/s]
user related features
order related features
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-37-4bcb54de3a36> in <module>()
      1 for th in [0.8, 0.7, 0.6, 0.5, 0.4]:
      2     print(th)
----> 3     print("\t", cv(threshold=th, n=1))
      4     print()

<ipython-input-35-d5a6bf1c9b05> in cv(threshold, n)
     27         y = labels[train_idx]
     28         model, none_model = train(dftrain, y)
---> 29         pred = predict(model, dftest, none_model, threshold)
     30         f1 = f1_score(pred.set_index("order_id"))
     31         print(f1)

<ipython-input-34-e637be62bb3d> in predict(model, df_test, none_model, TRESHOLD)
     36                 current_order_basket_size = 0
     37             else:
---> 38                 best_k, predNone, max_f1 = F1Optimizer.maximize_expectation(P)
     39                 if predNone:
     40                     current_order_basket_size = 0

<ipython-input-15-c449794eaafe> in maximize_expectation(P, pNone)
     42     @staticmethod
     43     def maximize_expectation(P, pNone=None):
---> 44         expectations = F1Optimizer.get_expectations(P, pNone)
     45 
     46         ix_max = np.unravel_index(expectations.argmax(), expectations.shape)

<ipython-input-15-c449794eaafe> in get_expectations(P, pNone)
     31             f1None = 0
     32             for k1 in range(n + 1):
---> 33                 f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
     34                 f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
     35             for i in range(1, 2 * k - 1):

KeyboardInterrupt: 
  9%| 364524/4237331 [03:20<35:27, 1820.44it/s]

0.372658477911

0.9: 0.386930142523

0.8: 0.386930142523

0.7: 0.38692823718

0.6: 0.386927269387

score with the None model mixed in: 0.378085325812

without None: 0.386930142523


Threshold sweep, 5-fold CV (per-fold F1 scores, then (mean, std)):

0.18
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)

0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)

0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)

0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)

0.22
0.374504880043
0.372459365153
0.374241429517
0.373332070018
0.370178093483
(0.37294316764289259, 0.0015591904647740879)

0.24
0.370290530162
0.369518178297
0.370515696117
0.369568282123
0.3673846793
(0.36945547319979183, 0.0011069090226251931)

0.26
0.363691285892
0.363725106289
0.363492700824
0.364412180878
0.363024994542
(0.36366925368510306, 0.00044761289123321511)

Model for predicting the number of purchases


In [3]:
prior_orders_count = priors[["order_id", "product_id"]].groupby("order_id").count()
prior_orders_count = prior_orders_count.rename(columns={"product_id": "product_counts"})

train_orders_count = op_train.drop(["product_id", "order_id"], axis=1, errors="ignore")
train_orders_count = train_orders_count.reset_index()[["order_id", "product_id"]].groupby("order_id").count()
train_orders_count = train_orders_count.rename(columns={"product_id": "product_counts"})

prior_orders_count = orders.join(prior_orders_count, how='inner')
train_orders_count = orders.join(train_orders_count, how='inner')
prior_orders_count.head(15)


Out[3]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order product_counts
2 473747 1 prior 3 3 12 21.0 9
3 2254736 1 prior 4 4 7 29.0 8
4 431534 1 prior 5 4 15 28.0 13
5 3367565 1 prior 6 2 7 19.0 26
6 550135 1 prior 7 1 9 20.0 3
7 3108588 1 prior 8 1 14 14.0 2
8 2295261 1 prior 9 1 16 0.0 1
9 2550362 1 prior 10 4 8 30.0 15
10 1187899 1 train 11 4 8 14.0 15
11 2168274 2 prior 1 2 11 NaN 5
12 1501582 2 prior 2 5 10 10.0 15
13 1901567 2 prior 3 1 10 3.0 13
14 738281 2 prior 4 2 10 8.0 11
15 1673511 2 prior 5 3 11 8.0 5
16 1199898 2 prior 6 2 9 13.0 3

In [4]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error


def get_order_count(order, alpha=0.5):
    user_id = order["user_id"]
    df = prior_orders_count[prior_orders_count["user_id"] == user_id]
    feats = ["order_number", "order_dow", "order_hour_of_day", "days_since_prior_order"]
    X = df[feats].fillna(0).values
    y = df["product_counts"].values

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X, y)

    # specify your configurations as a dict
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0,
    }

    # train
    clf = lgb.train(params,
                    lgb_train,
                    num_boost_round=40)

#     clf = Lasso(alpha=0.01)
#     clf.fit(X, y)

    # NaN is truthy in Python, so `order[f] or 0` would not mask missing values;
    # nan_to_num does the actual NaN -> 0 replacement
    Xpred = np.nan_to_num(np.array([order[f] for f in feats], dtype=np.float64)).reshape(1, -1)
    return clf.predict(Xpred)[0]

df = train_orders_count.head(1000).copy()  # .copy() avoids the SettingWithCopyWarning
df["pred_products_count"] = df.apply(get_order_count, axis=1)

print(mean_squared_error(
            df["product_counts"],
            df["pred_products_count"]
        ))


75.973135734

None handling model


In [ ]:
### build the order-level candidate list for the None model, with features ###

def none_features(selected_orders, labels_given=False):
    order_list = []
    labels = []
    for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
        order_id = row.order_id
        order_list += [order_id]
        if labels_given:
            # none_labels comes from the "None handling Model" cell above
            labels += [none_labels[order_id]]
        
    df = pd.DataFrame({'order_id': order_list})
    df.order_id = df.order_id.astype(np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    df['user_period'] =  df.user_id.map(users.period)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    gc.collect()
    return (df, labels)

Test Big Problem


In [10]:
train_index = set(op_train.index)
train_orders = orders[orders.eval_set == 'train']
selected_orders = train_orders
labels_given=True
order_list = []
product_list = []
labels = []
for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
    order_id = row.order_id
    user_id = row.user_id
    user_products = list(users.all_products[user_id])
    product_list += user_products
    order_list += [order_id] * len(user_products)
    if labels_given:
        labels += [
            (order_id, product) in train_index 
            for product in user_products
        ]


100%|██████████| 131209/131209 [00:11<00:00, 11402.83it/s]

In [18]:
df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id
df['user_id'] = df.order_id.map(orders.user_id)
df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int16)  # 134 aisles would overflow int8
df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)

labels = np.array(labels, dtype=np.int8)
# del order_list
# del product_list

print('user related features')
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_max_days_between_orders'] = df.user_id.map(users.max_days_between_orders)
df['user_min_days_between_orders'] = df.user_id.map(users.min_days_between_orders)
df['user_std_days_between_orders'] = df.user_id.map(users.std_days_between_orders)
df['user_average_basket'] =  df.user_id.map(users.average_basket)

df['user_reorders'] =  df.user_id.map(users.reorders)
df['user_reorder_rate'] =  df.user_id.map(users.reorder_rate)
df['user_period'] =  df.user_id.map(users.period)

print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

print('product related features')
df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
df['product_users'] = df.product_id.map(products.users).astype(np.int32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders).astype(np.int32)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
df['product_add_to_cart_order_mean'] = df.product_id.map(products.add_to_cart_order_mean).astype(np.float32)
df['product_add_to_cart_order_std'] = df.product_id.map(products.add_to_cart_order_std).astype(np.float32)

print('aisle related features')
df['aisle_orders'] = df.aisle_id.map(aisles.orders).astype(np.int32)
df['aisle_users'] = df.aisle_id.map(aisles.users).astype(np.int32)
df['aisle_order_freq'] = df.aisle_id.map(aisles.order_freq).astype(np.float32)
df['aisle_reorders'] = df.aisle_id.map(aisles.reorders).astype(np.int32)
df['aisle_reorder_rate'] = df.aisle_id.map(aisles.reorder_rate).astype(np.float32)
df['aisle_add_to_cart_order_mean'] = df.aisle_id.map(aisles.add_to_cart_order_mean).astype(np.float32)
df['aisle_add_to_cart_order_std'] = df.aisle_id.map(aisles.add_to_cart_order_std).astype(np.float32)

print('department related features')
df['department_orders'] = df.department_id.map(departments.orders).astype(np.int32)
df['department_users'] = df.department_id.map(departments.users).astype(np.int32)
df['department_order_freq'] = df.department_id.map(departments.order_freq).astype(np.float32)
df['department_reorders'] = df.department_id.map(departments.reorders).astype(np.int32)
df['department_reorder_rate'] = df.department_id.map(departments.reorder_rate).astype(np.float32)
df['department_add_to_cart_order_mean'] = df.department_id.map(departments.add_to_cart_order_mean).astype(np.float32)
df['department_add_to_cart_order_std'] = df.department_id.map(departments.add_to_cart_order_std).astype(np.float32)

print('user_X_product related features')
# composite (user, product) key; product_id < 100000 keeps the pair unique,
# but note that user_id * 100000 can exceed the int32 range
# (see the note after the traceback below)
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
# df['UP_first_order_id'] = df.z.map(userXproduct.first_order_id)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
df['UP_reorders'] = df.z.map(userXproduct.reorders)
df['UP_last_order_number'] = df.UP_last_order_id.map(orders.order_number)
# df['UP_first_order_number'] = df.UP_first_order_id.map(orders.order_number)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_number
# df['UP_orders_rate_since_first_order'] = df.UP_orders / (df.user_total_orders - df.UP_first_order_number + 1)

df['UP_weeks_sinse_last'] = df.UP_last_order_id.map(orders.user_weekno_rev)
df['UP_days_sinse_last'] = df.UP_last_order_id.map(orders.user_days_rev)

# hour-of-day distance on a circular 24h clock (e.g. hours 23 and 1 are 2 apart)
df['UP_delta_hour_vs_last'] = abs(
    df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)
).map(lambda x: min(x, 24 - x)).astype(np.int8)

#df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
#                                              df.order_id.map(orders.order_dow)

df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)

gc.collect()


user related features
order related features
product related features
aisle related features
department related features
user_X_product related features
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-aca1d9fe66dd> in <module>()
     75 df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day -               df.UP_last_order_id.map(orders.order_hour_of_day)).map(
     76     lambda x: min(x, 24-x)
---> 77 ).astype(np.int8)

[... pandas astype internals elided; the failure is in astype_nansafe ...]

/home/ubuntu/.venv/local/lib/python2.7/site-packages/pandas/core/dtypes/cast.pyc in astype_nansafe(arr, dtype, copy)
    619         if not np.isfinite(arr).all():
--> 620             raise ValueError('Cannot convert non-finite values (NA or inf) to '
    621                              'integer')

ValueError: Cannot convert non-finite values (NA or inf) to integer
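
The cast fails because the mapped hour column contains NaN: some z keys find no match in userXproduct. One hypothesis worth checking (an assumption, not something this notebook confirms) is int32 overflow: user_id * 100000 exceeds the int32 maximum (about 2.1e9) once user_id passes about 21,474, silently wrapping and producing keys that can disagree with however userXproduct was keyed. A minimal diagnostic sketch under that assumption, run in the failed cell's state (z and UP_last_order_id still exist, since the drop never executed):

In [ ]:
# rebuild the composite key in int64, where user_id * 100000 cannot wrap
df['z64'] = df.user_id.astype(np.int64) * 100000 + df.product_id
print('keys changed by the int64 rebuild:', (df.z64 != df.z).sum())

# how many candidate rows are actually missing a last-order hour?
last_hour = df.UP_last_order_id.map(orders.order_hour_of_day)
print('rows with missing hour:', last_hour.isnull().sum())

# keep NaN representable: fill (or stay in float) before any integer cast
delta = abs(df.order_hour_of_day - last_hour).map(lambda x: min(x, 24 - x))
df['UP_delta_hour_vs_last'] = delta.fillna(-1).astype(np.int8)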

In [13]:
df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id.astype(np.int32)
df['user_id'] = df.order_id.map(orders.user_id)
df['user_id'] = df['user_id'].astype(np.int32)

print('user related features')

df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_period'] = df.user_id.map(users.period)

print('order related features')
# df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

print('product related features')
df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
df['product_users'] = df.product_id.map(products.users).astype(np.float32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

print('user_X_product related features')
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)

df['UP_reorders'] = df.z.map(userXproduct.reorders)

df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
# circular 24h hour distance; the astype fails for the same reason as In [18]
df['UP_delta_hour_vs_last'] = abs(
    df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)
).map(lambda x: min(x, 24 - x)).astype(np.int8)

#     df['UP_days_past_last_buy'] = 
#df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
#                                              df.order_id.map(orders.order_dow)

df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)


user related features
order related features
product related features
user_X_product related features
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-11906d9f9e5f> in <module>()
     40 df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
---> 41 df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day -               df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)

[... same pandas astype internals as in In [18] ...]

ValueError: Cannot convert non-finite values (NA or inf) to integer
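
Same non-finite cast as In [18]: presumably any candidate whose z key misses in userXproduct carries NaN through every UP_* column. This cell already tolerates NaN in the product counts by casting them to float32; giving the UP_* columns the same treatment (or an explicit fill) would let it complete. A hedged sketch:

In [ ]:
# float32 tolerates NaN, unlike the int8 target of the failing cast
df['UP_delta_hour_vs_last'] = abs(
    df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)
).map(lambda x: min(x, 24 - x)).astype(np.float32)

# or drop candidates with no user-product history at all
df = df[df.UP_orders.notnull()]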

In [4]:
ds = pd.read_pickle("../../imba/data/dataset.pkl")


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-d937f07e2aa9> in <module>()
----> 1 ds = pd.read_pickle("../../imba/data/dataset.pkl")

[... pandas pickle-compat and stdlib pickle frames elided ...]

/usr/lib/python2.7/pickle.pyc in load_proto(self)
    890         proto = ord(self.read(1))
    891         if not 0 <= proto <= 2:
--> 892             raise ValueError, "unsupported pickle protocol: %d" % proto

ValueError: unsupported pickle protocol: 4
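
Pickle protocol 4 was introduced in Python 3.4, and the pickle module in this kernel's Python 2.7 only understands protocols 0-2, so dataset.pkl was presumably written from a Python 3 session and cannot be loaded here. A sketch of a workaround, to be run in the (assumed) Python 3 environment that produced the file:

In [ ]:
# Python 3: re-serialize with protocol 2 so the Python 2.7 kernel can read it
# (dataset_p2.pkl is a hypothetical output name chosen here)
import pickle
import pandas as pd

ds = pd.read_pickle("../../imba/data/dataset.pkl")
with open("../../imba/data/dataset_p2.pkl", "wb") as f:
    pickle.dump(ds, f, protocol=2)

Newer pandas versions also expose a protocol argument on DataFrame.to_pickle, which achieves the same thing without the raw pickle call.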

In [ ]: