This dataset lets us measure order assortment in several ways.
Areas of interest:
Hypothesis:
Orders that are less diverse (fewer distinct categories) and more consistent (fewer new products, a consistent add-to-cart sequence) will tend to show the following behavior:
In [2]:
import pandas as pd
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import calendar
In [3]:
# Load the raw input tables.
# The input directory is named once so the notebook can be pointed at a
# different location by editing a single line; the resolved paths are
# identical to the previously hard-coded ones.
DATA_DIR = '../Instacart_Input'

orders = pd.read_csv(DATA_DIR + '/orders.csv')
prior_set = pd.read_csv(DATA_DIR + '/order_products__prior.csv')
train_set = pd.read_csv(DATA_DIR + '/order_products__train.csv')
aisles = pd.read_csv(DATA_DIR + '/aisles.csv')
departments = pd.read_csv(DATA_DIR + '/departments.csv')
products = pd.read_csv(DATA_DIR + '/products.csv')
In [4]:
# Build a single product lookup table:
# 1. join the helper tables 'departments' and 'aisles' onto 'products'
# 2. drop the id columns that were only needed as join keys
products_df = (
    products
    .merge(departments, on='department_id')
    .merge(aisles, on='aisle_id')
    .drop(['department_id', 'aisle_id'], axis=1)
)
products_df.head()
Out[4]:
In [5]:
# Join the product lookup onto every prior-order line item, then sort the rows
# by 'order_id' and, within each order, by 'add_to_cart_order'.
market_basket_prior = (
    pd.merge(prior_set, products_df, on='product_id')
    .sort_values(by=['order_id', 'add_to_cart_order'])
)
market_basket_prior.head(10)
Out[5]:
In [6]:
# Reorder rate across every line item in the prior orders.
# The flag counts are computed once and reused; .get(..., 0) avoids a KeyError
# when one of the two flag values (0 or 1) never occurs in the column.
reorder_counts = market_basket_prior['reordered'].value_counts()
reordered = reorder_counts.get(1, 0)
not_reordered = reorder_counts.get(0, 0)
flagged_total = reordered + not_reordered
# Guard the division so an empty table reports 0.0% instead of failing.
reorder_rate = (reordered / float(flagged_total) * 100) if flagged_total else 0.0
pct_reordered = str(round(reorder_rate, 2)) + '%'
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('Instances of Reordered Items: ' + str(market_basket_prior.reordered.sum()))
print('Total Number of Items Purchased: ' + str(market_basket_prior.shape[0]))
print('\nUnique instances of a previously ordered product being ordered:' + '\nReorder Rate: ' + pct_reordered + '\n')
In [7]:
# Refine the rate by excluding the first order (order_number == 1), which
# arguably dilutes the reorder rate. Only rows of 'orders' whose eval_set is
# 'prior' are kept before joining their line items from 'prior_set'.
later_prior_orders = (orders['order_number'] > 1) & (orders['eval_set'] == 'prior')
excluding_first_order = (
    orders[later_prior_orders]
    .sort_values('user_id')
    .merge(prior_set, on='order_id')
)
# The flag counts are computed once and reused; .get(..., 0) avoids a KeyError
# when one of the two flag values (0 or 1) never occurs in the column.
reorder_counts = excluding_first_order['reordered'].value_counts()
reordered = reorder_counts.get(1, 0)
not_reordered = reorder_counts.get(0, 0)
flagged_total = reordered + not_reordered
# Guard the division so an empty selection reports 0.0% instead of failing.
reorder_rate = (reordered / float(flagged_total) * 100) if flagged_total else 0.0
pct_reordered = str(round(reorder_rate, 2)) + '%'
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('Instances of Reordered Items: ' + str(excluding_first_order.reordered.sum()))
print('Total Number of Items Purchased: ' + str(excluding_first_order.shape[0]))
print('\nUnique instances of a previously ordered product being ordered:' + '\nReorder Rate: ' + pct_reordered + '\n')
In [10]:
# Total number of line items per department across the whole prior order set
# (one count per department, summed over all orders), largest first.
# The aggregation is computed once and shared by the chart and the table.
dept_counts = market_basket_prior.groupby(['department']).size().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 5))
# Draw on the axes created above explicitly rather than on the implicit
# "current axes".
dept_counts.plot(kind='bar', fontsize=15, ax=ax)
# Show plain integers on the y axis instead of scientific notation.
ax.ticklabel_format(axis='y', style='plain')
plt.xticks(ha='right', rotation=55)
ax.set_title("Categories by Total Instances in Orders", fontsize=18)

dept_totals = dept_counts.to_frame('Totals')
dept_totals.head(10)
Out[10]:
In [11]:
# Most popular products by number of line items in the prior order set.
# The aggregation is computed once; the chart shows the top 20 and the table
# keeps every product, largest first.
product_counts = market_basket_prior.groupby(['product_name']).size().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 5))
# Draw on the axes created above explicitly rather than on the implicit
# "current axes".
product_counts.head(20).plot(kind='bar', fontsize=15, ax=ax)
# Show plain integers on the y axis instead of scientific notation.
ax.ticklabel_format(axis='y', style='plain')
plt.xticks(ha='right', rotation=55)
ax.set_title("Products by Total Instances in Orders\n Long live Bananas", fontsize=18)

product_totals = product_counts.to_frame('Totals')
product_totals.head(10)
Out[11]: