In [8]:
import pandas as pd
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import calendar
In [9]:
# First, let's import requisite files
orders = pd.read_csv('../Instacart_Input/orders.csv')
prior_set = pd.read_csv('../Instacart_Input/order_products__prior.csv')
train_set = pd.read_csv('../Instacart_Input/order_products__train.csv')
aisles = pd.read_csv('../Instacart_Input/aisles.csv')
departments = pd.read_csv('../Instacart_Input/departments.csv')
products = pd.read_csv('../Instacart_Input/products.csv')
products.head()
Out[9]:
In [10]:
# To avoid confusion down the road, I am changing the names of two aisles
# 'missing' and 'other', which share the same string values as their parent department
aisles.loc[5,'aisle'] = 'other aisle'
aisles.loc[99,'aisle'] = 'missing aisle'
aisles.head(10)
Out[10]:
In [11]:
# The aisles.csv and departments.csv files more useful when merged into the products.csv
# Departments appear to represent larger groups, with sub-groups represented by aisle,
# not unlike a physical store.
# Personally, I can more easily conceptualize this thusly:
# 'departments' == Categories, and 'aisles' == Sub-Categories.
# Merge Aisle and Department information into Product df
aisle_names = aisles['aisle']
dept_names = departments['department']
products['Category'] = dept_names[products
['department_id'].values-1].values
products['Sub_Category'] = dept_names[products
['department_id'
].values-1
].values + " >> " + aisle_names[products['aisle_id'
].values-1].values
cols = ['product_id',
'product_name',
'Category',
'Sub_Category',
'aisle_id',
'department_id',
]
products = products[cols]
products.sort_values(by=['Category', 'Sub_Category']).head()
Out[11]:
In [12]:
# Example of missing category / aisle
products[products['product_id'] == 6728]
Out[12]:
In [13]:
# Let's list the Category and Aisle counts, grouped alphabetically, arranged hierarchically.
data = {'Product_Count' : products['Category'].value_counts(),
'_Product_Count' : products['Sub_Category'].value_counts()
}
counts_df = pd.DataFrame(data=data).fillna(value='--')
counts_df.index.names = ['Category']
counts_df.sort_index()
Out[13]:
In [14]:
# products categorized as'missing' or 'other' account for 1,806 products,
# roughly 3.634% of the 49,688 total products. Not exactly negligible.
counts_df.sort_index()[99:103]
Out[14]:
In [15]:
f, ax = plt.subplots(figsize=(14,10), ncols=1)
products['Category'].value_counts().plot(kind='bar')
_= ax.set_title('Categories by Total Products', size=22)
_= ax.set_ylabel('Count', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);
In [16]:
# Let's plot sub-category (aisle) as a proportion of category (department)
#sns.barplot(x = stacked_bar_data.Group
#products.pivot('','')[].plot(kind='bar', stacked=True)
In [17]:
data1 = {'_Product_Count' : products['Sub_Category'].value_counts()
}
data2 = {'Product_Count' : products['Category'].value_counts()
}
d1_counts_df = pd.DataFrame(data=data1)
d2_counts_df = pd.DataFrame(data=data2)
data1['_Product_Count'].value_counts().sum()
d2_counts_df['Product_Count']['other']
Out[17]:
In [18]:
data3 = { 'No. of Sub-Categories' : {},
'Products in Sub-Category' : {},
'Parent' : {},
'Products in Parent Category' : {}
}
for i in range(len(d1_counts_df['_Product_Count'])):
#print counts_df.index[i], counts_df['_Product_Count'][i]
match = re.findall(r'([\w ]+)( >> )([\w ]+)', d1_counts_df.index[i])
parent, child = match[0][0], match[0][2]
if parent in data3['No. of Sub-Categories']:
data3['No. of Sub-Categories'][parent] += 1
data3['Products in Sub-Category'][child] = d1_counts_df['_Product_Count'][i]
else:
data3['No. of Sub-Categories'][parent] = 1
data3['Products in Parent Category'][parent] = d2_counts_df['Product_Count'][parent]
data3['Products in Sub-Category'][child] = d1_counts_df['_Product_Count'][i]
if child not in data3['Parent']:
data3['Parent'][child] = parent
data3['Products in Parent Category'][child] = d2_counts_df['Product_Count'][parent]
stacked_category_data = pd.DataFrame(data=data3)
stacked_category_data['Parent'
] = stacked_category_data['Parent'
].fillna('None')
stacked_category_data['Products in Sub-Category'
] = stacked_category_data['Products in Sub-Category'
].fillna('N/A')
stacked_category_data['No. of Sub-Categories'
] = stacked_category_data['No. of Sub-Categories'
].fillna('None')
stacked_category_data.sort_values(by=['Parent', 'Products in Parent Category'])[21:40]
Out[18]:
In [19]:
# Let's plot Sub_Category product groups (aisle)
# as a proportion of total products in Parent Category (department)
newstacked = stacked_category_data.groupby(['Parent',
stacked_category_data.index,
'Products in Sub-Category'
]).size()[21:40]
newstacked
Out[19]:
In [20]:
aisles_only = stacked_category_data.sort_values(by=
['Parent',
'Products in Parent Category'])[21:]
Top15 = aisles_only.sort_values(by='Products in Sub-Category',
ascending=False)[:15]
Bottom15 = aisles_only.sort_values(by='Products in Sub-Category',
ascending=False)[119:]
In [21]:
# Plotting top and Bottom 15 Sub-Categories by Product Count
f, ax = plt.subplots(figsize=(14,5))
Top15['Products in Sub-Category'].plot(kind='bar')
_= ax.set_title('Top 15 Sub-Categories by Count', size=22)
_= ax.set_ylabel('Count', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);
In [22]:
# Notice that the larger Sub-Categories reflect more vague grouping criteria,
# e.g. "missing" isn't exactly specific
# which suggest less helpful groupings for making predictions
# as opposed to the smaller sub-categories below
f, ax = plt.subplots(figsize=(14,5))
Bottom15['Products in Sub-Category'].plot(kind='bar')
_= ax.set_title('Bottom 15 Sub-Categories by Count', size=22)
_= ax.set_ylabel('Count', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);
In [93]:
# Now let's try to find some category trends. First, let's check out the alcohol category.
master_set = pd.merge(pd.merge(prior_set, orders, on='order_id', how='right'), products,
on='product_id',
how='left')
master_set.head()
Out[93]:
In [91]:
# Let's plot a heatmap of the count of products ordered in the alcohol category by hour and day
category_five = master_set[master_set['department_id'] == 5.0]
cat_heatmap_data = pd.DataFrame(category_five[['order_dow', 'order_hour_of_day']].groupby(['order_dow',
'order_hour_of_day']
).size()
).reset_index()
cat_heatmap_data = cat_heatmap_data.pivot(index='order_hour_of_day',
columns='order_dow',
values=0)
f, ax = plt.subplots(figsize=(16,12))
_= ax.set_title('Alcohol Category:\n Time-Series Trend', size=22)
_= ax.set_ylabel('Hour of Day', size=20, labelpad=15)
_= ax.set_xlabel('Day of Week', size=20, labelpad=15)
_= ax.tick_params(labelsize=16)
sns.heatmap(cat_heatmap_data, ax=ax, annot=True, fmt="d");
In [99]:
bananas = master_set[master_set['product_id'] == 24852]
cat_heatmap_data = pd.DataFrame(bananas[['order_dow', 'order_hour_of_day']].groupby(['order_dow',
'order_hour_of_day']
).size()
).reset_index()
cat_heatmap_data = cat_heatmap_data.pivot(index='order_hour_of_day',
columns='order_dow',
values=0)
f, ax = plt.subplots(figsize=(16,12))
_= ax.set_title('Organic Bananas:\n Time-Series Trend', size=22)
_= ax.set_ylabel('Hour of Day', size=20, labelpad=15)
_= ax.set_xlabel('Day of Week', size=20, labelpad=15)
_= ax.tick_params(labelsize=16)
sns.heatmap(cat_heatmap_data, ax=ax, annot=True, fmt="d");