notebook.community

Edit and run



In [8]:

    
import pandas as pd
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import calendar



In [9]:

    
# First, let's import requisite files
orders      = pd.read_csv('../Instacart_Input/orders.csv')
prior_set   = pd.read_csv('../Instacart_Input/order_products__prior.csv')
train_set   = pd.read_csv('../Instacart_Input/order_products__train.csv')
aisles      = pd.read_csv('../Instacart_Input/aisles.csv')
departments = pd.read_csv('../Instacart_Input/departments.csv')
products    = pd.read_csv('../Instacart_Input/products.csv')

products.head()









    Out[9]:







  
    
      
      product_id
      product_name
      aisle_id
      department_id
    
  
  
    
      0
      1
      Chocolate Sandwich Cookies
      61
      19
    
    
      1
      2
      All-Seasons Salt
      104
      13
    
    
      2
      3
      Robust Golden Unsweetened Oolong Tea
      94
      7
    
    
      3
      4
      Smart Ones Classic Favorites Mini Rigatoni Wit...
      38
      1
    
    
      4
      5
      Green Chile Anytime Sauce
      5
      13



In [10]:

    
# To avoid confusion down the road, I am changing the names of two aisles
# 'missing' and 'other', which share the same string values as their parent department 

aisles.loc[5,'aisle'] = 'other aisle'
aisles.loc[99,'aisle'] = 'missing aisle'
aisles.head(10)









    Out[10]:







  
    
      
      aisle_id
      aisle
    
  
  
    
      0
      1
      prepared soups salads
    
    
      1
      2
      specialty cheeses
    
    
      2
      3
      energy granola bars
    
    
      3
      4
      instant foods
    
    
      4
      5
      marinades meat preparation
    
    
      5
      6
      other aisle
    
    
      6
      7
      packaged meat
    
    
      7
      8
      bakery desserts
    
    
      8
      9
      pasta sauce
    
    
      9
      10
      kitchen supplies



In [11]:

    
# The aisles.csv and departments.csv files more useful when merged into the products.csv

# Departments appear to represent larger groups, with sub-groups represented by aisle, 
# not unlike a physical store. 

# Personally, I can more easily conceptualize this thusly:
# 'departments' == Categories, and 'aisles' == Sub-Categories.

# Merge Aisle and Department information into Product df

aisle_names              = aisles['aisle']
dept_names               = departments['department']
products['Category']     = dept_names[products
                                      ['department_id'].values-1].values
products['Sub_Category'] = dept_names[products
                                      ['department_id'
                                      ].values-1
                                     ].values + " >> " + aisle_names[products['aisle_id'
                                                                     ].values-1].values

cols = ['product_id',
        'product_name',
        'Category',
        'Sub_Category',
        'aisle_id',
        'department_id',
       ]
products = products[cols]


products.sort_values(by=['Category', 'Sub_Category']).head()









    Out[11]:







  
    
      
      product_id
      product_name
      Category
      Sub_Category
      aisle_id
      department_id
    
  
  
    
      349
      350
      Mixed 12 Pack Lion's Share Ale
      alcohol
      alcohol >> beers coolers
      27
      5
    
    
      378
      379
      Super Dry Beer
      alcohol
      alcohol >> beers coolers
      27
      5
    
    
      429
      430
      Born Yesterday Pale Ale
      alcohol
      alcohol >> beers coolers
      27
      5
    
    
      469
      470
      Lite Beer
      alcohol
      alcohol >> beers coolers
      27
      5
    
    
      735
      736
      Beer, IPA
      alcohol
      alcohol >> beers coolers
      27
      5



In [12]:

    
# Example of missing category / aisle 
products[products['product_id'] == 6728]









    Out[12]:







  
    
      
      product_id
      product_name
      Category
      Sub_Category
      aisle_id
      department_id
    
  
  
    
      6727
      6728
      Fun Size Chocolate Candy
      missing
      missing >> missing aisle
      100
      21

Things to note:

product_id seems to increase proportionally to with department_id | aisle_id (when sorted, low product_ids first, high product_ids last), which might be useful later



In [13]:

    
# Let's list the Category and Aisle counts, grouped alphabetically, arranged hierarchically.

data = {'Product_Count'     : products['Category'].value_counts(),
        '_Product_Count' : products['Sub_Category'].value_counts()
       }

counts_df = pd.DataFrame(data=data).fillna(value='--')
counts_df.index.names = ['Category']
counts_df.sort_index()









    Out[13]:







  
    
      
      Product_Count
      _Product_Count
    
    
      Category
      
      
    
  
  
    
      alcohol
      1054
      --
    
    
      alcohol >> beers coolers
      --
      385
    
    
      alcohol >> red wines
      --
      232
    
    
      alcohol >> specialty wines champagnes
      --
      95
    
    
      alcohol >> spirits
      --
      195
    
    
      alcohol >> white wines
      --
      147
    
    
      babies
      1081
      --
    
    
      babies >> baby accessories
      --
      44
    
    
      babies >> baby bath body care
      --
      132
    
    
      babies >> baby food formula
      --
      718
    
    
      babies >> diapers wipes
      --
      187
    
    
      bakery
      1516
      --
    
    
      bakery >> bakery desserts
      --
      297
    
    
      bakery >> bread
      --
      557
    
    
      bakery >> breakfast bakery
      --
      226
    
    
      bakery >> buns rolls
      --
      195
    
    
      bakery >> tortillas flat bread
      --
      241
    
    
      beverages
      4365
      --
    
    
      beverages >> cocoa drink mixes
      --
      223
    
    
      beverages >> coffee
      --
      680
    
    
      beverages >> energy sports drinks
      --
      294
    
    
      beverages >> juice nectars
      --
      792
    
    
      beverages >> refrigerated
      --
      675
    
    
      beverages >> soft drinks
      --
      463
    
    
      beverages >> tea
      --
      894
    
    
      beverages >> water seltzer sparkling water
      --
      344
    
    
      breakfast
      1115
      --
    
    
      breakfast >> breakfast bars pastries
      --
      173
    
    
      breakfast >> cereal
      --
      454
    
    
      breakfast >> granola
      --
      185
    
    
      ...
      ...
      ...
    
    
      personal care >> first aid
      --
      240
    
    
      personal care >> hair care
      --
      816
    
    
      personal care >> muscles joints pain relief
      --
      172
    
    
      personal care >> oral hygiene
      --
      565
    
    
      personal care >> protein meal replacements
      --
      325
    
    
      personal care >> shave needs
      --
      198
    
    
      personal care >> skin care
      --
      245
    
    
      personal care >> soap
      --
      525
    
    
      personal care >> vitamins supplements
      --
      1038
    
    
      pets
      972
      --
    
    
      pets >> cat food care
      --
      499
    
    
      pets >> dog food care
      --
      473
    
    
      produce
      1684
      --
    
    
      produce >> fresh fruits
      --
      382
    
    
      produce >> fresh herbs
      --
      86
    
    
      produce >> fresh vegetables
      --
      569
    
    
      produce >> packaged produce
      --
      32
    
    
      produce >> packaged vegetables fruits
      --
      615
    
    
      snacks
      6264
      --
    
    
      snacks >> candy chocolate
      --
      1246
    
    
      snacks >> chips pretzels
      --
      989
    
    
      snacks >> cookies cakes
      --
      874
    
    
      snacks >> crackers
      --
      747
    
    
      snacks >> energy granola bars
      --
      832
    
    
      snacks >> fruit vegetable snacks
      --
      356
    
    
      snacks >> ice cream toppings
      --
      85
    
    
      snacks >> mint gum
      --
      168
    
    
      snacks >> nuts seeds dried fruit
      --
      582
    
    
      snacks >> popcorn jerky
      --
      316
    
    
      snacks >> trail mix snack mix
      --
      69
    
  

155 rows × 2 columns



In [14]:

    
# products categorized as'missing' or 'other' account for 1,806 products,
# roughly 3.634% of the 49,688 total products. Not exactly negligible.

counts_df.sort_index()[99:103]









    Out[14]:







  
    
      
      Product_Count
      _Product_Count
    
    
      Category
      
      
    
  
  
    
      missing
      1258
      --
    
    
      missing >> missing aisle
      --
      1258
    
    
      other
      548
      --
    
    
      other >> other aisle
      --
      548



In [15]:

    
f, ax = plt.subplots(figsize=(14,10), ncols=1)

products['Category'].value_counts().plot(kind='bar')
_= ax.set_title('Categories by Total Products', size=22)
_= ax.set_ylabel('Count', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);



In [16]:

    
# Let's plot sub-category (aisle) as a proportion of category (department)

#sns.barplot(x = stacked_bar_data.Group
#products.pivot('','')[].plot(kind='bar', stacked=True)



In [17]:

    
data1 = {'_Product_Count' : products['Sub_Category'].value_counts()
       }
data2 = {'Product_Count' : products['Category'].value_counts()
       }


d1_counts_df = pd.DataFrame(data=data1)
d2_counts_df = pd.DataFrame(data=data2)
data1['_Product_Count'].value_counts().sum()
d2_counts_df['Product_Count']['other']









    Out[17]:





548



In [18]:

    
data3 = { 'No. of Sub-Categories'       : {},
          'Products in Sub-Category'    : {},
          'Parent'                      : {},
          'Products in Parent Category' : {}
        }

for i in range(len(d1_counts_df['_Product_Count'])):
    #print counts_df.index[i], counts_df['_Product_Count'][i]
    match = re.findall(r'([\w ]+)( >> )([\w ]+)', d1_counts_df.index[i])
    parent, child = match[0][0], match[0][2]
    if parent in data3['No. of Sub-Categories']:
        data3['No. of Sub-Categories'][parent]       += 1
        data3['Products in Sub-Category'][child]      = d1_counts_df['_Product_Count'][i]
    else:
        data3['No. of Sub-Categories'][parent]               = 1
        data3['Products in Parent Category'][parent]  = d2_counts_df['Product_Count'][parent]
        data3['Products in Sub-Category'][child]      = d1_counts_df['_Product_Count'][i]
        
        
    if child not in data3['Parent']:
        data3['Parent'][child]                        = parent
        data3['Products in Parent Category'][child]   = d2_counts_df['Product_Count'][parent]


stacked_category_data = pd.DataFrame(data=data3)
stacked_category_data['Parent'
                     ] = stacked_category_data['Parent'
                                              ].fillna('None')
stacked_category_data['Products in Sub-Category'
                     ] = stacked_category_data['Products in Sub-Category'
                                              ].fillna('N/A')

stacked_category_data['No. of Sub-Categories'
                     ] = stacked_category_data['No. of Sub-Categories'
                                              ].fillna('None')

stacked_category_data.sort_values(by=['Parent', 'Products in Parent Category'])[21:40]









    Out[18]:







  
    
      
      No. of Sub-Categories
      Parent
      Products in Parent Category
      Products in Sub-Category
    
  
  
    
      beers coolers
      None
      alcohol
      1054
      385
    
    
      red wines
      None
      alcohol
      1054
      232
    
    
      specialty wines champagnes
      None
      alcohol
      1054
      95
    
    
      spirits
      None
      alcohol
      1054
      195
    
    
      white wines
      None
      alcohol
      1054
      147
    
    
      baby accessories
      None
      babies
      1081
      44
    
    
      baby bath body care
      None
      babies
      1081
      132
    
    
      baby food formula
      None
      babies
      1081
      718
    
    
      diapers wipes
      None
      babies
      1081
      187
    
    
      bakery desserts
      None
      bakery
      1516
      297
    
    
      bread
      None
      bakery
      1516
      557
    
    
      breakfast bakery
      None
      bakery
      1516
      226
    
    
      buns rolls
      None
      bakery
      1516
      195
    
    
      tortillas flat bread
      None
      bakery
      1516
      241
    
    
      cocoa drink mixes
      None
      beverages
      4365
      223
    
    
      coffee
      None
      beverages
      4365
      680
    
    
      energy sports drinks
      None
      beverages
      4365
      294
    
    
      juice nectars
      None
      beverages
      4365
      792
    
    
      refrigerated
      None
      beverages
      4365
      675



In [19]:

    
# Let's plot Sub_Category product groups (aisle) 
# as a proportion of total products in Parent Category (department)


newstacked = stacked_category_data.groupby(['Parent', 
                                            stacked_category_data.index,
                                            'Products in Sub-Category'
                                           ]).size()[21:40]
newstacked









    Out[19]:





Parent                                 Products in Sub-Category
alcohol    beers coolers               385.0                       1
           red wines                   232.0                       1
           specialty wines champagnes  95.0                        1
           spirits                     195.0                       1
           white wines                 147.0                       1
babies     baby accessories            44.0                        1
           baby bath body care         132.0                       1
           baby food formula           718.0                       1
           diapers wipes               187.0                       1
bakery     bakery desserts             297.0                       1
           bread                       557.0                       1
           breakfast bakery            226.0                       1
           buns rolls                  195.0                       1
           tortillas flat bread        241.0                       1
beverages  cocoa drink mixes           223.0                       1
           coffee                      680.0                       1
           energy sports drinks        294.0                       1
           juice nectars               792.0                       1
           refrigerated                675.0                       1
dtype: int64



In [20]:

    
aisles_only = stacked_category_data.sort_values(by=
                                                ['Parent',
                                                 'Products in Parent Category'])[21:]
Top15       = aisles_only.sort_values(by='Products in Sub-Category', 
                                      ascending=False)[:15]
Bottom15    = aisles_only.sort_values(by='Products in Sub-Category', 
                                      ascending=False)[119:]



In [21]:

    
# Plotting top and Bottom 15 Sub-Categories by Product Count

f, ax = plt.subplots(figsize=(14,5))

Top15['Products in Sub-Category'].plot(kind='bar')
_= ax.set_title('Top 15 Sub-Categories by Count', size=22)
_= ax.set_ylabel('Count', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);



In [22]:

    
# Notice that the larger Sub-Categories reflect more vague grouping criteria, 
# e.g. "missing" isn't exactly specific
# which suggest less helpful groupings for making predictions
# as opposed to the smaller sub-categories below

f, ax = plt.subplots(figsize=(14,5))
Bottom15['Products in Sub-Category'].plot(kind='bar')
_= ax.set_title('Bottom 15 Sub-Categories by Count', size=22)
_= ax.set_ylabel('Count', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);



In [93]:

    
# Now let's try to find some category trends. First, let's check out the alcohol category. 

master_set = pd.merge(pd.merge(prior_set, orders, on='order_id', how='right'), products,
                      on='product_id',
                      how='left')
master_set.head()









    Out[93]:







  
    
      
      order_id
      product_id
      add_to_cart_order
      reordered
      user_id
      eval_set
      order_number
      order_dow
      order_hour_of_day
      days_since_prior_order
      product_name
      Category
      Sub_Category
      aisle_id
      department_id
    
  
  
    
      0
      2
      33120
      1.0
      1.0
      202279
      prior
      3
      5
      9
      8.0
      Organic Egg Whites
      dairy eggs
      dairy eggs >> eggs
      86.0
      16.0
    
    
      1
      2
      28985
      2.0
      1.0
      202279
      prior
      3
      5
      9
      8.0
      Michigan Organic Kale
      produce
      produce >> fresh vegetables
      83.0
      4.0
    
    
      2
      2
      9327
      3.0
      0.0
      202279
      prior
      3
      5
      9
      8.0
      Garlic Powder
      pantry
      pantry >> spices seasonings
      104.0
      13.0
    
    
      3
      2
      45918
      4.0
      1.0
      202279
      prior
      3
      5
      9
      8.0
      Coconut Butter
      pantry
      pantry >> oils vinegars
      19.0
      13.0
    
    
      4
      2
      30035
      5.0
      0.0
      202279
      prior
      3
      5
      9
      8.0
      Natural Sweetener
      pantry
      pantry >> baking ingredients
      17.0
      13.0



In [91]:

    
# Let's plot a heatmap of the count of products ordered in the alcohol category by hour and day

category_five = master_set[master_set['department_id'] == 5.0]
cat_heatmap_data = pd.DataFrame(category_five[['order_dow', 'order_hour_of_day']].groupby(['order_dow', 
                                                                                           'order_hour_of_day']
                                                                                         ).size()
                               ).reset_index()
cat_heatmap_data = cat_heatmap_data.pivot(index='order_hour_of_day', 
                                          columns='order_dow', 
                                          values=0)

f, ax = plt.subplots(figsize=(16,12))
_= ax.set_title('Alcohol Category:\n Time-Series Trend', size=22)
_= ax.set_ylabel('Hour of Day', size=20, labelpad=15)
_= ax.set_xlabel('Day of Week', size=20, labelpad=15)
_= ax.tick_params(labelsize=16)
sns.heatmap(cat_heatmap_data, ax=ax, annot=True, fmt="d");



In [99]:

    
bananas = master_set[master_set['product_id'] == 24852]
cat_heatmap_data = pd.DataFrame(bananas[['order_dow', 'order_hour_of_day']].groupby(['order_dow', 
                                                                                           'order_hour_of_day']
                                                                                         ).size()
                               ).reset_index()
cat_heatmap_data = cat_heatmap_data.pivot(index='order_hour_of_day', 
                                          columns='order_dow', 
                                          values=0)

f, ax = plt.subplots(figsize=(16,12))
_= ax.set_title('Organic Bananas:\n Time-Series Trend', size=22)
_= ax.set_ylabel('Hour of Day', size=20, labelpad=15)
_= ax.set_xlabel('Day of Week', size=20, labelpad=15)
_= ax.tick_params(labelsize=16)
sns.heatmap(cat_heatmap_data, ax=ax, annot=True, fmt="d");

	product_id	product_name	aisle_id	department_id
0	1	Chocolate Sandwich Cookies	61	19
1	2	All-Seasons Salt	104	13
2	3	Robust Golden Unsweetened Oolong Tea	94	7
3	4	Smart Ones Classic Favorites Mini Rigatoni Wit...	38	1
4	5	Green Chile Anytime Sauce	5	13

	aisle_id	aisle
0	1	prepared soups salads
1	2	specialty cheeses
2	3	energy granola bars
3	4	instant foods
4	5	marinades meat preparation
5	6	other aisle
6	7	packaged meat
7	8	bakery desserts
8	9	pasta sauce
9	10	kitchen supplies

	product_id	product_name	Category	Sub_Category	aisle_id	department_id
349	350	Mixed 12 Pack Lion's Share Ale	alcohol	alcohol >> beers coolers	27	5
378	379	Super Dry Beer	alcohol	alcohol >> beers coolers	27	5
429	430	Born Yesterday Pale Ale	alcohol	alcohol >> beers coolers	27	5
469	470	Lite Beer	alcohol	alcohol >> beers coolers	27	5
735	736	Beer, IPA	alcohol	alcohol >> beers coolers	27	5

	Product_Count	_Product_Count
Category
alcohol	1054	--
alcohol >> beers coolers	--	385
alcohol >> red wines	--	232
alcohol >> specialty wines champagnes	--	95
alcohol >> spirits	--	195
alcohol >> white wines	--	147
babies	1081	--
babies >> baby accessories	--	44
babies >> baby bath body care	--	132
babies >> baby food formula	--	718
babies >> diapers wipes	--	187
bakery	1516	--
bakery >> bakery desserts	--	297
bakery >> bread	--	557
bakery >> breakfast bakery	--	226
bakery >> buns rolls	--	195
bakery >> tortillas flat bread	--	241
beverages	4365	--
beverages >> cocoa drink mixes	--	223
beverages >> coffee	--	680
beverages >> energy sports drinks	--	294
beverages >> juice nectars	--	792
beverages >> refrigerated	--	675
beverages >> soft drinks	--	463
beverages >> tea	--	894
beverages >> water seltzer sparkling water	--	344
breakfast	1115	--
breakfast >> breakfast bars pastries	--	173
breakfast >> cereal	--	454
breakfast >> granola	--	185
...	...	...
personal care >> first aid	--	240
personal care >> hair care	--	816
personal care >> muscles joints pain relief	--	172
personal care >> oral hygiene	--	565
personal care >> protein meal replacements	--	325
personal care >> shave needs	--	198
personal care >> skin care	--	245
personal care >> soap	--	525
personal care >> vitamins supplements	--	1038
pets	972	--
pets >> cat food care	--	499
pets >> dog food care	--	473
produce	1684	--
produce >> fresh fruits	--	382
produce >> fresh herbs	--	86
produce >> fresh vegetables	--	569
produce >> packaged produce	--	32
produce >> packaged vegetables fruits	--	615
snacks	6264	--
snacks >> candy chocolate	--	1246
snacks >> chips pretzels	--	989
snacks >> cookies cakes	--	874
snacks >> crackers	--	747
snacks >> energy granola bars	--	832
snacks >> fruit vegetable snacks	--	356
snacks >> ice cream toppings	--	85
snacks >> mint gum	--	168
snacks >> nuts seeds dried fruit	--	582
snacks >> popcorn jerky	--	316
snacks >> trail mix snack mix	--	69

	Product_Count	_Product_Count
Category
missing	1258	--
missing >> missing aisle	--	1258
other	548	--
other >> other aisle	--	548

	No. of Sub-Categories	Parent	Products in Parent Category	Products in Sub-Category
beers coolers	None	alcohol	1054	385
red wines	None	alcohol	1054	232
specialty wines champagnes	None	alcohol	1054	95
spirits	None	alcohol	1054	195
white wines	None	alcohol	1054	147
baby accessories	None	babies	1081	44
baby bath body care	None	babies	1081	132
baby food formula	None	babies	1081	718
diapers wipes	None	babies	1081	187
bakery desserts	None	bakery	1516	297
bread	None	bakery	1516	557
breakfast bakery	None	bakery	1516	226
buns rolls	None	bakery	1516	195
tortillas flat bread	None	bakery	1516	241
cocoa drink mixes	None	beverages	4365	223
coffee	None	beverages	4365	680
energy sports drinks	None	beverages	4365	294
juice nectars	None	beverages	4365	792
refrigerated	None	beverages	4365	675

	order_id	product_id	add_to_cart_order	reordered	user_id	eval_set	order_number	order_dow	order_hour_of_day	days_since_prior_order	product_name	Category	Sub_Category	aisle_id	department_id
0	2	33120	1.0	1.0	202279	prior	3	5	9	8.0	Organic Egg Whites	dairy eggs	dairy eggs >> eggs	86.0	16.0
1	2	28985	2.0	1.0	202279	prior	3	5	9	8.0	Michigan Organic Kale	produce	produce >> fresh vegetables	83.0	4.0
2	2	9327	3.0	0.0	202279	prior	3	5	9	8.0	Garlic Powder	pantry	pantry >> spices seasonings	104.0	13.0
3	2	45918	4.0	1.0	202279	prior	3	5	9	8.0	Coconut Butter	pantry	pantry >> oils vinegars	19.0	13.0
4	2	30035	5.0	0.0	202279	prior	3	5	9	8.0	Natural Sweetener	pantry	pantry >> baking ingredients	17.0	13.0