In [ ]:
In [2]:
import pandas as pd
import numpy as np
In [3]:
# Execute readData.py with -i, i.e. inside this notebook's own namespace, so the
# names it creates stay available to later cells. The script prints one
# "Shape <name>: (rows, cols)" line per dataset it loads.
# NOTE(review): readData.py is not shown here; later cells assume it defines
# aisles, departments, order_products__prior, order_products__train, orders
# and products as DataFrames — confirm against the script.
%run -i readData.py
Shape aisles: (134, 2)
Shape departments: (21, 2)
Shape order_products__prior: (32434489, 4)
Shape order_products__train: (1384617, 4)
Shape orders: (3421083, 7)
Shape products: (49688, 4)
In [14]:
# Gather every loaded table into a single list so that later cells can loop
# over all of them in a fixed order.
allDsCombine = [
    aisles,
    departments,
    order_products__prior,
    order_products__train,
    orders,
    products,
]
In [31]:
# Preview the first 10 rows of every dataset in allDsCombine.
# display() is used instead of print(): print() falls back to the plain-text
# repr, which splits wide frames into several wrapped column blocks, whereas
# display() renders each frame as a single rich table in the notebook.
for ds in allDsCombine:
    display(ds.head(n=10))
    # Visual separator between consecutive datasets.
    print("-"*100)
aisle_id aisle
0 1 prepared soups salads
1 2 specialty cheeses
2 3 energy granola bars
3 4 instant foods
4 5 marinades meat preparation
5 6 other
6 7 packaged meat
7 8 bakery desserts
8 9 pasta sauce
9 10 kitchen supplies
10 11 cold flu allergy
11 12 fresh pasta
12 13 prepared meals
13 14 tofu meat alternatives
14 15 packaged seafood
15 16 fresh herbs
16 17 baking ingredients
17 18 bulk dried fruits vegetables
18 19 oils vinegars
19 20 oral hygiene
----------------------------------------------------------------------------------------------------
department_id department
0 1 frozen
1 2 other
2 3 bakery
3 4 produce
4 5 alcohol
5 6 international
6 7 beverages
7 8 pets
8 9 dry goods pasta
9 10 bulk
10 11 personal care
11 12 meat seafood
12 13 pantry
13 14 breakfast
14 15 canned goods
15 16 dairy eggs
16 17 household
17 18 babies
18 19 snacks
19 20 deli
----------------------------------------------------------------------------------------------------
order_id product_id add_to_cart_order reordered
0 2 33120 1 1
1 2 28985 2 1
2 2 9327 3 0
3 2 45918 4 1
4 2 30035 5 0
5 2 17794 6 1
6 2 40141 7 1
7 2 1819 8 1
8 2 43668 9 0
9 3 33754 1 1
10 3 24838 2 1
11 3 17704 3 1
12 3 21903 4 1
13 3 17668 5 1
14 3 46667 6 1
15 3 17461 7 1
16 3 32665 8 1
17 4 46842 1 0
18 4 26434 2 1
19 4 39758 3 1
----------------------------------------------------------------------------------------------------
order_id product_id add_to_cart_order reordered
0 1 49302 1 1
1 1 11109 2 1
2 1 10246 3 0
3 1 49683 4 0
4 1 43633 5 1
5 1 13176 6 0
6 1 47209 7 0
7 1 22035 8 1
8 36 39612 1 0
9 36 19660 2 1
10 36 49235 3 0
11 36 43086 4 1
12 36 46620 5 1
13 36 34497 6 1
14 36 48679 7 1
15 36 46979 8 1
16 38 11913 1 0
17 38 18159 2 0
18 38 4461 3 0
19 38 21616 4 1
----------------------------------------------------------------------------------------------------
order_id user_id eval_set order_number order_dow order_hour_of_day \
0 2539329 1 prior 1 2 8
1 2398795 1 prior 2 3 7
2 473747 1 prior 3 3 12
3 2254736 1 prior 4 4 7
4 431534 1 prior 5 4 15
5 3367565 1 prior 6 2 7
6 550135 1 prior 7 1 9
7 3108588 1 prior 8 1 14
8 2295261 1 prior 9 1 16
9 2550362 1 prior 10 4 8
10 1187899 1 train 11 4 8
11 2168274 2 prior 1 2 11
12 1501582 2 prior 2 5 10
13 1901567 2 prior 3 1 10
14 738281 2 prior 4 2 10
15 1673511 2 prior 5 3 11
16 1199898 2 prior 6 2 9
17 3194192 2 prior 7 2 12
18 788338 2 prior 8 1 15
19 1718559 2 prior 9 2 9
days_since_prior_order
0 NaN
1 15.0
2 21.0
3 29.0
4 28.0
5 19.0
6 20.0
7 14.0
8 0.0
9 30.0
10 14.0
11 NaN
12 10.0
13 3.0
14 8.0
15 8.0
16 13.0
17 14.0
18 27.0
19 8.0
----------------------------------------------------------------------------------------------------
product_id product_name aisle_id \
0 1 Chocolate Sandwich Cookies 61
1 2 All-Seasons Salt 104
2 3 Robust Golden Unsweetened Oolong Tea 94
3 4 Smart Ones Classic Favorites Mini Rigatoni Wit... 38
4 5 Green Chile Anytime Sauce 5
5 6 Dry Nose Oil 11
6 7 Pure Coconut Water With Orange 98
7 8 Cut Russet Potatoes Steam N' Mash 116
8 9 Light Strawberry Blueberry Yogurt 120
9 10 Sparkling Orange Juice & Prickly Pear Beverage 115
10 11 Peach Mango Juice 31
11 12 Chocolate Fudge Layer Cake 119
12 13 Saline Nasal Mist 11
13 14 Fresh Scent Dishwasher Cleaner 74
14 15 Overnight Diapers Size 6 56
15 16 Mint Chocolate Flavored Syrup 103
16 17 Rendered Duck Fat 35
17 18 Pizza for One Suprema Frozen Pizza 79
18 19 Gluten Free Quinoa Three Cheese & Mushroom Blend 63
19 20 Pomegranate Cranberry & Aloe Vera Enrich Drink 98
department_id
0 19
1 13
2 7
3 1
4 13
5 11
6 7
7 1
8 16
9 7
10 7
11 1
12 11
13 17
14 18
15 19
16 12
17 1
18 9
19 7
----------------------------------------------------------------------------------------------------
In [60]:
#count number of data set sizes
# Count how many orders fall into each eval_set value.
orders.groupby('eval_set').size()
Out[60]:
eval_set
prior 3214874
test 75000
train 131209
dtype: int64
Find out whether there are any products from the train sample that have never been bought before
In [68]:
# Products that occur in the train orders but in no prior order.
# The previous expression never looked at order_products__train, so it listed
# every catalogue product missing from the prior orders, whether or not it is
# part of the train sample. Both membership tests are now combined.
products[
    products['product_id'].isin(order_products__train['product_id'])
    & ~products['product_id'].isin(order_products__prior['product_id'])
]
Out[68]:
product_id
product_name
aisle_id
department_id
3629
3630
Protein Granola Apple Crisp
57
14
3717
3718
Wasabi Cheddar Spreadable Cheese
21
16
7044
7045
Unpeeled Apricot Halves in Heavy Syrup
88
13
25382
25383
Chocolate Go Bites
61
19
27498
27499
Non-Dairy Coconut Seven Layer Bar
100
21
36232
36233
Water With Electrolytes
100
21
37702
37703
Ultra Sun Blossom Liquid 90 loads Fabric Enhan...
75
17
43724
43725
Sweetart Jelly Beans
100
21
45970
45971
12 Inch Taper Candle White
101
17
46624
46625
Single Barrel Kentucky Straight Bourbon Whiskey
31
7
49539
49540
Pure Squeezed Lemonade
31
7
In [ ]:
Content source: jakobbs/instacart
Similar notebooks: