In [108]:
import sklearn
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

Datasets import


In [118]:
aisles               =      pd.read_csv('~/kaggle-competitions/Instacart/data/aisles.csv')
departments          = pd.read_csv('~/kaggle-competitions/Instacart/data/departments.csv')
order_products_prior = pd.read_csv('~/kaggle-competitions/Instacart/data/order_products__prior.csv')
order_products_train = pd.read_csv('~/kaggle-competitions/Instacart/data/order_products__train.csv')
orders               = pd.read_csv('~/kaggle-competitions/Instacart/data/orders.csv')
products             = pd.read_csv('~/kaggle-competitions/Instacart/data/products.csv')

Cleaning Datasets


In [138]:
departments


Out[138]:
department_id department
0 1 frozen
1 2 other
2 3 bakery
3 4 produce
4 5 alcohol
5 6 international
6 7 beverages
7 8 pets
8 9 dry goods pasta
9 10 bulk
10 11 personal care
11 12 meat seafood
12 13 pantry
13 14 breakfast
14 15 canned goods
15 16 dairy eggs
16 17 household
17 18 babies
18 19 snacks
19 20 deli
20 21 missing

Functions


In [3]:
def hours_to_moment(hour):
    try : 
        hour = int(hour)
        if hour>6 and hour<12:
            return 'morning'
        elif hour>=12 and hour<19:
            return 'afternoon'
        else : 
            return 'night'
    except : 
        return 'tbd'

In [54]:
orders['moments']= orders.order_hour_of_day.apply(hours_to_moment)

In [61]:
user_counter_dow_prior = orders[orders.eval_set=='prior'].groupby('user_id')['order_dow'].apply(list).apply(Counter).to_frame(name='dow')

In [62]:
user_nb_order_prior = orders[orders.eval_set=='prior'].groupby('user_id')['order_dow'].count().to_frame(name='nb_order')

In [63]:
user_counter_moment_prior = orders[orders.eval_set=='prior'].groupby('user_id')['moments'].apply(list).apply(Counter).to_frame(name='moments')

d = {'dow':counter_dow.values} user_counter_dow = pd.DataFrame(data = d,index =counter_dow.index)

d = {'#order':nb_order.values} user_nb_order = pd.DataFrame(data = d,index =nb_order.index)

d = {'moments':counter_moment.values} user_counter_moment = pd.DataFrame(data = d,index =counter_moment.index)

users = user_counter_dow.join(user_nb_order, how='outer').join(user_counter_moment, how='outer')


In [64]:
users_prior = pd.concat([user_counter_dow_prior,user_nb_order_prior,user_counter_moment_prior],axis=1)

In [65]:
users_prior['nb_moments'] = users.moments.apply(len)

In [66]:
users_prior['nb_dow'] = users.dow.apply(len)

users[(users['#dow']==1) & (users['#moments']==1)]


In [81]:
users_prior.head()


Out[81]:
dow nb_order moments nb_moments nb_dow
user_id
1 {2: 2, 3: 2, 4: 3, 1: 3} 10 {'morning': 6, 'afternoon': 4} 2 4
2 {2: 5, 5: 1, 1: 5, 3: 2, 4: 1} 14 {'morning': 12, 'afternoon': 2} 2 5
3 {1: 2, 3: 3, 2: 1, 0: 6} 12 {'afternoon': 11, 'night': 1} 2 4
4 {6: 1, 4: 2, 5: 2} 5 {'morning': 2, 'afternoon': 3} 2 3
5 {3: 2, 0: 1, 1: 1} 4 {'afternoon': 4} 1 3

In [131]:
order_prior_enriched = order_products_prior.join(products['department_id'],on='product_id', how='left')

In [137]:
order_prior_enriched['department_id'] = order_prior_enriched['department_id'].fillna(value=21).apply(int)

In [150]:
order_prior_enriched = order_prior_enriched.join(order_prior_enriched.groupby('order_id')['department_id'].nunique().to_frame(name='nb_unique_department'),how='outer',on='order_id')

In [157]:
orders = orders.join(order_prior_enriched['nb_unique_department'],on='order_id',how='left')

In [158]:
orders[orders.eval_set=='prior'].groupby('user_id')['nb_unique_department'].apply(list).apply(Counter).to_frame(name='number_department')


Out[158]:
number_department
user_id
1 {11: 2, 6: 3, 7: 2, 12: 1, 2: 1, 9: 1}
2 {12: 2, 7: 2, 11: 1, 13: 1, 6: 2, 4: 1, 9: 2, ...
3 {5: 1, 11: 2, 9: 3, 8: 4, 3: 1, 14: 1}
4 {1: 1, 10: 1, 7: 1, 9: 1, 3: 1}
5 {9: 1, 14: 1, 11: 1, 8: 1}
6 {13: 1, 9: 1, 11: 1}
7 {7: 6, 13: 1, 15: 1, 2: 3, 9: 3, 3: 1, 11: 1, ...
8 {7: 1, 2: 1, 9: 1}
9 {12: 1, 9: 1, 3: 1}
10 {12: 1, 6: 1, 8: 1, 5: 1, 7: 1}
11 {9: 1, 8: 2, 3: 1, 4: 1, 10: 1, 12: 1}
12 {6: 1, 4: 1, 12: 2, 13: 1}
13 {5: 2, 14: 1, 6: 2, 4: 1, 11: 3, 8: 1, 12: 1, ...
14 {6: 4, 5: 1, 14: 1, 2: 2, 10: 1, 12: 1, 13: 2,...
15 {4: 1, 9: 4, 13: 2, 11: 2, 7: 1, 3: 2, 10: 3, ...
16 {8: 1, 13: 1, 7: 2, 12: 1, 9: 1}
17 {13: 3, 7: 6, 10: 2, 5: 2, 9: 4, 12: 3, 3: 2, ...
18 {4: 2, 9: 1, 10: 1, 12: 1, 6: 1}
19 {6: 2, 15: 1, 4: 1, 8: 2, 12: 2, 14: 1}
20 {6: 2, 9: 1, 15: 1}
21 {7: 5, 14: 1, 5: 2, 8: 6, 6: 5, 13: 2, 9: 1, 4...
22 {12: 2, 11: 1, 4: 2, 5: 1, 7: 3, 8: 1, 6: 2, 9...
23 {13: 1, 8: 2, 12: 1}
24 {14: 1, 12: 2, 3: 2, 5: 1, 8: 3, 10: 2, 11: 3,...
25 {12: 1, 5: 1, 10: 1}
26 {6: 1, 11: 2, 4: 1, 13: 1, 7: 2, 9: 2, 8: 1, 2...
27 {12: 5, 2: 2, 5: 13, 8: 7, 10: 12, 15: 2, 6: 6...
28 {5: 3, 9: 4, 12: 3, 11: 3, 10: 3, 8: 2, 3: 1, ...
29 {4: 5, 12: 1, 6: 2, 9: 3, 10: 2, 13: 1, 5: 1, ...
30 {6: 1, 12: 1, 4: 1, 9: 1, 5: 1, 7: 2, 11: 1}
... ...
206180 {8: 3, 4: 1, 5: 1, 14: 1, 13: 1, 6: 1, 7: 2, 1...
206181 {6: 2, 14: 1, 10: 4, 9: 2, 11: 1, 8: 2, 12: 1,...
206182 {8: 2, 14: 1, 15: 2, 11: 1, 7: 1, 3: 1, 9: 2}
206183 {11: 2, 6: 1, 9: 3, 10: 1, 13: 3, 7: 1, 5: 1, ...
206184 {10: 1, 5: 1, 13: 1, 3: 1}
206185 {4: 3, 9: 2, 12: 1, 11: 1, 7: 1, 6: 1, 14: 1}
206186 {3: 1, 7: 1, 13: 1}
206187 {11: 1, 8: 10, 13: 4, 9: 3, 14: 1, 6: 5, 12: 3...
206188 {10: 3, 11: 1, 5: 1, 8: 1, 9: 1}
206189 {6: 1, 4: 3, 7: 2}
206190 {3: 1, 11: 1, 4: 1, 2: 1, 7: 1, 12: 1, 10: 1, ...
206191 {13: 1, 9: 1, 8: 1, 7: 1, 12: 1}
206192 {12: 3, 11: 2, 8: 4, 7: 1, 14: 2, 13: 1, 9: 1}
206193 {4: 2, 10: 2, 9: 5, 6: 8, 7: 4, 18: 1, 5: 4, 1...
206194 {10: 3, 12: 1, 9: 2, 3: 1, 13: 1, 7: 2, 5: 1}
206195 {7: 6, 8: 2, 10: 2, 11: 3, 5: 1, 16: 1, 9: 1, ...
206196 {8: 1, 12: 1, 11: 2}
206197 {12: 3, 14: 3, 10: 4, 8: 2, 7: 4, 9: 2, 5: 1, ...
206198 {7: 1, 6: 1, 10: 1, 4: 1, 5: 1, 9: 1, 13: 1}
206199 {7: 2, 14: 1, 9: 5, 12: 4, 3: 2, 5: 1, 8: 2, 6...
206200 {14: 2, 10: 4, 5: 3, 7: 2, 8: 3, 11: 2, 13: 2,...
206201 {7: 1, 9: 7, 10: 4, 13: 2, 16: 1, 12: 2, 4: 2,...
206202 {10: 4, 7: 4, 15: 2, 9: 2, 11: 1, 6: 2, 8: 2, ...
206203 {3: 1, 12: 1, 5: 1, 8: 2}
206204 {7: 1, 13: 1, 5: 1, 12: 1}
206205 {10: 1, 13: 1, 8: 1}
206206 {13: 3, 8: 7, 6: 9, 7: 8, 17: 2, 10: 8, 3: 4, ...
206207 {14: 3, 9: 3, 12: 1, 11: 4, 10: 2, 3: 2, 7: 1}
206208 {7: 7, 11: 4, 10: 7, 5: 4, 8: 4, 12: 7, 13: 2,...
206209 {16: 1, 9: 3, 7: 2, 8: 2, 10: 2, 13: 1, 6: 1, ...

206209 rows × 1 columns


In [159]:
orders[orders.eval_set=='prior'].groupby('user_id')['nb_unique_department'].max().


Out[159]:
max_number_department
user_id
1 12
2 14
3 14
4 10
5 14
6 13
7 15
8 9
9 12
10 12
11 12
12 13
13 16
14 14
15 16
16 13
17 13
18 12
19 15
20 15
21 15
22 12
23 13
24 14
25 12
26 14
27 15
28 13
29 13
30 12
... ...
206180 15
206181 14
206182 15
206183 15
206184 13
206185 14
206186 13
206187 14
206188 11
206189 7
206190 12
206191 13
206192 14
206193 18
206194 13
206195 16
206196 12
206197 14
206198 13
206199 14
206200 15
206201 17
206202 15
206203 12
206204 13
206205 13
206206 17
206207 14
206208 15
206209 17

206209 rows × 1 columns


In [173]:
f = {'mean_nb_department' :np.mean,'max_nb_department' :np.max, 'min_nb_department':np.min, 'std_nb_departmenp.std()}
user_dep_prior = orders[orders.eval_set=='prior'].groupby('user_id')['nb_unique_department'].agg(f)


/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  

In [174]:
users_prior = pd.concat([users_prior,user_dep_prior],axis=1)

In [177]:
users_prior.head()


Out[177]:
dow nb_order moments nb_moments nb_dow mean_nb_department max_nb_department min_nb_department
user_id
1 {2: 2, 3: 2, 4: 3, 1: 3} 10 {'morning': 6, 'afternoon': 4} 2 4 7.700000 12 2
2 {2: 5, 5: 1, 1: 5, 3: 2, 4: 1} 14 {'morning': 12, 'afternoon': 2} 2 5 9.071429 14 3
3 {1: 2, 3: 3, 2: 1, 0: 6} 12 {'afternoon': 11, 'night': 1} 2 4 8.583333 14 3
4 {6: 1, 4: 2, 5: 2} 5 {'morning': 2, 'afternoon': 3} 2 3 6.000000 10 1
5 {3: 2, 0: 1, 1: 1} 4 {'afternoon': 4} 1 3 10.500000 14 8

In [ ]: