In [108]:
import sklearn
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
In [118]:
# Directory holding the Instacart CSV exports. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = '~/kaggle-competitions/Instacart/data'

# Load each raw table into its own DataFrame.
aisles = pd.read_csv(DATA_DIR + '/aisles.csv')
departments = pd.read_csv(DATA_DIR + '/departments.csv')
order_products_prior = pd.read_csv(DATA_DIR + '/order_products__prior.csv')
order_products_train = pd.read_csv(DATA_DIR + '/order_products__train.csv')
orders = pd.read_csv(DATA_DIR + '/orders.csv')
products = pd.read_csv(DATA_DIR + '/products.csv')
In [138]:
# Display the departments table loaded from departments.csv.
departments
Out[138]:
In [3]:
def hours_to_moment(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Parameters
    ----------
    hour : object
        Anything ``int()`` can convert (int, float, numeric string, ...).

    Returns
    -------
    str
        ``'morning'`` for hours 7-11, ``'afternoon'`` for hours 12-18,
        ``'night'`` for every other integer, and ``'tbd'`` when ``hour``
        cannot be converted to an integer.
    """
    try:
        hour = int(hour)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures (None, non-numeric text, NaN, infinity)
        # fall back to the placeholder; anything else, e.g. a
        # KeyboardInterrupt, is allowed to propagate.
        return 'tbd'

    if 6 < hour < 12:
        return 'morning'
    if 12 <= hour < 19:
        return 'afternoon'
    return 'night'
In [54]:
# Label every order with its part of the day, derived from the order hour.
order_hours = orders['order_hour_of_day']
orders['moments'] = order_hours.apply(hours_to_moment)
In [61]:
# Per user: a Counter of the order_dow values seen across 'prior' orders.
prior_dow_by_user = orders.loc[orders['eval_set'] == 'prior'].groupby('user_id')['order_dow']
user_counter_dow_prior = (
    prior_dow_by_user
    .apply(list)
    .apply(Counter)
    .to_frame(name='dow')
)
In [62]:
# Per user: number of 'prior' orders (count of non-null order_dow entries).
prior_orders_by_user = orders.loc[orders['eval_set'] == 'prior'].groupby('user_id')
user_nb_order_prior = prior_orders_by_user['order_dow'].count().to_frame(name='nb_order')
In [63]:
# Per user: a Counter of the part-of-day labels seen across 'prior' orders.
prior_moments_by_user = orders.loc[orders['eval_set'] == 'prior'].groupby('user_id')['moments']
user_counter_moment_prior = (
    prior_moments_by_user
    .apply(list)
    .apply(Counter)
    .to_frame(name='moments')
)
# Legacy assembly of the per-user table under the name `users`.
# NOTE(review): the original lines fused two statements per line and read from
# `counter_dow`, `nb_order` and `counter_moment`, none of which is defined in
# this notebook. They are rebuilt here from the per-user prior-order frames
# computed above -- confirm this matches the original intent.
user_counter_dow = pd.DataFrame(
    data={'dow': user_counter_dow_prior['dow'].values},
    index=user_counter_dow_prior.index,
)
user_nb_order = pd.DataFrame(
    data={'#order': user_nb_order_prior['nb_order'].values},
    index=user_nb_order_prior.index,
)
user_counter_moment = pd.DataFrame(
    data={'moments': user_counter_moment_prior['moments'].values},
    index=user_counter_moment_prior.index,
)
users = (
    user_counter_dow
    .join(user_nb_order, how='outer')
    .join(user_counter_moment, how='outer')
)
In [64]:
# Put the three per-user prior-order frames side by side, aligned on their index.
prior_user_frames = [
    user_counter_dow_prior,
    user_nb_order_prior,
    user_counter_moment_prior,
]
users_prior = pd.concat(prior_user_frames, axis=1)
In [65]:
# Number of distinct parts of the day in which each user ordered.
# Read from users_prior itself rather than the separate `users` frame, so the
# cell does not depend on stale state and stays aligned with users_prior.
users_prior['nb_moments'] = users_prior['moments'].apply(len)
In [66]:
# Number of distinct days of the week on which each user ordered.
# Read from users_prior itself rather than the separate `users` frame, so the
# cell does not depend on stale state and stays aligned with users_prior.
users_prior['nb_dow'] = users_prior['dow'].apply(len)
# Users whose prior orders all fall on one day of the week and in one part of the day.
# The distinct-value counts are stored in users_prior as 'nb_dow' and
# 'nb_moments'; the '#dow' / '#moments' columns used before are never created.
users_prior[(users_prior['nb_dow'] == 1) & (users_prior['nb_moments'] == 1)]
In [81]:
# Preview the first five rows of the per-user feature table.
users_prior.head(5)
Out[81]:
In [131]:
# Attach each ordered product's department.
# DataFrame.join(on=...) matches the caller's column against the *index* of the
# other object. Joining against products['department_id'] directly would match
# product_id against the positional row index and assign the wrong department,
# so the lookup is indexed by product_id first.
# NOTE(review): assumes products has a unique 'product_id' column -- confirm.
department_by_product = products.set_index('product_id')['department_id']
order_prior_enriched = order_products_prior.join(
    department_by_product,
    on='product_id',
    how='left',
)
In [137]:
# Department assigned to rows whose product had no match in the join.
# NOTE(review): presumably 21 is the "missing" department -- confirm against departments.
FALLBACK_DEPARTMENT_ID = 21

department_ids = order_prior_enriched['department_id'].fillna(value=FALLBACK_DEPARTMENT_ID)
order_prior_enriched['department_id'] = department_ids.astype('int64')
In [150]:
# Number of distinct departments in each order, repeated on every row of that order.
# transform('nunique') returns a result aligned with the original rows, so row
# order is preserved. assign() overwrites the column if it already exists, so
# the cell can be re-run without a column-overlap error.
order_prior_enriched = order_prior_enriched.assign(
    nb_unique_department=(
        order_prior_enriched
        .groupby('order_id')['department_id']
        .transform('nunique')
    )
)
In [157]:
# Bring the per-order department count onto the orders table.
# order_prior_enriched has one row per ordered product, so its index is not
# order_id. Joining on='order_id' against it would pick up the value at an
# arbitrary row position. Collapse to one value per order_id first (the count
# is constant within an order, so first() is sufficient).
nb_department_by_order = (
    order_prior_enriched
    .groupby('order_id')['nb_unique_department']
    .first()
)
# Drop any earlier copy of the column so the cell can be re-run safely.
orders = (
    orders
    .drop(columns='nb_unique_department', errors='ignore')
    .join(nb_department_by_order, on='order_id', how='left')
)
In [158]:
# Per user: a Counter of the distinct-department counts over 'prior' orders.
prior_department_counts = orders.loc[orders['eval_set'] == 'prior'].groupby('user_id')['nb_unique_department']
(
    prior_department_counts
    .apply(list)
    .apply(Counter)
    .to_frame(name='number_department')
)
Out[158]:
In [159]:
# Per user: the largest distinct-department count over 'prior' orders.
# (The trailing '.' after max() was a syntax error and has been removed.)
orders[orders.eval_set == 'prior'].groupby('user_id')['nb_unique_department'].max()
Out[159]:
In [173]:
# Output column name -> pandas aggregation name for the per-user department stats.
# String names are used instead of numpy callables; 'std' is the groupby sample
# standard deviation.
f = {
    'mean_nb_department': 'mean',
    'max_nb_department': 'max',
    'min_nb_department': 'min',
    'std_nb_department': 'std',
}

# Passing a renaming dict straight to SeriesGroupBy.agg is not supported by
# current pandas, so aggregate with a list of names and rename the resulting
# columns afterwards.
user_dep_prior = (
    orders[orders.eval_set == 'prior']
    .groupby('user_id')['nb_unique_department']
    .agg(list(f.values()))
    .rename(columns={agg_name: column for column, agg_name in f.items()})
)
In [174]:
# Append the per-user department statistics as extra columns, aligned on the index.
frames_to_combine = [users_prior, user_dep_prior]
users_prior = pd.concat(frames_to_combine, axis=1)
In [177]:
# Preview the first five rows of the per-user table after adding department statistics.
users_prior.head(5)
Out[177]:
In [ ]: