In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
aisles = pd.read_csv('data/aisles.csv', index_col='aisle_id').to_dict()['aisle']
depart = pd.read_csv('data/departments.csv', index_col='department_id').to_dict()['department']
products_df = pd.read_csv('data/products.csv', index_col='product_id')
products_df['aisle'] = products_df.aisle_id.map(lambda x: aisles[x])
products_df['department'] = products_df.department_id.map(lambda x: depart[x])
products_df.drop(['aisle_id', 'department_id'], axis=1, inplace=True)

In [5]:
prior_df = pd.read_csv('data/order_products__prior.csv', index_col='order_id')
train_df = pd.read_csv('data/order_products__train.csv', index_col='order_id')
orders_df = pd.read_csv('data/orders.csv', index_col='order_id')

In [6]:
gb1 = orders_df.groupby(orders_df.user_id).days_since_prior_order.aggregate(np.sum)
data = gb1.value_counts()
plt.figure(figsize=(200,20))
sns.barplot(data.index, data.values)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ff34f50>

In [7]:
sns.countplot(orders_df.eval_set)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x110eabfd0>

In [38]:
hdf = orders_df.groupby(['order_dow', 'order_hour_of_day']).size().reset_index().pivot("order_dow", "order_hour_of_day", 0)
sns.heatmap(hdf)


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1148cd410>

In [73]:
s = prior_df[prior_df.reordered==1].groupby('product_id').size()
s = s.order(ascending=False)
s /= sum(s)


/Users/asydorchuk/miniconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: order is deprecated, use sort_values(...)
  from ipykernel import kernelapp as app

In [102]:
orders_df.order_hour_of_day


Out[102]:
user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
order_id
2539329 1 prior 1 2 8 NaN
2398795 1 prior 2 3 7 15.0
473747 1 prior 3 3 12 21.0
2254736 1 prior 4 4 7 29.0
431534 1 prior 5 4 15 28.0
3367565 1 prior 6 2 7 19.0
550135 1 prior 7 1 9 20.0
3108588 1 prior 8 1 14 14.0
2295261 1 prior 9 1 16 0.0
2550362 1 prior 10 4 8 30.0
1187899 1 train 11 4 8 14.0
2168274 2 prior 1 2 11 NaN
1501582 2 prior 2 5 10 10.0
1901567 2 prior 3 1 10 3.0
738281 2 prior 4 2 10 8.0
1673511 2 prior 5 3 11 8.0
1199898 2 prior 6 2 9 13.0
3194192 2 prior 7 2 12 14.0
788338 2 prior 8 1 15 27.0
1718559 2 prior 9 2 9 8.0
1447487 2 prior 10 1 11 6.0
1402090 2 prior 11 1 10 30.0
3186735 2 prior 12 1 9 28.0
3268552 2 prior 13 4 11 30.0
839880 2 prior 14 3 10 13.0
1492625 2 train 15 1 11 30.0
1374495 3 prior 1 1 14 NaN
444309 3 prior 2 3 19 9.0
3002854 3 prior 3 3 16 21.0
2037211 3 prior 4 2 18 20.0
... ... ... ... ... ... ...
2789700 206208 prior 35 3 22 4.0
844592 206208 prior 36 6 15 10.0
1541132 206208 prior 37 2 12 3.0
2808240 206208 prior 38 0 15 19.0
3027766 206208 prior 39 2 14 9.0
3356245 206208 prior 40 5 9 10.0
442304 206208 prior 41 2 14 11.0
2675140 206208 prior 42 1 19 6.0
167903 206208 prior 43 4 14 3.0
2393201 206208 prior 44 6 16 2.0
3292671 206208 prior 45 2 11 3.0
3059777 206208 prior 46 1 10 13.0
2239861 206208 prior 47 3 4 9.0
1285346 206208 prior 48 1 11 5.0
1882108 206208 prior 49 1 22 7.0
803273 206208 test 50 5 11 4.0
3154581 206209 prior 1 3 11 NaN
1889163 206209 prior 2 3 17 7.0
1542354 206209 prior 3 5 11 30.0
688306 206209 prior 4 1 10 30.0
2307371 206209 prior 5 4 15 3.0
3186442 206209 prior 6 0 16 3.0
550836 206209 prior 7 2 13 9.0
2129269 206209 prior 8 3 17 22.0
2558525 206209 prior 9 4 15 22.0
2266710 206209 prior 10 5 18 29.0
1854736 206209 prior 11 4 10 30.0
626363 206209 prior 12 1 12 18.0
2977660 206209 prior 13 1 12 7.0
272231 206209 train 14 6 14 30.0

3421083 rows × 6 columns


In [ ]: