In [2]:
from collections import Counter
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [3]:
# Load every Instacart CSV into its own DataFrame (one frame per file).
(aisles_df, departments_df, products_df, orders_df,
 order_products_train_df, order_products_prior_df) = (
    pd.read_csv(filename)
    for filename in ('aisles.csv',
                     'departments.csv',
                     'products.csv',
                     'orders.csv',
                     'order_products__train.csv',
                     'order_products__prior.csv'))
In [4]:
def get_order_df(df, order_id):
    """Return the rows of `df` that belong to a single order.

    df: frame with an 'order_id' column; order_id: id to select.
    """
    mask = df['order_id'] == order_id
    return df.loc[mask]
def get_order_user_id(orders, order_id):
    """Return the user_id of the first order row matching `order_id`.

    orders: frame with 'order_id' and 'user_id' columns.
    Raises IndexError if the order_id is absent.
    """
    matching = orders.loc[orders['order_id'] == order_id]
    return matching['user_id'].iloc[0]
In [5]:
# Non-null counts per column of the aisles table (quick size / missing-value check).
aisles_df.count()
In [1]:
# Peek at the products table (product_id plus aisle/department foreign keys).
products_df.head()
In [14]:
# Peek at the departments lookup table.
departments_df.head()
Out[14]:
In [4]:
# Peek at the orders table (one row per order, with user_id and timing columns).
orders_df.head()
Out[4]:
In [71]:
# Peek at the prior order/product lines (one row per product in a prior order).
order_products_prior_df.head()
Out[71]:
In [7]:
# Peek at the train order/product lines.
order_products_train_df.head()
Out[7]:
In [68]:
In [69]:
# Get number of products by order (prior set)
order_nb_of_products = order_products_prior_df.groupby('order_id')['product_id'].count()
order_nb_of_products_df = order_nb_of_products.to_frame(name='nb_of_products')
# Attach user_id to each order by matching the order_id INDEX of the counts
# against orders_df's 'order_id' COLUMN. The previous
# `order_nb_of_products_df.join(orders_df)` aligned the order_id index with
# orders_df's positional RangeIndex, pairing counts with the wrong orders.
order_products_by_user = order_nb_of_products_df.merge(
    orders_df[['order_id', 'user_id']],
    left_index=True, right_on='order_id')
# Per-user basket-size statistics, computed in a single groupby pass instead
# of re-joining and re-grouping once per statistic.
result = (order_products_by_user
          .groupby('user_id')['nb_of_products']
          .agg(['mean', 'sum', 'std']))
result.head()
Out[69]:
In [78]:
# Get number of products by order (prior set).
# The original cell assigned this groupby to a variable named `time`
# (shadowing the stdlib module name) and then kept using the stale
# `order_nb_of_products` from a previous cell.
order_nb_of_products = order_products_prior_df.groupby('order_id')['product_id'].count()
order_nb_of_products_df = order_nb_of_products.to_frame(name='nb_of_products')
# Get the user_id of each order; merge on the 'order_id' column (a plain
# .join(orders_df) would align the order_id index against orders_df's
# positional RangeIndex and attach the wrong users).
order_products_by_user = order_nb_of_products_df.merge(
    orders_df[['order_id', 'user_id']],
    left_index=True, right_on='order_id')
# Per-user features on the number of products per order, named directly via
# named aggregation (single groupby pass).
nb_products_by_order_by_user_df = (
    order_products_by_user
    .groupby('user_id')['nb_of_products']
    .agg(mean_nb_products_by_order='mean',
         sum_nb_products_by_order='sum',
         std_nb_products_by_order='std'))
nb_products_by_order_by_user_df.head()
Out[78]:
In [79]:
# Per-user statistics of days between consecutive orders, computed in one
# groupby pass instead of three (mean/sum/std each skip the NaN that marks a
# user's first order).
days_between_orders_by_users_df = (
    orders_df
    .groupby('user_id')['days_since_prior_order']
    .agg(mean_days_between_order='mean',
         sum_days_between_order='sum',
         std_days_between_order='std'))
days_between_orders_by_users_df.head()
Out[79]:
In [81]:
# Combine the two per-user feature tables (both are indexed by user_id, so
# the join aligns on index).
days_between_orders_by_users_df.join(nb_products_by_order_by_user_df).head()
Out[81]:
In [85]:
# Show a sample rather than dumping the entire (multi-million-row) orders
# frame into the notebook output.
orders_df.head()
Out[85]:
In [49]:
Out[49]:
In [ ]:
In [31]:
# Orders with their product counts: the Series is joined on orders_df's
# 'order_id' column; how='right' keeps only orders present in the counts
# (i.e. prior orders).
orders_df.join(order_nb_of_products, on='order_id', how='right')
Out[31]:
In [8]:
# Encode eval_set as a categorical and plot how many orders fall into each
# split (prior / train / test).
# NOTE(review): this mutates orders_df in place (adds 'eval_set_code'),
# which later cells may see.
orders_df.eval_set = pd.Categorical(orders_df.eval_set)
orders_df['eval_set_code'] = orders_df.eval_set.cat.codes
orders_df.hist(column='eval_set_code')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Eval set type', fontsize=12)
plt.title('Count of rows in each dataset', fontsize=15)
plt.xticks(rotation='vertical')
plt.show()
In [9]:
# Distribution of orders over the day of the week.
ax = orders_df['order_dow'].hist(bins=14)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Day of week', fontsize=12)
ax.set_title("Frequency of order by week day", fontsize=15)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()
In [10]:
# Distribution of orders over the hour of the day.
ax = orders_df['order_hour_of_day'].hist(bins=48)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Hour of day', fontsize=12)
ax.set_title("Frequency of order by hour of day", fontsize=15)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()
In [11]:
# Distribution of the gap (in days) since the user's previous order.
ax = orders_df['days_since_prior_order'].hist(bins=60)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Days since prior order', fontsize=12)
ax.set_title("Frequency distribution by days since prior order", fontsize=15)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()
In [87]:
# combine aisles, departments and products (left-joined onto products);
# merge keys are made explicit — a bare pd.merge silently joins on whatever
# column names the frames happen to share.
product_details_df = (products_df
                      .merge(departments_df, how='left', on='department_id')
                      .merge(aisles_df, how='left', on='aisle_id'))
# .info() prints directly and returns None; wrapping it in print() appended
# a spurious "None" line to the output.
product_details_df.info()
product_details_df.head()
Out[87]:
In [88]:
# merge train and prior together iteratively, to fit into 8GB kernel RAM
# split prior df row positions into chunk boundaries (used by the later cell)
indexes = np.linspace(0, len(order_products_prior_df), num=10, dtype=np.int32)
## Beginning with train
# Add order metadata to every train line — i.e. one row per product inside
# an order. Integers are downcast to shrink memory.
# NOTE(review): errors='ignore' in pd.to_numeric is deprecated in recent
# pandas; here it leaves non-numeric columns (e.g. eval_set) untouched.
order_details = pd.merge(
    left=order_products_train_df,
    right=orders_df,
    how='left',
    on='order_id'
).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
# add information about products (aisle and department ids)
order_details = pd.merge(
    left=order_details,
    right=product_details_df[['product_id',
                              'aisle_id',
                              'department_id']].apply(partial(pd.to_numeric,
                                                              errors='ignore',
                                                              downcast='integer')),
    how='left',
    on='product_id'
)
print(order_details.shape, order_products_train_df.shape)
order_details.head()
Out[88]:
In [100]:
# Number of distinct departments touched by each order.
nb_depts_by_order_df = order_details.groupby('order_id')['department_id'].nunique().to_frame(name='nb_depts')
# Attach the per-order department count as a feature column. The original
# pd.concat([...]) used the default axis=0 and stacked the two frames
# vertically instead of adding a column.
order_details.merge(nb_depts_by_order_df, left_on='order_id', right_index=True, how='left')
Out[100]:
In [27]:
# At this point order_details should contain only 'train' rows.
order_details.eval_set.value_counts()
Out[27]:
In [28]:
%%time
# ADDING PRIOR
# update by small portions
for i in range(len(indexes)-1):
order_details = pd.concat(
[
order_details,
pd.merge(left=pd.merge(
left=order_products_prior_df.loc[indexes[i]:indexes[i+1], :],
right=product_details_df[['product_id',
'aisle_id',
'department_id' ]].apply(partial(pd.to_numeric,
errors='ignore',
downcast='integer')),
how='left',
on='product_id'
),
right=orders_df,
how='left',
on='order_id'
)
]
)
print('Datafame length: {}'.format(order_details.shape[0]))
print('Memory consumption: {:.2f} Mb'.format(sum(order_details.memory_usage(index=True, deep=True) / 2**20)))
# delete (redundant now) dataframes
#del order_products_prior_df
In [29]:
# Orders belonging to the test split — the ones we must predict products for.
test_orders = orders_df.loc[orders_df['eval_set'] == 'test']
In [30]:
# Sanity check: order_details now holds both train and prior rows.
order_details.eval_set.value_counts()
Out[30]:
In [31]:
# Sanity check: test_orders holds only 'test' rows.
test_orders.eval_set.value_counts()
Out[31]:
In [37]:
%%time
# dumb submission: get the user and give to submission all products buy by the user
test_history = order_details[(order_details.user_id.isin(test_orders.user_id))]\
.groupby('user_id')['product_id'].apply(lambda x: ' '.join([str(e) for e in set(x)])).reset_index()
test_history.columns = ['user_id', 'products']
test_history.head()
In [40]:
# Peek at the test orders to be filled in.
test_orders.head()
Out[40]:
In [39]:
# Peek at the per-user product lists.
test_history.head()
Out[39]:
In [41]:
%%time
test_history = pd.merge(left=test_history,
right=test_orders,
how='right',
on='user_id')[['order_id', 'products']]
test_history.to_csv('dumb_submission.csv', encoding='utf-8', index=False)
In [42]:
# Peek at the submission rows just written.
test_history.head()
Out[42]:
In [44]:
%%time
test_history = order_details[(order_details.user_id.isin(test_orders.user_id))]
last_orders = test_history.groupby('user_id')['order_number'].max()
def get_last_orders(last_orders, test_history):
t = pd.merge(
lbneft=pd.merge(
left=last_orders.reset_index(),
right=test_history,
how='inner',
on=['user_id', 'order_number']
)[['user_id', 'product_id']],
right=test_orders[['user_id', 'order_id']],
how='left',
on='user_id'
).groupby('order_id')['product_id'].apply(lambda x: ' '.join([str(e) for e in set(x)])).reset_index()
t.columns = ['order_id', 'products']
return t
# save submission
get_last_orders(last_orders, test_history).to_csv('last_order.csv', encoding='utf-8', index=False)
In [ ]:
# NOTE(review): this cell appears pasted from the scikit-learn
# Lasso/ElasticNet example. X_train/y_train/X_test/y_test are never built
# in this notebook — TODO: construct them from the features above before
# running. The previously undefined names (`alpha`, `r2_score`, `lasso`,
# `coef`, `r2_score_lasso`) are fixed or removed below so the cell can run
# once the train/test arrays exist.
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score

alpha = 0.1  # regularization strength; was undefined before

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)
plt.plot(enet.coef_, color='lightgreen', linewidth=2,
         label='Elastic net coefficients')
plt.legend(loc='best')
plt.title("Elastic Net R^2: %f" % r2_score_enet)
plt.show()
In [ ]: