FOD KAGGLE

Setup


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
aisles_df = pd.read_csv('aisles.csv')
departments_df = pd.read_csv('departments.csv')
products_df = pd.read_csv('products.csv')
orders_df = pd.read_csv('orders.csv')
order_products_train_df = pd.read_csv('order_products__train.csv')
order_products_prior_df = pd.read_csv('order_products__prior.csv')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-5525ef49a972> in <module>()
----> 1 aisles_df = pd.read_csv('aisles.csv')
      2 departments_df = pd.read_csv('departments.csv')
      3 products_df = pd.read_csv('products.csv')
      4 orders_df = pd.read_csv('orders.csv')
      5 order_products_train_df = pd.read_csv('order_products__train.csv')

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'aisles.csv' does not exist

Data Viz

Utils


In [4]:
def get_order_df(df, order_id):
    return df[df['order_id'] == order_id]

def get_order_user_id(orders, order_id):
    return orders[orders['order_id'] == order_id].iloc[0]['user_id']

Visualizing dataframes


In [5]:
aisles_df.count()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-8b8422d9a519> in <module>()
----> 1 aisles_df.count()

NameError: name 'aisles_df' is not defined

In [1]:
products_df.head()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8165f702d0ba> in <module>()
----> 1 products_df.head()

NameError: name 'products_df' is not defined

In [14]:
departments_df.head()


Out[14]:
department_id department
0 1 frozen
1 2 other
2 3 bakery
3 4 produce
4 5 alcohol

In [4]:
orders_df.head()


Out[4]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 2539329 1 prior 1 2 8 NaN
1 2398795 1 prior 2 3 7 15.0
2 473747 1 prior 3 3 12 21.0
3 2254736 1 prior 4 4 7 29.0
4 431534 1 prior 5 4 15 28.0

In [71]:
order_products_prior_df.head()


Out[71]:
order_id product_id add_to_cart_order reordered
0 2 33120 1 1
1 2 28985 2 1
2 2 9327 3 0
3 2 45918 4 1
4 2 30035 5 0

In [7]:
order_products_train_df.head()


Out[7]:
order_id product_id add_to_cart_order reordered
0 1 49302 1 1
1 1 11109 2 1
2 1 10246 3 0
3 1 49683 4 0
4 1 43633 5 1

In [68]:


In [69]:
# Get number of produts by order
order_nb_of_products = order_products_prior_df.groupby('order_id')['product_id'].count()
order_nb_of_products_df = order_nb_of_products.to_frame()
order_nb_of_products_df = order_nb_of_products_df.rename(columns = {'product_id':'nb_of_products'})

# Compute features
mean_products_by_id = order_nb_of_products_df.join(orders_df).groupby('user_id')['nb_of_products'].mean()
sum_products_by_id = order_nb_of_products_df.join(orders_df).groupby('user_id')['nb_of_products'].sum()
std_products_by_id = order_nb_of_products_df.join(orders_df).groupby('user_id')['nb_of_products'].std()

# Rename features
mean_products_by_id_df = mean_products_by_id.to_frame(name="mean")
sum_products_by_id_df = sum_products_by_id.to_frame(name="sum")
std_products_by_id_df = std_products_by_id.to_frame(name="std")

# Features on numbers of products per order per user
result = pd.concat([mean_products_by_id_df, sum_products_by_id_df, std_products_by_id_df], axis=1)

result.head()


Out[69]:
mean sum std
user_id
1.0 10.222222 92 8.012144
2.0 10.071429 141 6.977594
3.0 9.500000 95 7.276293
4.0 5.166667 31 2.926887
5.0 7.600000 38 7.368853

In [78]:
# Get number of produts by order
time = order_products_prior_df.groupby('order_id')['product_id'].count()
order_nb_of_products_df = order_nb_of_products.to_frame()
order_nb_of_products_df = order_nb_of_products_df.rename(columns = {'product_id':'nb_of_products'})

# Get user_id of each order
order_products_by_user = order_nb_of_products_df.join(orders_df)

# Compute features
mean_products_by_id = order_products_by_user.groupby('user_id')['nb_of_products'].mean()
sum_products_by_id = order_products_by_user.groupby('user_id')['nb_of_products'].sum()
std_products_by_id = order_products_by_user.groupby('user_id')['nb_of_products'].std()

# Rename features
mean_products_by_id_df = mean_products_by_id.to_frame(name="mean_nb_products_by_order")
sum_products_by_id_df = sum_products_by_id.to_frame(name="sum_nb_products_by_order")
std_products_by_id_df = std_products_by_id.to_frame(name="std_nb_products_by_order")

# Features on numbers of products per order per user
nb_products_by_order_by_user_df = pd.concat([mean_products_by_id_df, sum_products_by_id_df, std_products_by_id_df], axis=1)
nb_products_by_order_by_user_df.head()


Out[78]:
mean_nb_products_by_order sum_nb_products_by_order std_nb_products_by_order
user_id
1.0 10.222222 92 8.012144
2.0 10.071429 141 6.977594
3.0 9.500000 95 7.276293
4.0 5.166667 31 2.926887
5.0 7.600000 38 7.368853

In [79]:
mean_days_between_orders_df = orders_df.groupby('user_id')['days_since_prior_order'].mean().to_frame(name='mean_days_between_order')
sum_days_between_orders_df = orders_df.groupby('user_id')['days_since_prior_order'].sum().to_frame(name='sum_days_between_order')
std_days_between_orders_df = orders_df.groupby('user_id')['days_since_prior_order'].std().to_frame(name='std_days_between_order')


days_between_orders_by_users_df = pd.concat([mean_days_between_orders_df, sum_days_between_orders_df, std_days_between_orders_df], axis=1)
days_between_orders_by_users_df.head()


Out[79]:
mean_days_between_order sum_days_between_order std_days_between_order
user_id
1 19.000000 190.0 9.030811
2 16.285714 228.0 10.268912
3 12.000000 144.0 5.134553
4 17.000000 85.0 10.977249
5 11.500000 46.0 5.446712

In [81]:
days_between_orders_by_users_df.join(nb_products_by_order_by_user_df).head()


Out[81]:
mean_days_between_order sum_days_between_order std_days_between_order mean_nb_products_by_order sum_nb_products_by_order std_nb_products_by_order
user_id
1 19.000000 190.0 9.030811 10.222222 92.0 8.012144
2 16.285714 228.0 10.268912 10.071429 141.0 6.977594
3 12.000000 144.0 5.134553 9.500000 95.0 7.276293
4 17.000000 85.0 10.977249 5.166667 31.0 2.926887
5 11.500000 46.0 5.446712 7.600000 38.0 7.368853

In [85]:
orders_df


Out[85]:
nb_of_orders
user_id
1 11
2 15
3 13
4 6
5 5
6 4
7 21
8 4
9 4
10 6
11 8
12 6
13 13
14 14
15 23
16 7
17 41
18 7
19 10
20 5
21 34
22 16
23 5
24 19
25 4
26 13
27 82
28 25
29 19
30 9
... ...
206180 19
206181 15
206182 11
206183 15
206184 5
206185 11
206186 4
206187 35
206188 8
206189 7
206190 9
206191 6
206192 15
206193 42
206194 12
206195 20
206196 5
206197 24
206198 8
206199 20
206200 24
206201 33
206202 23
206203 6
206204 5
206205 4
206206 68
206207 17
206208 50
206209 14

206209 rows × 1 columns


In [49]:



Out[49]:
user_id
1         19.000000
2         16.285714
3         12.000000
4         17.000000
5         11.500000
6         13.333333
7         10.450000
8         23.333333
9         22.000000
10        21.800000
11        18.714286
12        26.000000
13         7.666667
14        21.230769
15        10.636364
16        19.333333
17         8.000000
18         5.833333
19         9.333333
20        11.250000
21        10.454545
22        12.733333
23        18.500000
24        14.666667
25        30.000000
26        12.166667
27         4.432099
28        11.541667
29        11.611111
30        21.625000
            ...    
206180     7.777778
206181    17.928571
206182    20.300000
206183    14.000000
206184    25.000000
206185    25.400000
206186    14.666667
206187    10.235294
206188    27.428571
206189    14.833333
206190    22.875000
206191    21.600000
206192    12.142857
206193     5.780488
206194    11.727273
206195     8.000000
206196    26.500000
206197    13.086957
206198    18.285714
206199    16.947368
206200     8.826087
206201    10.718750
206202    12.500000
206203    29.400000
206204    14.750000
206205    16.666667
206206     3.716418
206207    14.312500
206208     7.367347
206209    18.461538
Name: days_since_prior_order, Length: 206209, dtype: float64

In [ ]:


In [31]:
orders_df.join(order_nb_of_products, on='order_id', how='right')


Out[31]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order product_id
0 2539329 1 prior 1 2 8 NaN 5
1 2398795 1 prior 2 3 7 15.0 6
2 473747 1 prior 3 3 12 21.0 5
3 2254736 1 prior 4 4 7 29.0 5
4 431534 1 prior 5 4 15 28.0 8
5 3367565 1 prior 6 2 7 19.0 4
6 550135 1 prior 7 1 9 20.0 5
7 3108588 1 prior 8 1 14 14.0 6
8 2295261 1 prior 9 1 16 0.0 6
9 2550362 1 prior 10 4 8 30.0 9
11 2168274 2 prior 1 2 11 NaN 13
12 1501582 2 prior 2 5 10 10.0 6
13 1901567 2 prior 3 1 10 3.0 5
14 738281 2 prior 4 2 10 8.0 13
15 1673511 2 prior 5 3 11 8.0 13
16 1199898 2 prior 6 2 9 13.0 21
17 3194192 2 prior 7 2 12 14.0 14
18 788338 2 prior 8 1 15 27.0 16
19 1718559 2 prior 9 2 9 8.0 26
20 1447487 2 prior 10 1 11 6.0 9
21 1402090 2 prior 11 1 10 30.0 15
22 3186735 2 prior 12 1 9 28.0 19
23 3268552 2 prior 13 4 11 30.0 9
24 839880 2 prior 14 3 10 13.0 16
26 1374495 3 prior 1 1 14 NaN 10
27 444309 3 prior 2 3 19 9.0 9
28 3002854 3 prior 3 3 16 21.0 6
29 2037211 3 prior 4 2 18 20.0 5
30 2710558 3 prior 5 0 17 12.0 11
31 1972919 3 prior 6 0 16 7.0 8
... ... ... ... ... ... ... ... ...
3421051 2254244 206208 prior 33 6 13 7.0 10
3421052 2017995 206208 prior 34 6 15 7.0 10
3421053 2789700 206208 prior 35 3 22 4.0 16
3421054 844592 206208 prior 36 6 15 10.0 21
3421055 1541132 206208 prior 37 2 12 3.0 10
3421056 2808240 206208 prior 38 0 15 19.0 8
3421057 3027766 206208 prior 39 2 14 9.0 11
3421058 3356245 206208 prior 40 5 9 10.0 28
3421059 442304 206208 prior 41 2 14 11.0 18
3421060 2675140 206208 prior 42 1 19 6.0 9
3421061 167903 206208 prior 43 4 14 3.0 14
3421062 2393201 206208 prior 44 6 16 2.0 10
3421063 3292671 206208 prior 45 2 11 3.0 18
3421064 3059777 206208 prior 46 1 10 13.0 7
3421065 2239861 206208 prior 47 3 4 9.0 23
3421066 1285346 206208 prior 48 1 11 5.0 8
3421067 1882108 206208 prior 49 1 22 7.0 17
3421069 3154581 206209 prior 1 3 11 NaN 13
3421070 1889163 206209 prior 2 3 17 7.0 9
3421071 1542354 206209 prior 3 5 11 30.0 15
3421072 688306 206209 prior 4 1 10 30.0 16
3421073 2307371 206209 prior 5 4 15 3.0 3
3421074 3186442 206209 prior 6 0 16 3.0 2
3421075 550836 206209 prior 7 2 13 9.0 10
3421076 2129269 206209 prior 8 3 17 22.0 12
3421077 2558525 206209 prior 9 4 15 22.0 3
3421078 2266710 206209 prior 10 5 18 29.0 9
3421079 1854736 206209 prior 11 4 10 30.0 8
3421080 626363 206209 prior 12 1 12 18.0 20
3421081 2977660 206209 prior 13 1 12 7.0 9

3214874 rows × 8 columns

Features of dataframes


In [8]:
orders_df.eval_set = pd.Categorical(orders_df.eval_set)
orders_df['eval_set_code'] = orders_df.eval_set.cat.codes
orders_df.hist(column='eval_set_code')

plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Eval set type', fontsize=12)
plt.title('Count of rows in each dataset', fontsize=15)
plt.xticks(rotation='vertical')
plt.show()



In [9]:
orders_df['order_dow'].hist(bins=14)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Day of week', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency of order by week day", fontsize=15)
plt.show()



In [10]:
orders_df['order_hour_of_day'].hist(bins=48)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Hour of day', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency of order by hour of day", fontsize=15)
plt.show()



In [11]:
orders_df['days_since_prior_order'].hist(bins=60)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Days since prior order', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency distribution by days since prior order", fontsize=15)
plt.show()


Cleaning and formating data


In [87]:
# combine aisles, departments and products (left joined to products)
product_details_df = pd.merge(left=pd.merge(left=products_df, right=departments_df, how='left'), right=aisles_df, how='left')
print(product_details_df.info())

product_details_df.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 49688 entries, 0 to 49687
Data columns (total 6 columns):
product_id       49688 non-null int64
product_name     49688 non-null object
aisle_id         49688 non-null int64
department_id    49688 non-null int64
department       49688 non-null object
aisle            49688 non-null object
dtypes: int64(3), object(3)
memory usage: 2.7+ MB
None
Out[87]:
product_id product_name aisle_id department_id department aisle
0 1 Chocolate Sandwich Cookies 61 19 snacks cookies cakes
1 2 All-Seasons Salt 104 13 pantry spices seasonings
2 3 Robust Golden Unsweetened Oolong Tea 94 7 beverages tea
3 4 Smart Ones Classic Favorites Mini Rigatoni Wit... 38 1 frozen frozen meals
4 5 Green Chile Anytime Sauce 5 13 pantry marinades meat preparation

In [88]:
from functools import partial

# merge train and prior together iteratively, to fit into 8GB kernel RAM
# split df indexes into parts
indexes = np.linspace(0, len(order_products_prior_df), num=10, dtype=np.int32)


## Begining with train

# Add products_id to orders ie now there is one line per product inside an order
order_details = pd.merge(
                left=order_products_train_df,
                 right=orders_df, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

# add information about products (aisles and departments)
order_details = pd.merge(
                left=order_details,
                right=product_details_df[['product_id', 
                             'aisle_id', 
                             'department_id']].apply(partial(pd.to_numeric, 
                                                             errors='ignore', 
                                                             downcast='integer')),
                how='left',
                on='product_id'
)

print(order_details.shape, order_products_train_df.shape)

# delete (redundant now) dataframes
#del order_products_train_df

order_details.head()


(1384617, 12) (1384617, 4)
Out[88]:
order_id product_id add_to_cart_order reordered user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order aisle_id department_id
0 1 49302 1 1 112108 train 4 4 10 9 120 16
1 1 11109 2 1 112108 train 4 4 10 9 108 16
2 1 10246 3 0 112108 train 4 4 10 9 83 4
3 1 49683 4 0 112108 train 4 4 10 9 83 4
4 1 43633 5 1 112108 train 4 4 10 9 95 15

In [100]:
nb_depts_by_order_df = order_details.groupby('order_id')['department_id'].nunique().to_frame(name='nb_depts')
pd.concat([order_details, nb_depts_by_order_df])


Out[100]:
add_to_cart_order aisle_id days_since_prior_order department_id eval_set nb_depts order_dow order_hour_of_day order_id order_number product_id reordered user_id
0 1.0 120.0 9.0 16.0 train NaN 4.0 10.0 1.0 4.0 49302.0 1.0 112108.0
1 2.0 108.0 9.0 16.0 train NaN 4.0 10.0 1.0 4.0 11109.0 1.0 112108.0
2 3.0 83.0 9.0 4.0 train NaN 4.0 10.0 1.0 4.0 10246.0 0.0 112108.0
3 4.0 83.0 9.0 4.0 train NaN 4.0 10.0 1.0 4.0 49683.0 0.0 112108.0
4 5.0 95.0 9.0 15.0 train NaN 4.0 10.0 1.0 4.0 43633.0 1.0 112108.0
5 6.0 24.0 9.0 4.0 train NaN 4.0 10.0 1.0 4.0 13176.0 0.0 112108.0
6 7.0 24.0 9.0 4.0 train NaN 4.0 10.0 1.0 4.0 47209.0 0.0 112108.0
7 8.0 21.0 9.0 16.0 train NaN 4.0 10.0 1.0 4.0 22035.0 1.0 112108.0
8 1.0 2.0 30.0 16.0 train NaN 6.0 18.0 36.0 23.0 39612.0 0.0 79431.0
9 2.0 115.0 30.0 7.0 train NaN 6.0 18.0 36.0 23.0 19660.0 1.0 79431.0
10 3.0 53.0 30.0 16.0 train NaN 6.0 18.0 36.0 23.0 49235.0 0.0 79431.0
11 4.0 123.0 30.0 4.0 train NaN 6.0 18.0 36.0 23.0 43086.0 1.0 79431.0
12 5.0 86.0 30.0 16.0 train NaN 6.0 18.0 36.0 23.0 46620.0 1.0 79431.0
13 6.0 96.0 30.0 20.0 train NaN 6.0 18.0 36.0 23.0 34497.0 1.0 79431.0
14 7.0 83.0 30.0 4.0 train NaN 6.0 18.0 36.0 23.0 48679.0 1.0 79431.0
15 8.0 83.0 30.0 4.0 train NaN 6.0 18.0 36.0 23.0 46979.0 1.0 79431.0
16 1.0 117.0 24.0 19.0 train NaN 6.0 16.0 38.0 6.0 11913.0 0.0 42756.0
17 2.0 123.0 24.0 4.0 train NaN 6.0 16.0 38.0 6.0 18159.0 0.0 42756.0
18 3.0 19.0 24.0 13.0 train NaN 6.0 16.0 38.0 6.0 4461.0 0.0 42756.0
19 4.0 123.0 24.0 4.0 train NaN 6.0 16.0 38.0 6.0 21616.0 1.0 42756.0
20 5.0 83.0 24.0 4.0 train NaN 6.0 16.0 38.0 6.0 23622.0 0.0 42756.0
21 6.0 116.0 24.0 1.0 train NaN 6.0 16.0 38.0 6.0 32433.0 0.0 42756.0
22 7.0 16.0 24.0 4.0 train NaN 6.0 16.0 38.0 6.0 28842.0 0.0 42756.0
23 8.0 16.0 24.0 4.0 train NaN 6.0 16.0 38.0 6.0 42625.0 0.0 42756.0
24 9.0 16.0 24.0 4.0 train NaN 6.0 16.0 38.0 6.0 39693.0 0.0 42756.0
25 1.0 96.0 30.0 20.0 train NaN 6.0 20.0 96.0 7.0 20574.0 1.0 17227.0
26 2.0 83.0 30.0 4.0 train NaN 6.0 20.0 96.0 7.0 30391.0 0.0 17227.0
27 3.0 123.0 30.0 4.0 train NaN 6.0 20.0 96.0 7.0 40706.0 1.0 17227.0
28 4.0 116.0 30.0 1.0 train NaN 6.0 20.0 96.0 7.0 25610.0 0.0 17227.0
29 5.0 123.0 30.0 4.0 train NaN 6.0 20.0 96.0 7.0 27966.0 1.0 17227.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
3420498 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3420505 NaN NaN NaN NaN NaN 7.0 NaN NaN NaN NaN NaN NaN NaN
3420578 NaN NaN NaN NaN NaN 7.0 NaN NaN NaN NaN NaN NaN NaN
3420586 NaN NaN NaN NaN NaN 6.0 NaN NaN NaN NaN NaN NaN NaN
3420603 NaN NaN NaN NaN NaN 5.0 NaN NaN NaN NaN NaN NaN NaN
3420617 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN
3420693 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN
3420729 NaN NaN NaN NaN NaN 3.0 NaN NaN NaN NaN NaN NaN NaN
3420735 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3420745 NaN NaN NaN NaN NaN 2.0 NaN NaN NaN NaN NaN NaN NaN
3420769 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3420788 NaN NaN NaN NaN NaN 7.0 NaN NaN NaN NaN NaN NaN NaN
3420798 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN
3420802 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN
3420812 NaN NaN NaN NaN NaN 6.0 NaN NaN NaN NaN NaN NaN NaN
3420851 NaN NaN NaN NaN NaN 6.0 NaN NaN NaN NaN NaN NaN NaN
3420857 NaN NaN NaN NaN NaN 7.0 NaN NaN NaN NaN NaN NaN NaN
3420881 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3420894 NaN NaN NaN NaN NaN 8.0 NaN NaN NaN NaN NaN NaN NaN
3420895 NaN NaN NaN NaN NaN 8.0 NaN NaN NaN NaN NaN NaN NaN
3420909 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3420979 NaN NaN NaN NaN NaN 2.0 NaN NaN NaN NaN NaN NaN NaN
3420996 NaN NaN NaN NaN NaN 6.0 NaN NaN NaN NaN NaN NaN NaN
3420998 NaN NaN NaN NaN NaN 7.0 NaN NaN NaN NaN NaN NaN NaN
3421026 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3421049 NaN NaN NaN NaN NaN 5.0 NaN NaN NaN NaN NaN NaN NaN
3421056 NaN NaN NaN NaN NaN 5.0 NaN NaN NaN NaN NaN NaN NaN
3421058 NaN NaN NaN NaN NaN 6.0 NaN NaN NaN NaN NaN NaN NaN
3421063 NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN NaN
3421070 NaN NaN NaN NaN NaN 3.0 NaN NaN NaN NaN NaN NaN NaN

1515826 rows × 13 columns


In [27]:
order_details.eval_set.value_counts()


Out[27]:
train    1384617
Name: eval_set, dtype: int64

In [28]:
%%time
# ADDING PRIOR
# update by small portions
for i in range(len(indexes)-1):
    order_details = pd.concat(
        [   
            order_details,
            pd.merge(left=pd.merge(
                            left=order_products_prior_df.loc[indexes[i]:indexes[i+1], :],
                            right=product_details_df[['product_id', 
                                         'aisle_id', 
                                         'department_id' ]].apply(partial(pd.to_numeric, 
                                                                          errors='ignore', 
                                                                          downcast='integer')),
                            how='left',
                            on='product_id'
                            ),
                     right=orders_df, 
                     how='left', 
                     on='order_id'
                )
        ]
    )
        
print('Datafame length: {}'.format(order_details.shape[0]))
print('Memory consumption: {:.2f} Mb'.format(sum(order_details.memory_usage(index=True, deep=True) / 2**20)))


# delete (redundant now) dataframes
#del order_products_prior_df


Datafame length: 33819114
Memory consumption: 4676.60 Mb
CPU times: user 33 s, sys: 29.8 s, total: 1min 2s
Wall time: 1min 5s

In [29]:
# Get Test data
test_orders = orders_df[orders_df.eval_set == 'test']

#del orders_df

In [30]:
order_details.eval_set.value_counts()


Out[30]:
prior    32434497
train     1384617
Name: eval_set, dtype: int64

In [31]:
test_orders.eval_set.value_counts()


Out[31]:
test    75000
Name: eval_set, dtype: int64

Submissions

Dumb submission

Try as a first submission to gather all the products purchased by the user and buy all them in the next order


In [37]:
%%time
# dumb submission: get the user and give to submission all products buy by the user
test_history = order_details[(order_details.user_id.isin(test_orders.user_id))]\
.groupby('user_id')['product_id'].apply(lambda x: ' '.join([str(e) for e in set(x)])).reset_index()

test_history.columns = ['user_id', 'products']

test_history.head()


CPU times: user 20.6 s, sys: 1.47 s, total: 22 s
Wall time: 22.2 s

In [40]:
test_orders.head()


Out[40]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
38 2774568 3 test 13 5 15 11.0
44 329954 4 test 6 3 12 30.0
53 1528013 6 test 4 3 16 22.0
96 1376945 11 test 8 6 11 8.0
102 1356845 12 test 6 1 20 30.0

In [39]:
test_history.head()


Out[39]:
user_id products
0 3 17668 44683 48523 21903 14992 21137 32402 2203...
1 4 21573 42329 17769 35469 37646 1200 19057 26576...
2 6 40992 27521 20323 48679 8424 45007 21903 10644...
3 11 17794 8197 30855 33037 30480 10644 26209 35738...
4 12 11520 45056 17794 44422 17159 44683 37646 1499...

In [41]:
%%time


test_history = pd.merge(left=test_history, 
                        right=test_orders, 
                        how='right', 
                        on='user_id')[['order_id', 'products']]
test_history.to_csv('dumb_submission.csv', encoding='utf-8', index=False)


CPU times: user 758 ms, sys: 134 ms, total: 892 ms
Wall time: 911 ms

In [42]:
test_history.head()


Out[42]:
order_id products
0 2774568 17668 44683 48523 21903 14992 21137 32402 2203...
1 329954 21573 42329 17769 35469 37646 1200 19057 26576...
2 1528013 40992 27521 20323 48679 8424 45007 21903 10644...
3 1376945 17794 8197 30855 33037 30480 10644 26209 35738...
4 1356845 11520 45056 17794 44422 17159 44683 37646 1499...

Results

0.2164845

Last order

Use the last order of the user as a submission


In [44]:
%%time
test_history = order_details[(order_details.user_id.isin(test_orders.user_id))]
last_orders = test_history.groupby('user_id')['order_number'].max()

def get_last_orders(last_orders, test_history):
    t = pd.merge(
            lbneft=pd.merge(
                    left=last_orders.reset_index(),
                    right=test_history,
                    how='inner',
                    on=['user_id', 'order_number']
                )[['user_id', 'product_id']],
            right=test_orders[['user_id', 'order_id']],
            how='left',
            on='user_id'
        ).groupby('order_id')['product_id'].apply(lambda x: ' '.join([str(e) for e in set(x)])).reset_index()
    t.columns = ['order_id', 'products']
    return t

# save submission
get_last_orders(last_orders, test_history).to_csv('last_order.csv', encoding='utf-8', index=False)


CPU times: user 14 s, sys: 2.71 s, total: 16.7 s
Wall time: 17.8 s

Result

0.3118026

Tests in progress


In [ ]:
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

plt.plot(enet.coef_, color='lightgreen', linewidth=2,
         label='Elastic net coefficients')
plt.plot(lasso.coef_, color='gold', linewidth=2,
         label='Lasso coefficients')
plt.plot(coef, '--', color='navy', label='original coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
          % (r2_score_lasso, r2_score_enet))
plt.show()

In [ ]: