In [1]:
# Previous version
# Moving to Keras, treating the 0/1 reorder labels as a sigmoid regression target

# 3 epochs, batch size = 59322, 300 sec/epoch
#loss    : 0.2861 - mse    : 0.0825 acc    : 0.9022 - 
#val_loss: 0.3541 - val_mse: 0.0895 val_acc: 0.902
#LB =  0.1723952

# with batch norm and multiple layers
# 400 - BN - 20 - BN - 10 - 1 (sigmoid)
# optimizer: rmsprop
# kernel init = "normal"


## This version
# removed BN; first hidden layer shrunk 400 -> 40
# 40 - 20 - 10 - 1 (sigmoid)
# batch size = 20k
# 5 epochs
# LB = 0.2922377 with 0.20 threshold
# LB = 0.3097318 with 0.15 threshold *** (best)
# LB = 0.3065503 with 0.12 threshold

In [1]:
#importing packages

import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

### Keras, sklearn and plotting imports
%matplotlib inline
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D  # only Convolution1D is referenced (in commented-out code below)
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


import matplotlib as mpl

#mpl.use('Agg')
import matplotlib.pyplot as plt
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
print(timestr)


Using Theano backend.
20170722-173418

In [2]:
#read df_train.csv, df_test.csv, train_labels.csv
df_train = pd.read_csv("df_train.csv")

In [3]:
df_test = pd.read_csv("df_test.csv")

In [4]:
labels = pd.read_csv("train_labels.csv", header=0)

In [5]:
print(df_train.shape)
print(labels.shape)
print(df_test.shape)


(8474661, 22)
(8474661, 1)
(4833292, 22)

In [6]:
df_train['labels'] = labels
df_train.head()


Out[6]:
Unnamed: 0 order_id product_id user_total_orders user_total_items total_distinct_items user_average_days_between_orders user_average_basket order_hour_of_day days_since_prior_order ... product_orders product_reorders product_reorder_rate UP_orders UP_orders_ratio UP_average_pos_in_cart UP_reorder_rate UP_orders_since_last UP_delta_hour_vs_last labels
0 0 1187899 17122 11 59 18 19.0 5.363636 8 14.0 ... 13880.0 9377.0 0.675576 1 0.090909 6.0 0.090909 6 7 0
1 1 1187899 196 11 59 18 19.0 5.363636 8 14.0 ... 35791.0 27791.0 0.776480 10 0.909091 1.4 0.909091 1 0 1
2 2 1187899 26405 11 59 18 19.0 5.363636 8 14.0 ... 1214.0 536.0 0.441516 2 0.181818 5.0 0.181818 7 1 1
3 3 1187899 46149 11 59 18 19.0 5.363636 8 14.0 ... 8558.0 6953.0 0.812456 3 0.272727 3.0 0.272727 1 0 1
4 4 1187899 14084 11 59 18 19.0 5.363636 8 14.0 ... 15935.0 12923.0 0.810982 1 0.090909 2.0 0.090909 10 0 0

5 rows × 23 columns


In [7]:
df_train.dropna(inplace=True, how='any')  # drops the 47 rows with any missing value
print(df_train.shape)


(8474614, 23)

In [8]:
#selected features

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # excluded: 'dow', 'UP_same_dow_as_last_order'

In [9]:
# Convert to numpy values
X_train = df_train[f_to_use].values.astype('float32')
X_test = df_test[f_to_use].values.astype('float32')
y_train = df_train['labels'].values.astype('int32')
print("Types")
print(X_train.dtype)
print(X_test.dtype)
print(y_train.dtype)

print("\nShapes")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

print("\nHeads")
print(y_train)
print(X_train[0:5,0:5])
print(X_test[0:5,0:5])
#df_columns = df_values.columns
#df_columns


Types
float32
float32
int32

Shapes
(8474614, 19)
(8474614,)
(4833292, 19)

Heads
[0 1 1 ..., 0 0 0]
[[ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]]
[[ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]]
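
Note: StandardScaler and Pipeline are imported above but never used, so these raw,
unscaled values (order counts, hours, day gaps) go straight into the network. If the
standardization the imports hint at were wanted, a minimal sketch (X_train_s and
X_test_s are illustrative names; this run trains on the unscaled arrays):

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)  # learn mean/std from training rows only
X_test_s = scaler.transform(X_test)        # apply the same statistics to test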

In [10]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# encode class values as integers (a no-op here, since the labels are already 0/1)
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# convert integers to one-hot dummy variables (note: dummy_y is never used below;
# the model trains on the raw 0/1 labels with a single sigmoid output)
dummy_y = np_utils.to_categorical(encoded_Y, num_classes=2).astype('float32')
print(dummy_y)
print(dummy_y.shape)
print(dummy_y.dtype)


[[ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]
(8474614, 2)
float32
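
dummy_y above is the one-hot form for a two-class softmax setup, but it is never used:
the model below keeps a single sigmoid unit trained on the raw 0/1 labels. For
reference, the softmax variant dummy_y was presumably built for would look roughly
like this (a sketch, not what was run; model2 is a hypothetical name):

model2 = Sequential()
model2.add(Dense(40, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'))
model2.add(Dense(2, kernel_initializer='normal', activation='softmax'))  # two-class output
model2.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# model2.fit(X_train, dummy_y, epochs=5, batch_size=20000, validation_split=0.3)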

In [11]:
import keras.backend as K

# Batch-wise metric helpers in the style of the Keras 1.x metrics. They are
# computed per batch and then averaged, so they only approximate epoch-level
# scores, and none of them are actually passed to compile() below.
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def fbeta_score(y_true, y_pred, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    # (On symbolic tensors this Python-level equality check is effectively a
    # no-op; the epsilon terms are what actually guard the divisions.)
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score

def fmeasure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=1)

def f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    # (Python-level check; effectively a no-op on symbolic tensors - the
    # epsilon terms below do the real guarding against division by zero.)
    if c3 == 0:
        return 0

    # How many selected items are relevant?
    precision = c1 / (c2 + K.epsilon())

    # How many relevant items are selected?
    recall = c1 / (c3 + K.epsilon())

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_score

def baseline_model():
    model = Sequential()
    #model.add(Convolution1D(20, 5, border_mode='valid', input_shape=(244, 1)))
    
    model.add(Dense(40, kernel_initializer='normal', input_dim=df_train[f_to_use].shape[1], activation='relu'))
    #model.add(Convolution1D(20, 5, strides=1, padding='valid', dilation_rate=1, activation='relu')) 
    #model.add(Flatten())
    #model.add(BatchNormalization())
    # model.add(Dropout(0.5))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    #model.add(Dropout(0.5))
    #model.add(Dense(30,activation="relu"))
    #model.add(Dropout(0.5))
    #model.add(BatchNormalization())
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    #model.add(Dropout(0.25))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    #sgd=SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['mse'])
        #Adadelta
        #sgd
    return model
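
Since Keras accepts plain (y_true, y_pred) functions as metrics, the helpers above
could be monitored during training by recompiling with them attached. A sketch of
that variant (not what was run here, which tracks only 'mse'; m is a hypothetical
name):

m = baseline_model()  # reuse the architecture above
# attach the batch-wise helpers; they approximate, not equal, epoch-level scores
m.compile(loss='binary_crossentropy', optimizer='rmsprop',
          metrics=['mse', precision, recall, f1_score])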

In [12]:
model=baseline_model()
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 40)                800       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                820       
_________________________________________________________________
dense_3 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
=================================================================
Total params: 1,841
Trainable params: 1,841
Non-trainable params: 0
_________________________________________________________________

In [15]:
print ("^^^INFO: Fit Model^^^")
#X_train = X_train.reshape(X_train.shape[0],244,1)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=1)
]

history = model.fit(X_train, y_train, epochs=5, batch_size=20000, validation_split=0.3, verbose=2, callbacks=callbacks)


^^^INFO: Fit Model^^^
Train on 5932229 samples, validate on 2542385 samples
Epoch 1/5
30s - loss: 0.2933 - mean_squared_error: 0.0834 - val_loss: 0.2842 - val_mean_squared_error: 0.0814
Epoch 2/5
29s - loss: 0.2881 - mean_squared_error: 0.0822 - val_loss: 0.2806 - val_mean_squared_error: 0.0811
Epoch 3/5
28s - loss: 0.2857 - mean_squared_error: 0.0818 - val_loss: 0.2862 - val_mean_squared_error: 0.0830
Epoch 4/5
28s - loss: 0.2840 - mean_squared_error: 0.0814 - val_loss: 0.2785 - val_mean_squared_error: 0.0796
Epoch 5/5
28s - loss: 0.2816 - mean_squared_error: 0.0808 - val_loss: 0.2743 - val_mean_squared_error: 0.0792
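
history is captured above but never inspected; with matplotlib already set up inline,
a quick look at the learning curves could be (a sketch):

plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch'); plt.legend()
plt.show()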

In [16]:
# predicted reorder probabilities for each (order_id, product_id) test row
y_test = model.predict(X_test, batch_size=7000, verbose=1)
print(y_test)
print(y_test.dtype)


4802000/4833292 [============================>.] - ETA: 0s [[ 0.28245082]
 [ 0.10372701]
 [ 0.13353223]
 ..., 
 [ 0.04673484]
 [ 0.01697712]
 [ 0.04662048]]
float32

In [17]:
print(y_test[500:750,0])
y_test.shape


[ 0.01815125  0.14809722  0.07266475  0.06048336  0.15022524  0.03658184
  0.14999463  0.12282379  0.01300962  0.14885272  0.11696173  0.1162293
  0.08024094  0.02492772  0.01902132  0.05341727  0.10813192  0.16867933
  0.01072569  0.11180816  0.09083668  0.003302    0.06391382  0.23262353
  0.09874324  0.30451614  0.03653005  0.00920638  0.02159989  0.05489255
  0.08285429  0.18277843  0.01190487  0.08648319  0.01560901  0.1588596
  0.06526915  0.04969047  0.01596156  0.01823694  0.0461975   0.28664124
  0.020274    0.01095532  0.02043608  0.06157111  0.04053055  0.04823358
  0.08828104  0.09494013  0.01669116  0.14025719  0.01752861  0.11463436
  0.03267328  0.04292507  0.04317291  0.03265386  0.07493969  0.02695579
  0.12785     0.02591073  0.23044063  0.0026286   0.21241364  0.03655732
  0.12784059  0.07622741  0.08019531  0.00263713  0.12214535  0.02768833
  0.05275078  0.00885954  0.01911984  0.05742706  0.01353756  0.03627934
  0.01481265  0.01699708  0.04040465  0.01740555  0.03681495  0.03390338
  0.04443166  0.01902348  0.04830939  0.0164627   0.04604276  0.04271731
  0.03183826  0.07049286  0.17593496  0.08885129  0.05699983  0.01100427
  0.01513718  0.05863804  0.01496919  0.04232144  0.01044755  0.01591937
  0.04442933  0.03030215  0.02608482  0.01950184  0.0098965   0.01935632
  0.01812559  0.05391857  0.04669416  0.04010862  0.05000804  0.0852477
  0.01174503  0.06055144  0.02753347  0.05912718  0.02102659  0.01987715
  0.04131579  0.09063774  0.04805377  0.00863654  0.03188583  0.05671236
  0.01636876  0.04220454  0.04680528  0.01724536  0.0344421   0.02608888
  0.07385913  0.02240212  0.02796505  0.03303478  0.07463893  0.06505697
  0.03040182  0.03374597  0.0672683   0.09565474  0.00999215  0.01026342
  0.02141064  0.00842172  0.00960965  0.0287006   0.0110711   0.02918897
  0.01148176  0.04588388  0.038739    0.01941033  0.02852736  0.0199741
  0.26009911  0.02528557  0.00710617  0.06415679  0.03577683  0.01776244
  0.01228962  0.05041197  0.01618711  0.03625732  0.06899907  0.01163454
  0.01913806  0.05724014  0.03444924  0.05482111  0.05320981  0.00676128
  0.01502153  0.04530813  0.01923777  0.050842    0.04567977  0.04182978
  0.03531818  0.10196272  0.19338013  0.02898869  0.03341822  0.01115773
  0.02136972  0.01953574  0.03308653  0.06840216  0.15412246  0.02788271
  0.03319615  0.07428519  0.05100705  0.02079517  0.02941451  0.0097115
  0.06903064  0.022664    0.12398539  0.01098862  0.0283022   0.02012403
  0.04376444  0.02482683  0.04184137  0.03632685  0.02209918  0.050536
  0.02378258  0.02129971  0.03650183  0.0606218   0.03830454  0.01021917
  0.04347595  0.05090136  0.03591524  0.03161646  0.04125465  0.01471813
  0.00738509  0.16954498  0.05448378  0.01178634  0.02093552  0.03101519
  0.04375409  0.0371642   0.0589481   0.05737135  0.00929489  0.00967008
  0.0603017   0.05186754  0.05024529  0.0097648   0.04200985  0.04603958
  0.0381435   0.09294534  0.03076704  0.06076034  0.04212036  0.03167096
  0.2227919   0.00694153  0.05327413  0.02542448]
Out[17]:
(4833292, 1)

In [18]:
df_test['pred'] = y_test[:,0]

In [19]:
df_test['pred'].hist()


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5cef1bcb38>

In [39]:
print(df_test.shape[0])
print(df_test["pred"][df_test["pred"]>0.15].count() / df_test.shape[0]) # near 19% but very less than 27 %
print(df_test["pred"][df_test["pred"]>0.12].count() / df_test.shape[0])


4833292
0.1955931899
0.277309957685

In [20]:
df_train['labels'].hist()


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5ce956ebe0>

In [42]:
print(df_train["labels"][df_train["labels"]>0.12].count() / df_train.shape[0]) # 9.78 %


0.0977999705945
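
The cells above pick the threshold by eyeballing the fraction of test rows kept
against the ~9.8% train positive rate and against LB feedback. A more direct route
would be to sweep the cutoff on the 30% split that fit() held out and maximize
row-level F1 (note: Keras's validation_split takes the last 30% of rows, and
row-level F1 is only a proxy for the competition's per-order F1). A minimal sketch
under those assumptions:

from sklearn.metrics import f1_score as sk_f1  # aliased to avoid clashing with the Keras-backend f1_score above

n_val = int(0.3 * X_train.shape[0])
X_val, y_val = X_train[-n_val:], y_train[-n_val:]  # the tail rows fit() validated on
val_pred = model.predict(X_val, batch_size=20000, verbose=0)[:, 0]
for t in [0.10, 0.12, 0.15, 0.18, 0.20]:
    print(t, sk_f1(y_val, (val_pred > t).astype('int32')))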

In [28]:
### build candidates list for test ###

#df_test['pred'] = preds

THRESHOLD = 0.15  # cutoff on predicted reorder probability (picked via the prediction histogram and LB feedback above)

d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        if row.order_id in d:
            d[row.order_id] += ' ' + str(row.product_id)
        else:
            d[row.order_id] = str(row.product_id)

In [29]:
#print('loading orders')
#orders = pd.read_csv(IDIR + 'orders.csv')



# orders with no product above the threshold must still appear, with the literal string 'None'
for order in df_test.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_frm_v4_015.csv', index=False)
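
The itertuples loop above is equivalent to a pandas groupby; for reference, the same
submission frame built vectorized (a sketch, same THRESHOLD and 'None' fill; alt is a
hypothetical name):

picked = df_test.loc[df_test['pred'] > THRESHOLD]
alt = (picked.groupby('order_id')['product_id']
             .apply(lambda ids: ' '.join(str(i) for i in ids)))
# orders with nothing above the threshold still need the literal string 'None'
alt = alt.reindex(df_test['order_id'].unique(), fill_value='None')
alt = alt.rename_axis('order_id').reset_index(name='products')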

In [30]:
print(sub.shape)
print(sub.head())


(75000, 2)
   order_id                                           products
0   2774568  17668 48523 21903 14992 21137 32402 22035 4968...
1    329954     21573 17769 37646 19057 26576 7350 43704 25146
2   1528013           27521 48679 8424 45007 21903 38293 11068
3   1376945  33037 20383 18465 33572 8230 47912 17706 28465...
4   1356845  11520 44422 37646 49683 38164 22935 30489 7076...
