In [1]:
# Previous version
# Moving to Keras, treating the 0/1 reorder labels as a sigmoid regression target

# 3 epochs, batch size = 59322, 300 sec/epoch
#loss    : 0.2861 - mse    : 0.0825 acc    : 0.9022 - 
#val_loss: 0.3541 - val_mse: 0.0895 val_acc: 0.902
#LB =  0.1723952

# with batch norm and multiple layers
# 400 - BN - 20 - BN - 10 - 1 (sigmoid)
# optimizer: rmsprop
# kernel init = "normal"


## This version
# removed BN; first hidden layer shrunk 400 -> 40
# 40 - 20 - 10 - 1 (sigmoid)
# batch size = 20k
# 5 epochs
# LB = 0.2922377 with 0.20 threshold
# LB = 0.3097318 with 0.15 threshold *** (best)
# LB = 0.3065503 with 0.12 threshold

In [1]:
#importing packages

import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

### Keras, sklearn and plotting imports
%matplotlib inline
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D  # only Convolution1D is referenced (in commented-out code below)
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


import matplotlib as mpl

#mpl.use('Agg')
import matplotlib.pyplot as plt
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
print(timestr)


Using Theano backend.
20170722-173418

In [2]:
#read df_train.csv, df_test.csv, train_labels.csv
df_train = pd.read_csv("df_train.csv")

In [3]:
df_test = pd.read_csv("df_test.csv")

In [4]:
labels = pd.read_csv("train_labels.csv", header=0)

In [5]:
print(df_train.shape)
print(labels.shape)
print(df_test.shape)


(8474661, 22)
(8474661, 1)
(4833292, 22)

In [6]:
df_train['labels'] = labels
df_train.head()


Out[6]:
Unnamed: 0 order_id product_id user_total_orders user_total_items total_distinct_items user_average_days_between_orders user_average_basket order_hour_of_day days_since_prior_order ... product_orders product_reorders product_reorder_rate UP_orders UP_orders_ratio UP_average_pos_in_cart UP_reorder_rate UP_orders_since_last UP_delta_hour_vs_last labels
0 0 1187899 17122 11 59 18 19.0 5.363636 8 14.0 ... 13880.0 9377.0 0.675576 1 0.090909 6.0 0.090909 6 7 0
1 1 1187899 196 11 59 18 19.0 5.363636 8 14.0 ... 35791.0 27791.0 0.776480 10 0.909091 1.4 0.909091 1 0 1
2 2 1187899 26405 11 59 18 19.0 5.363636 8 14.0 ... 1214.0 536.0 0.441516 2 0.181818 5.0 0.181818 7 1 1
3 3 1187899 46149 11 59 18 19.0 5.363636 8 14.0 ... 8558.0 6953.0 0.812456 3 0.272727 3.0 0.272727 1 0 1
4 4 1187899 14084 11 59 18 19.0 5.363636 8 14.0 ... 15935.0 12923.0 0.810982 1 0.090909 2.0 0.090909 10 0 0

5 rows × 23 columns


In [7]:
df_train.dropna(inplace=True, how='any')  # drops the 47 rows with any missing value
print(df_train.shape)


(8474614, 23)

In [8]:
#selected features

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # excluded: 'dow', 'UP_same_dow_as_last_order'

In [9]:
# Convert to numpy values
X_train = df_train[f_to_use].values.astype('float32')
X_test = df_test[f_to_use].values.astype('float32')
y_train = df_train['labels'].values.astype('int32')
print("Types")
print(X_train.dtype)
print(X_test.dtype)
print(y_train.dtype)

print("\nShapes")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

print("\nHeads")
print(y_train)
print(X_train[0:5,0:5])
print(X_test[0:5,0:5])
#df_columns = df_values.columns
#df_columns


Types
float32
float32
int32

Shapes
(8474614, 19)
(8474614,)
(4833292, 19)

Heads
[0 1 1 ..., 0 0 0]
[[ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]
 [ 11.          59.          18.          19.           5.36363649]]
[[ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]
 [ 13.          88.          33.          12.           6.76923084]]
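
Note: StandardScaler and Pipeline are imported above but never used, so these raw,
unscaled values (order counts, hours, day gaps) go straight into the network. If the
standardization the imports hint at were wanted, a minimal sketch (X_train_s and
X_test_s are illustrative names; this run trains on the unscaled arrays):

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)  # learn mean/std from training rows only
X_test_s = scaler.transform(X_test)        # apply the same statistics to test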

In [10]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# encode class values as integers (a no-op here, since the labels are already 0/1)
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# convert integers to one-hot dummy variables (note: dummy_y is never used below;
# the model trains on the raw 0/1 labels with a single sigmoid output)
dummy_y = np_utils.to_categorical(encoded_Y, num_classes=2).astype('float32')
print(dummy_y)
print(dummy_y.shape)
print(dummy_y.dtype)


[[ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]
(8474614, 2)
float32
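
dummy_y above is the one-hot form for a two-class softmax setup, but it is never used:
the model below keeps a single sigmoid unit trained on the raw 0/1 labels. For
reference, the softmax variant dummy_y was presumably built for would look roughly
like this (a sketch, not what was run; model2 is a hypothetical name):

model2 = Sequential()
model2.add(Dense(40, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'))
model2.add(Dense(2, kernel_initializer='normal', activation='softmax'))  # two-class output
model2.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# model2.fit(X_train, dummy_y, epochs=5, batch_size=20000, validation_split=0.3)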

In [11]:
import keras.backend as K

# Batch-wise metric helpers in the style of the Keras 1.x metrics. They are
# computed per batch and then averaged, so they only approximate epoch-level
# scores, and none of them are actually passed to compile() below.
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def fbeta_score(y_true, y_pred, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    # (On symbolic tensors this Python-level equality check is effectively a
    # no-op; the epsilon terms are what actually guard the divisions.)
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score

def fmeasure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=1)

def f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    # (Python-level check; effectively a no-op on symbolic tensors - the
    # epsilon terms below do the real guarding against division by zero.)
    if c3 == 0:
        return 0

    # How many selected items are relevant?
    precision = c1 / (c2 + K.epsilon())

    # How many relevant items are selected?
    recall = c1 / (c3 + K.epsilon())

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_score

def baseline_model():
    model = Sequential()
    #model.add(Convolution1D(20, 5, border_mode='valid', input_shape=(244, 1)))
    
    model.add(Dense(40, kernel_initializer='normal', input_dim=df_train[f_to_use].shape[1], activation='relu'))
    #model.add(Convolution1D(20, 5, strides=1, padding='valid', dilation_rate=1, activation='relu')) 
    #model.add(Flatten())
    #model.add(BatchNormalization())
    # model.add(Dropout(0.5))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    #model.add(Dropout(0.5))
    #model.add(Dense(30,activation="relu"))
    #model.add(Dropout(0.5))
    #model.add(BatchNormalization())
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    #model.add(Dropout(0.25))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    #sgd=SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['mse'])
        #Adadelta
        #sgd
    return model
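
Since Keras accepts plain (y_true, y_pred) functions as metrics, the helpers above
could be monitored during training by recompiling with them attached. A sketch of
that variant (not what was run here, which tracks only 'mse'; m is a hypothetical
name):

m = baseline_model()  # reuse the architecture above
# attach the batch-wise helpers; they approximate, not equal, epoch-level scores
m.compile(loss='binary_crossentropy', optimizer='rmsprop',
          metrics=['mse', precision, recall, f1_score])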

In [12]:
model=baseline_model()
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 40)                800       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                820       
_________________________________________________________________
dense_3 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
=================================================================
Total params: 1,841
Trainable params: 1,841
Non-trainable params: 0
_________________________________________________________________

In [15]:
print ("^^^INFO: Fit Model^^^")
#X_train = X_train.reshape(X_train.shape[0],244,1)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=1)
]

history = model.fit(X_train, y_train, epochs=5, batch_size=20000, validation_split=0.3, verbose=2, callbacks=callbacks)


^^^INFO: Fit Model^^^
Train on 5932229 samples, validate on 2542385 samples
Epoch 1/5
30s - loss: 0.2933 - mean_squared_error: 0.0834 - val_loss: 0.2842 - val_mean_squared_error: 0.0814
Epoch 2/5
29s - loss: 0.2881 - mean_squared_error: 0.0822 - val_loss: 0.2806 - val_mean_squared_error: 0.0811
Epoch 3/5
28s - loss: 0.2857 - mean_squared_error: 0.0818 - val_loss: 0.2862 - val_mean_squared_error: 0.0830
Epoch 4/5
28s - loss: 0.2840 - mean_squared_error: 0.0814 - val_loss: 0.2785 - val_mean_squared_error: 0.0796
Epoch 5/5
28s - loss: 0.2816 - mean_squared_error: 0.0808 - val_loss: 0.2743 - val_mean_squared_error: 0.0792
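
history is captured above but never inspected; with matplotlib already set up inline,
a quick look at the learning curves could be (a sketch):

plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch'); plt.legend()
plt.show()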

In [16]:
# predicted reorder probabilities for each (order_id, product_id) test row
y_test = model.predict(X_test, batch_size=7000, verbose=1)
print(y_test)
print(y_test.dtype)


4802000/4833292 [============================>.] - ETA: 0s [[ 0.28245082]
 [ 0.10372701]
 [ 0.13353223]
 ..., 
 [ 0.04673484]
 [ 0.01697712]
 [ 0.04662048]]
float32

In [17]:
print(y_test[500:750,0])
y_test.shape


[ 0.01815125  0.14809722  0.07266475  0.06048336  0.15022524  0.03658184
  0.14999463  0.12282379  0.01300962  0.14885272  0.11696173  0.1162293
  0.08024094  0.02492772  0.01902132  0.05341727  0.10813192  0.16867933
  0.01072569  0.11180816  0.09083668  0.003302    0.06391382  0.23262353
  0.09874324  0.30451614  0.03653005  0.00920638  0.02159989  0.05489255
  0.08285429  0.18277843  0.01190487  0.08648319  0.01560901  0.1588596
  0.06526915  0.04969047  0.01596156  0.01823694  0.0461975   0.28664124
  0.020274    0.01095532  0.02043608  0.06157111  0.04053055  0.04823358
  0.08828104  0.09494013  0.01669116  0.14025719  0.01752861  0.11463436
  0.03267328  0.04292507  0.04317291  0.03265386  0.07493969  0.02695579
  0.12785     0.02591073  0.23044063  0.0026286   0.21241364  0.03655732
  0.12784059  0.07622741  0.08019531  0.00263713  0.12214535  0.02768833
  0.05275078  0.00885954  0.01911984  0.05742706  0.01353756  0.03627934
  0.01481265  0.01699708  0.04040465  0.01740555  0.03681495  0.03390338
  0.04443166  0.01902348  0.04830939  0.0164627   0.04604276  0.04271731
  0.03183826  0.07049286  0.17593496  0.08885129  0.05699983  0.01100427
  0.01513718  0.05863804  0.01496919  0.04232144  0.01044755  0.01591937
  0.04442933  0.03030215  0.02608482  0.01950184  0.0098965   0.01935632
  0.01812559  0.05391857  0.04669416  0.04010862  0.05000804  0.0852477
  0.01174503  0.06055144  0.02753347  0.05912718  0.02102659  0.01987715
  0.04131579  0.09063774  0.04805377  0.00863654  0.03188583  0.05671236
  0.01636876  0.04220454  0.04680528  0.01724536  0.0344421   0.02608888
  0.07385913  0.02240212  0.02796505  0.03303478  0.07463893  0.06505697
  0.03040182  0.03374597  0.0672683   0.09565474  0.00999215  0.01026342
  0.02141064  0.00842172  0.00960965  0.0287006   0.0110711   0.02918897
  0.01148176  0.04588388  0.038739    0.01941033  0.02852736  0.0199741
  0.26009911  0.02528557  0.00710617  0.06415679  0.03577683  0.01776244
  0.01228962  0.05041197  0.01618711  0.03625732  0.06899907  0.01163454
  0.01913806  0.05724014  0.03444924  0.05482111  0.05320981  0.00676128
  0.01502153  0.04530813  0.01923777  0.050842    0.04567977  0.04182978
  0.03531818  0.10196272  0.19338013  0.02898869  0.03341822  0.01115773
  0.02136972  0.01953574  0.03308653  0.06840216  0.15412246  0.02788271
  0.03319615  0.07428519  0.05100705  0.02079517  0.02941451  0.0097115
  0.06903064  0.022664    0.12398539  0.01098862  0.0283022   0.02012403
  0.04376444  0.02482683  0.04184137  0.03632685  0.02209918  0.050536
  0.02378258  0.02129971  0.03650183  0.0606218   0.03830454  0.01021917
  0.04347595  0.05090136  0.03591524  0.03161646  0.04125465  0.01471813
  0.00738509  0.16954498  0.05448378  0.01178634  0.02093552  0.03101519
  0.04375409  0.0371642   0.0589481   0.05737135  0.00929489  0.00967008
  0.0603017   0.05186754  0.05024529  0.0097648   0.04200985  0.04603958
  0.0381435   0.09294534  0.03076704  0.06076034  0.04212036  0.03167096
  0.2227919   0.00694153  0.05327413  0.02542448]
Out[17]:
(4833292, 1)

In [18]:
df_test['pred'] = y_test[:,0]

In [19]:
df_test['pred'].hist()


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5cef1bcb38>

In [39]:
print(df_test.shape[0])
print(df_test["pred"][df_test["pred"]>0.15].count() / df_test.shape[0]) # near 19% but very less than 27 %
print(df_test["pred"][df_test["pred"]>0.12].count() / df_test.shape[0])


4833292
0.1955931899
0.277309957685

In [20]:
df_train['labels'].hist()


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5ce956ebe0>

In [42]:
print(df_train["labels"][df_train["labels"]>0.12].count() / df_train.shape[0]) # 9.78 %


0.0977999705945
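
The cells above pick the threshold by eyeballing the fraction of test rows kept
against the ~9.8% train positive rate and against LB feedback. A more direct route
would be to sweep the cutoff on the 30% split that fit() held out and maximize
row-level F1 (note: Keras's validation_split takes the last 30% of rows, and
row-level F1 is only a proxy for the competition's per-order F1). A minimal sketch
under those assumptions:

from sklearn.metrics import f1_score as sk_f1  # aliased to avoid clashing with the Keras-backend f1_score above

n_val = int(0.3 * X_train.shape[0])
X_val, y_val = X_train[-n_val:], y_train[-n_val:]  # the tail rows fit() validated on
val_pred = model.predict(X_val, batch_size=20000, verbose=0)[:, 0]
for t in [0.10, 0.12, 0.15, 0.18, 0.20]:
    print(t, sk_f1(y_val, (val_pred > t).astype('int32')))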

In [28]:
### build candidates list for test ###

#df_test['pred'] = preds

THRESHOLD = 0.15  # cutoff on predicted reorder probability (picked via the prediction histogram and LB feedback above)

d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        if row.order_id in d:
            d[row.order_id] += ' ' + str(row.product_id)
        else:
            d[row.order_id] = str(row.product_id)

In [29]:
#print('loading orders')
#orders = pd.read_csv(IDIR + 'orders.csv')



# orders with no product above the threshold must still appear, with the literal string 'None'
for order in df_test.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_frm_v4_015.csv', index=False)
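
The itertuples loop above is equivalent to a pandas groupby; for reference, the same
submission frame built vectorized (a sketch, same THRESHOLD and 'None' fill; alt is a
hypothetical name):

picked = df_test.loc[df_test['pred'] > THRESHOLD]
alt = (picked.groupby('order_id')['product_id']
             .apply(lambda ids: ' '.join(str(i) for i in ids)))
# orders with nothing above the threshold still need the literal string 'None'
alt = alt.reindex(df_test['order_id'].unique(), fill_value='None')
alt = alt.rename_axis('order_id').reset_index(name='products')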

In [30]:
print(sub.shape)
print(sub.head())


(75000, 2)
   order_id                                           products
0   2774568  17668 48523 21903 14992 21137 32402 22035 4968...
1    329954     21573 17769 37646 19057 26576 7350 43704 25146
2   1528013           27521 48679 8424 45007 21903 38293 11068
3   1376945  33037 20383 18465 33572 8230 47912 17706 28465...
4   1356845  11520 44422 37646 49683 38164 22935 30489 7076...
