In [1]:
# This version
# Moving to Keras as '0' to '1' regression
# 3 epoch, batch size = 59322, 300 sec/epoch
#loss : 0.2861 - mse : 0.0825 acc : 0.9022 -
#val_loss: 0.3541 - val_mse: 0.0895 val_acc: 0.902
#LB = 0.1723952
#with batch norm and multiple layers
#400 - bn - 20 - bn -10 - 1 (sigmoid)
#optimizer: rmsprop
#kernel init = "normal"
## this version
# removed BN and
#40 - 20 -10 - 1 (sigmoid)
# batch size = 20k
# 5 epochs
# LB = 0.2922377 with 0.20 threshold
# LB = 0.3097318 with 0.15 threshold ***
# LB = 0.3065503 with 0.12 threshold
In [1]:
#importing packages
import numpy as np
import pandas as pd
import lightgbm as lgb  # NOTE(review): not used in the visible cells
import gc  # NOTE(review): imported but never called in the visible cells
### importing libraries
%matplotlib inline
# NOTE(review): these keras import paths are Keras 1.x/2.0-era;
# keras.layers.normalization and keras.wrappers.scikit_learn were moved or
# removed in later Keras/TF releases — confirm the pinned keras version
# before re-running this notebook.
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import SGD,RMSprop
from keras.datasets import mnist
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
import time
# Timestamp (YYYYmmdd-HHMMSS) for tagging this run's artifacts
timestr = time.strftime("%Y%m%d-%H%M%S")
print(timestr)
In [2]:
#read df_train.csv, df_test.csv, train_labels.csv
# Load the pre-built feature matrices (relative paths — run from the data dir)
df_train=pd.read_csv("df_train.csv")
In [3]:
df_test=pd.read_csv("df_test.csv")
In [4]:
labels=pd.read_csv("train_labels.csv",header=0)
In [5]:
# Sanity check: train features and labels should have matching row counts
print(df_train.shape)
print(labels.shape)
print(df_test.shape)
In [6]:
# NOTE(review): `labels` is a DataFrame (from read_csv), not a Series;
# assigning a DataFrame to a single column is fragile — safer would be
# labels.iloc[:, 0]. Confirm train_labels.csv has exactly one column.
df_train['labels']=labels
df_train.head()
Out[6]:
In [7]:
# Drop rows with any NaN feature. NOTE(review): inplace mutation — this cell
# is not idempotent against the already-loaded frame on partial re-runs.
df_train.dropna(inplace=True,how='any')
print(df_train.shape)
In [8]:
# Feature columns fed to the network, grouped by aggregation level.
# Previously tried and dropped: 'dow', 'UP_same_dow_as_last_order'.
f_to_use = (
    # user-level aggregates
    ['user_total_orders', 'user_total_items', 'total_distinct_items',
     'user_average_days_between_orders', 'user_average_basket',
     'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio']
    # product-level aggregates
    + ['aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate']
    # user x product interaction features
    + ['UP_orders', 'UP_orders_ratio', 'UP_average_pos_in_cart',
       'UP_reorder_rate', 'UP_orders_since_last', 'UP_delta_hour_vs_last']
)
In [9]:
# Convert to numpy values
# Cast features to float32 and labels to int32 for Keras
X_train = df_train[f_to_use].values.astype('float32')
X_test = df_test[f_to_use].values.astype('float32')
y_train = df_train['labels'].values.astype('int32')
print("Types")
print(X_train.dtype)
print(X_test.dtype)
print(y_train.dtype)
print("\nShapes")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print("\nHeads")
# Peek at the first few rows/columns only — do not dump full arrays
print(y_train)
print(X_train[0:5,0:5])
print(X_test[0:5,0:5])
#df_columns = df_values.columns
#df_columns
In [10]:
# NOTE(review): imports mid-notebook — consider moving to the top import cell.
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y,num_classes=2).astype('float32')
# NOTE(review): dummy_y appears unused in the cells below — model.fit is
# called on y_train directly (single sigmoid output), so this cell may be
# dead code left over from a softmax experiment.
print(dummy_y)
print(dummy_y.shape)
print(dummy_y.dtype)
In [11]:
import keras.backend as K
def precision(y_true, y_pred):
    """Batch-wise precision = TP / predicted positives, via Keras backend ops.

    Values are clipped to [0, 1] and rounded so probabilistic predictions
    count as hard 0/1 decisions; K.epsilon() guards the denominator.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return tp / (predicted + K.epsilon())
def recall(y_true, y_pred):
    """Batch-wise recall = TP / actual positives, via Keras backend ops.

    Mirrors precision() above; K.epsilon() guards the denominator.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    return tp / (actual + K.epsilon())
def fbeta_score(y_true, y_pred, beta=1):
    """Weighted harmonic mean of precision and recall (F-beta).

    beta < 1 favours precision, beta > 1 favours recall; beta == 1 is F1.
    Raises ValueError for negative beta.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # If there are no true positives, fix the F score at 0 like sklearn.
    # NOTE(review): a tensor == 0 comparison only behaves as intended in
    # eager/numpy mode; in symbolic graph mode it is always False — confirm
    # which mode this notebook's Keras runs in.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    beta_sq = beta ** 2
    return (1 + beta_sq) * (p * r) / (beta_sq * p + r + K.epsilon())
def fmeasure(y_true, y_pred):
    """F1 score — the beta=1 special case of fbeta_score()."""
    return fbeta_score(y_true, y_pred, beta=1)
def f1_score(y_true, y_pred):
    """Batch-wise F1 score via Keras backend ops.

    Counts rounded, [0, 1]-clipped true positives, predicted positives and
    actual positives, then combines precision and recall. Returns 0 when
    there are no actual positives, matching sklearn's convention.
    """
    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0
    # BUGFIX: guard every denominator with K.epsilon(). The original divided
    # by c2 (and by precision + recall) with no guard, producing NaN/inf
    # whenever the model predicts no positives. This also makes the function
    # consistent with the precision()/recall() helpers defined above, which
    # already use K.epsilon().
    # How many selected items are relevant?
    precision = c1 / (c2 + K.epsilon())
    # How many relevant items are selected?
    recall = c1 / (c3 + K.epsilon())
    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_score
def baseline_model():
    """Build and compile the 40-20-10-1 MLP used for reorder prediction.

    Input width is the number of selected features (len(f_to_use)); the
    single sigmoid output is the predicted reorder probability. Compiled
    with binary cross-entropy, the rmsprop optimizer, and MSE reported as
    an extra metric.
    """
    model = Sequential()
    model.add(Dense(40, kernel_initializer='normal',
                    input_dim=df_train[f_to_use].shape[1], activation='relu'))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['mse'])
    return model
In [12]:
# Instantiate the network and show the layer/parameter summary
model=baseline_model()
model.summary()
In [15]:
# NOTE(review): execution counts jump 12 -> 15 here — the notebook was not
# run top-to-bottom; re-run with a fresh kernel before sharing.
print ("^^^INFO: Fit Model^^^")
#X_train = X_train.reshape(X_train.shape[0],244,1)
# Stop early if validation loss fails to improve for 3 consecutive epochs
callbacks = [
EarlyStopping(monitor='val_loss', patience=3, verbose=1)
]
# Large batch size (20k); 30% of the training rows held out for validation
history = model.fit(X_train, y_train, epochs=5, batch_size=20000, validation_split=0.3, verbose=2,callbacks=callbacks) #verbose=2
In [16]:
# Predict reorder probabilities for the test set
y_test=model.predict(X_test,batch_size=7000,verbose=1)
print(y_test)
print(y_test.dtype)
In [17]:
# Peek at a slice of predicted probabilities
print(y_test[500:750,0])
y_test.shape
Out[17]:
In [18]:
# Attach predictions to the test frame (single sigmoid output -> column 0)
df_test['pred'] = y_test[:,0]
In [19]:
# Distribution of predicted probabilities — used to pick the threshold below
df_test['pred'].hist()
Out[19]:
In [39]:
print(df_test.shape[0])
# Fraction of test rows passing each candidate threshold
print(df_test["pred"][df_test["pred"]>0.15].count() / df_test.shape[0]) # near 19% but very less than 27 %
print(df_test["pred"][df_test["pred"]>0.12].count() / df_test.shape[0])
In [20]:
# Compare against the positive rate in the training labels
df_train['labels'].hist()
Out[20]:
In [42]:
print(df_train["labels"][df_train["labels"]>0.12].count() / df_train.shape[0]) # 9.78 %
In [28]:
### build candidates list for test ###
#df_test['pred'] = preds
# NOTE(review): "TRESHOLD" is a typo for THRESHOLD; the name is kept for
# compatibility with any cells that reference it.
TRESHOLD = 0.15 # for reorder probability >> examine picks, by histogram of predictions?
# Map each test order_id -> space-separated string of predicted product_ids.
d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        # BUGFIX: the original bare `except:` swallowed every exception
        # (AttributeError, KeyboardInterrupt, ...). Only a missing key should
        # trigger the "first product for this order" branch.
        except KeyError:
            d[row.order_id] = str(row.product_id)
In [29]:
#print('loading orders')
#orders = pd.read_csv(IDIR + 'orders.csv')
# Orders with no product above the threshold get an explicit 'None' entry,
# as required by the submission format.
for order in df_test.order_id:
    d.setdefault(order, 'None')
# Materialise the {order_id: products} mapping as the submission frame
# and write it to disk.
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_frm_v4_015.csv', index=False)
In [30]:
# Final sanity check on the submission: row count and first few rows
print(sub.shape)
print(sub.head())
In [ ]: