In [1]:
import numpy as np
import pandas as pd
import gc
print('loading prior')
priors = pd.read_csv('./data/order_products__prior.csv')
train = pd.read_csv('./data/train_new.csv')
train_eval = pd.read_csv('./data/train_eval.csv')
print('loading orders')
orders = pd.read_csv('./data/orders.csv')
###
# downcast dtypes to reduce memory usage in the Kaggle kernel
print('optimize memory')
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)
train.reordered = train.reordered.astype(np.int8)
train.add_to_cart_order = train.add_to_cart_order.astype(np.int16)
train_eval.reordered = train_eval.reordered.astype(np.int8)
train_eval.add_to_cart_order = train_eval.add_to_cart_order.astype(np.int16)
priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)
gc.collect()
Out[1]:
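In [ ]:
# Optional sanity check (not part of the original run): confirm the downcasts
# actually shrank the frames, using pandas' memory_usage(deep=True).
for name, df in [('orders', orders), ('priors', priors), ('train', train), ('train_eval', train_eval)]:
    print(name, round(df.memory_usage(deep=True).sum() / 1024**2, 1), 'MB')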
In [2]:
orders.set_index('order_id', inplace = True, drop = False)
train.set_index('order_id', inplace = True, drop = False)
train_eval.set_index('order_id', inplace = True, drop = False)
train = train.join(orders, on = 'order_id', rsuffix = '_')
train.drop('order_id_', inplace = True, axis = 1)
train_eval = train_eval.join(orders, on = 'order_id', rsuffix = '_')
train_eval.drop('order_id_', inplace = True, axis = 1)
train.reset_index(inplace=True, drop=True)
train_eval.reset_index(inplace=True, drop=True)
orders.reset_index(inplace=True, drop=True)
y_df = pd.DataFrame()
# target: basket size of each user's train order
y_df['y'] = train.groupby('user_id')['product_id'].agg('count')
y_df.reset_index(inplace = True, drop = False)
y_df.head()
Out[2]:
In [3]:
y_eval_df = pd.DataFrame()
y_eval_df['y'] = train_eval.groupby('user_id')['product_id'].agg('count')
y_eval_df.reset_index(inplace = True, drop = False)
y_eval_df.head()
Out[3]:
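In [ ]:
# Quick check (sketch): both targets are per-user basket-size counts, so their
# distributions should look similar if the train/eval split is unbiased.
print(y_df['y'].describe())
print(y_eval_df['y'].describe())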
In [4]:
print('Join orders with prior')
orders.set_index('order_id', inplace = True, drop = False)
priors.set_index('order_id', inplace = True, drop = False)
priors = priors.join(orders, on = 'order_id', rsuffix = '_')
priors.drop('order_id_', inplace = True, axis = 1)
priors.reset_index(inplace=True, drop=True)
orders.reset_index(inplace=True, drop=True)
print('Prepare prior for features')
prior_subset = pd.DataFrame()
grouped = priors.groupby(['user_id', 'order_id', 'order_number'])
prior_subset['p_count'] = grouped['product_id'].agg('count')
prior_subset['rorder_sum'] = grouped['reordered'].agg('sum')
prior_subset['rorder_rate'] = prior_subset['rorder_sum'] / prior_subset['p_count']
prior_subset.reset_index(drop = False, inplace=True)
prior_subset.sort_values(by = ['user_id','order_number'], inplace = True, ascending = [True, True], axis = 0)
prior_subset.drop('order_id', inplace = True, axis = 1)
prior_subset.reset_index(inplace = True, drop = True)
In [5]:
prior_subset.head()
Out[5]:
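In [ ]:
# Sketch of per-user aggregates that could be derived from prior_subset. The
# ts_features.csv loaded below was built elsewhere, so the column names here
# (u_mean_basket, u_mean_rorder_rate) are illustrative, not the real feature set.
user_agg = prior_subset.groupby('user_id').agg(
    u_mean_basket=('p_count', 'mean'),
    u_mean_rorder_rate=('rorder_rate', 'mean'))
user_agg.head()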
In [6]:
ts_features = pd.read_csv('./features/ts_features.csv')
ts_features.head()
Out[6]:
In [7]:
# assumes ts_features rows are ordered by ascending user_id, matching prior_subset
ts_features['user_id'] = prior_subset['user_id'].unique()
ts_features.head(10)
Out[7]:
In [8]:
# merge on the user_id column so features align by user id rather than by row position
y_df = y_df.merge(ts_features, on='user_id', how='left')
y_eval_df = y_eval_df.merge(ts_features, on='user_id', how='left')
## Replace na with 0
nan_rows = y_df[y_df.isnull().any(axis=1)]
y_df.fillna(value = 0, inplace = True)
## Replace na with 0
nan_rows = y_eval_df[y_eval_df.isnull().any(axis=1)]
y_eval_df.fillna(value = 0, inplace = True)
y_df.to_csv('./features/regression/y_df.csv', index = False)
y_eval_df.to_csv('./features/regression/y_eval_df.csv', index = False)
In [9]:
y_df.head()
Out[9]:
In [12]:
y_eval_df.head()
Out[12]:
In [64]:
import lightgbm as lgb
lgb_train = lgb.Dataset(y_df.iloc[:, 2:], y_df.iloc[:, 1])
lgb_eval = lgb.Dataset(y_eval_df.iloc[:, 2:], y_eval_df.iloc[:, 1], reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'poisson',
    'metric': {'poisson', 'mse'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 1.0,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'verbose': 0
}
print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
print('Save model...')
# save model to file
gbm.save_model('./models/gbm_reg_model.txt')
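In [ ]:
# Optional (sketch): cross-validate the round count instead of relying on the
# single eval split; lgb.cv returns per-round mean metrics keyed by metric name.
cv_results = lgb.cv(params, lgb_train, num_boost_round=100, nfold=5)
print({k: v[-1] for k, v in cv_results.items()})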
In [65]:
from sklearn.metrics import mean_squared_error
print('Start predicting...')
# predict
y_pred = gbm.predict(y_eval_df.iloc[:, 2:], num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_eval_df.iloc[:, 1], y_pred) ** 0.5)
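In [ ]:
# For an apples-to-apples comparison with the random-forest cells below, which
# score rounded predictions with mean absolute error:
print('MAE (rounded):', np.mean(np.abs(y_eval_df.iloc[:, 1].values - np.round(y_pred, 0))))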
In [48]:
y_pred
Out[48]:
In [30]:
# feature importances
for i, (n, s) in enumerate(zip(gbm.feature_name(), gbm.feature_importance())):
    print(i, ':', n, s)
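In [ ]:
# lightgbm also ships a matplotlib helper for the same numbers (sketch; needs
# matplotlib, which the next cell imports for its own plot).
lgb.plot_importance(gbm, max_num_features=20)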
In [33]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(y_eval_df.iloc[:, 9], y_eval_df.iloc[:, 1])
Out[33]:
In [35]:
y_eval_df.iloc[:, 8]
Out[35]:
In [26]:
from sklearn.ensemble import RandomForestRegressor
# X/Y come from a cell not shown; assumed here to be y_df's target and its
# dummy-encoded features, mirroring the eval cell further below
dummy_cols = ['o1_order_dow', 'o2_order_dow', 'o3_order_dow', 'o1_order_hr', 'o2_order_hr', 'o3_order_hr']
X = pd.get_dummies(y_df.iloc[:, 2:], prefix=dummy_cols, columns=dummy_cols)
Y = y_df.iloc[:, 1]
forest = RandomForestRegressor(n_estimators=200, max_features='sqrt', random_state=100, n_jobs=-1, verbose=1, oob_score=True)
forest.fit(X.values, Y.values)
Out[26]:
In [27]:
# out-of-bag mean absolute error on rounded predictions
oob_p = forest.oob_prediction_
oob_p = np.round(oob_p, 0)
np.mean(np.abs(oob_p - Y.values))
Out[27]:
In [23]:
oob_p
Out[23]:
In [ ]:
from sklearn.ensemble import RandomForestRegressor
oob_scores = []
r2_scores = []
n_trees = range(100, 200, 10)
for ne in n_trees:
    print('Starting', ne)
    forest = RandomForestRegressor(n_estimators=ne, max_features=2, random_state=100, n_jobs=-1, verbose=0, oob_score=True)
    forest.fit(X.values, Y.values)
    oob_p = forest.oob_prediction_
    y_p = np.round(oob_p, 0)
    oob_scores.append(np.mean(np.abs(y_p - Y.values)))
    r2_scores.append(forest.score(X.values, Y.values))
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(n_trees, oob_scores)
In [ ]:
plt.scatter(n_trees, r2_scores)
In [ ]:
X_eval = y_eval_df.iloc[:, 2:]
Y_eval = y_eval_df.iloc[:, 1]
## Dummy variables (same encoding as the training features)
X_eval = pd.get_dummies(X_eval, prefix=dummy_cols, columns=dummy_cols)
# if eval categories differ from train, align columns first: X_eval = X_eval.reindex(columns=X.columns, fill_value=0)
#forest.score(X_eval.values, Y_eval.values)
In [ ]:
# mean absolute error of rounded predictions on the eval set
y_p = forest.predict(X_eval.values)
y_p = np.round(y_p, 0)
np.mean(np.abs(Y_eval.values - y_p))
In [ ]:
Y_eval.values
In [ ]: