In [ ]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.
In [ ]:
%%bash
# Long listing of the input directory with human-readable file sizes.
ls -lh ../input
In [ ]:
%%bash
# Count the lines in every file under the input directory.
wc -l ../input/*
In [ ]:
# Percentage that 40,000,000 is of 184,903,891 -- presumably the share of
# train.csv rows loaded via nrows=40000000; confirm against the `wc -l` output.
40000000*100/184903891
In [ ]:
%%bash
# Show the first lines of the sample submission to see the expected format.
head ../input/sample_submission.csv
In [ ]:
# Load the first 40,000,000 rows of the training data, the full test set and
# the sample submission. Explicit narrow unsigned dtypes are passed to
# read_csv instead of letting pandas pick its default integer width.
path = "../input/"
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
}

print('loading train data...')
train_df = pd.read_csv(
    path + "train.csv",
    nrows=40000000,
    dtype=dtypes,
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
)

print('loading test data...')
test_df = pd.read_csv(
    path + "test.csv",
    dtype=dtypes,
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'],
)

# Built from the same `path` prefix as the other two loads so the input
# location is defined in exactly one place.
sub = pd.read_csv(path + "sample_submission.csv")
In [ ]:
import gc
# Run a garbage-collection pass now that the CSV loads are done.
gc.collect()
In [ ]:
# Inspect the test set. The frame is bound to `test_df`; the name `test` is
# never assigned (its load is commented out), so using it raised NameError.
print(test_df.shape)
test_df.head()
In [ ]:
# Remember how many rows belong to the training set so the combined frame
# can be split back into train and test by position later on.
len_train = len(train_df)
# Stack train and test so the count features are computed over both.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent.
train_df = pd.concat([train_df, test_df])
In [ ]:
# test_df has already been appended to train_df, so release the standalone copy.
del test_df
gc.collect()

print('Extracting new features...')
# Parse the click_time strings once and derive all three calendar features
# from the same parsed series instead of re-parsing the column three times.
click_dt = pd.to_datetime(train_df.click_time)
train_df['hour'] = click_dt.dt.hour.astype('uint8')
train_df['day'] = click_dt.dt.day.astype('uint8')
train_df['wday'] = click_dt.dt.dayofweek.astype('uint8')
del click_dt
gc.collect()
In [ ]:
print('grouping by ip-day-hour combination...')
# Number of rows per (ip, day, hour); 'channel' only serves as the column
# being counted and is renamed to 'qty'.
gp = (
    train_df[['ip', 'day', 'hour', 'channel']]
    .groupby(by=['ip', 'day', 'hour'])[['channel']]
    .count()
    .reset_index()
    .rename(index=str, columns={'channel': 'qty'})
)
# Attach the count to every row that shares the same key.
train_df = pd.merge(train_df, gp, on=['ip', 'day', 'hour'], how='left')
del gp
gc.collect()
In [ ]:
print('group by ip-app combination...')
# Number of rows per (ip, app); 'channel' only serves as the column being
# counted and is renamed to 'ip_app_count'.
gp = (
    train_df[['ip', 'app', 'channel']]
    .groupby(by=['ip', 'app'])[['channel']]
    .count()
    .reset_index()
    .rename(index=str, columns={'channel': 'ip_app_count'})
)
# Attach the count to every row that shares the same key.
train_df = pd.merge(train_df, gp, on=['ip', 'app'], how='left')
del gp
gc.collect()
In [ ]:
print('group by ip-app-os combination...')
# Number of rows per (ip, app, os); 'channel' only serves as the column
# being counted and is renamed to 'ip_app_os_count'.
gp = (
    train_df[['ip', 'app', 'os', 'channel']]
    .groupby(by=['ip', 'app', 'os'])[['channel']]
    .count()
    .reset_index()
    .rename(index=str, columns={'channel': 'ip_app_os_count'})
)
print("merging...")
# Attach the count to every row that shares the same key.
train_df = pd.merge(train_df, gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()
In [ ]:
print("vars and data type: ")
train_df.info()
# Shrink the count columns from the default integer width. A group count can
# be as large as the number of rows in the frame (tens of millions here), and
# casting to uint16 silently wraps any value above 65535, so uint32 is used.
train_df['qty'] = train_df['qty'].astype('uint32')
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint32')
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint32')
In [ ]:
# Split the combined frame back apart by position: the first len_train rows
# are the training data, everything after them is the test data.
test_df = train_df.iloc[len_train:]
train_df = train_df.iloc[:len_train]
In [ ]:
# Peek at the first rows of the training frame (still in file order here).
train_df.head()
In [ ]:
# Shuffle all training rows and renumber the index. A fixed random_state makes
# the shuffle -- and therefore the validation split taken from the tail of the
# frame -- reproducible from run to run.
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)
In [ ]:
# Peek at the first rows again, now that the frame has been shuffled.
train_df.head()
In [ ]:
# Hold out the last 3,000,000 rows of the training frame for validation and
# keep the rows before them for fitting.
val_df = train_df.iloc[len_train - 3000000:len_train]
train_df = train_df.iloc[:len_train - 3000000]
In [ ]:
"""
Adding improvements inspired from:
Ravi Teja's fe script: https://www.kaggle.com/rteja1113/lightgbm-with-count-features?scriptVersionId=2815638
"""
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                      feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10,
                      categorical_features=None):
    """Train a LightGBM booster on a single train/validation split (no CV).

    Args:
        params: dict of LightGBM parameters; merged over the defaults below,
            so any key given here wins.
        dtrain: training data; must support ``dtrain[predictors].values`` and
            ``dtrain[target].values`` (e.g. a pandas DataFrame).
        dvalid: validation data with the same columns as ``dtrain``.
        predictors: list of feature column names.
        target: name of the label column.
        objective: LightGBM objective name.
        metrics: evaluation metric name. Must be a single string: it is used
            as the key into the recorded results and concatenated into the
            report line.
        feval: optional custom evaluation function passed to ``lgb.train``.
        early_stopping_rounds: passed through to ``lgb.train``.
        num_boost_round: maximum number of boosting rounds.
        verbose_eval: evaluation logging setting passed through to
            ``lgb.train``.
        categorical_features: feature names to treat as categorical, or None.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true', #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }
    # Caller-supplied settings override the defaults above.
    lgb_params.update(params)

    print("preparing validation datasets")
    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    # Filled in by lgb.train with the per-round metric values for each valid set.
    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     # Honour the caller's setting (was hard-coded to 10, which
                     # silently ignored the verbose_eval argument).
                     verbose_eval=verbose_eval,
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    # best_iteration is 1-based, hence the -1. If it is 0 the index becomes -1
    # and the last recorded round is reported.
    print(metrics+":", evals_results['valid'][metrics][n_estimators-1])

    return bst1
In [ ]:
# Show the first rows of the test frame. The original line used the undefined
# name `test` and the non-existent method `headhead`.
test_df.head()
In [ ]:
# Report the sizes of the three data partitions.
print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))

# Label column, categorical feature columns, and the full feature list
# (the categorical columns followed by the three count features).
target = 'is_attributed'
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day', 'wday']
predictors = categorical + ['qty', 'ip_app_count', 'ip_app_os_count']

# Start the submission frame with the integer click ids of the test rows.
sub = pd.DataFrame({'click_id': test_df['click_id'].astype('int')})
In [ ]:
gc.collect()

print("Training...")
start_time = time.time()

# Overrides for the defaults defined inside lgb_modelfit_nocv.
params = dict(
    learning_rate=0.15,
    #is_unbalance='true',  # replaced with scale_pos_weight argument
    num_leaves=15,  # 2^max_depth - 1
    max_depth=4,  # -1 means no limit
    min_child_samples=100,  # Minimum number of data need in a child(min_data_in_leaf)
    max_bin=100,  # Number of bucketed bin for feature values
    subsample=0.7,  # Subsample ratio of the training instance.
    subsample_freq=1,  # frequence of subsample, <=0 means no enable
    colsample_bytree=0.7,  # Subsample ratio of columns when constructing each tree.
    min_child_weight=0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    scale_pos_weight=99,  # because training data is extremely unbalanced
)

# Fit on the training partition and evaluate AUC on the held-out validation rows.
bst = lgb_modelfit_nocv(
    params=params,
    dtrain=train_df,
    dvalid=val_df,
    predictors=predictors,
    target=target,
    objective='binary',
    metrics='auc',
    early_stopping_rounds=50,
    verbose_eval=True,
    num_boost_round=350,
    categorical_features=categorical,
)

print('[{}]: model training time'.format(time.time() - start_time))
In [ ]:
print("Predicting...")
# Score the test rows on the same feature columns the model was trained with.
sub['is_attributed'] = bst.predict(test_df.loc[:, predictors])

print("writing...")
# Write the submission file without the index column.
sub.to_csv(path_or_buf='sub_lgb_balanced99.csv', index=False)
print("done...")
In [ ]: