Recent years have witnessed a surge in the number of internet-savvy users. Companies in the financial services domain leverage this huge volume of internet traffic arriving at their interfaces by strategically placing ads/promotions for cross-selling various financial products across a plethora of web pages. The digital analytics unit of Best Cards Company uses cutting-edge data science and machine learning to promote its card products. They believe that a predictive model forecasting whether a session involves a click on the ad/promotion would help them extract the maximum value from the huge clickstream data they have collected. You are hired as a consultant to build an efficient model that predicts whether a user will click on an ad, given the following features:
Train Data
Variable | Definition |
---|---|
session_id | Unique ID for a session |
DateTime | Timestamp |
user_id | Unique ID for user |
product | Product ID |
campaign_id | Unique ID for ad campaign |
webpage_id | Webpage ID at which the ad is displayed |
product_category_1 | Product category 1 (Ordered) |
product_category_2 | Product category 2 |
user_group_id | Customer segmentation ID |
gender | Gender of the user |
age_level | Age level of the user |
user_depth | Interaction level of the user with the web platform (1 - low, 2 - medium, 3 - high) |
city_development_index | Scaled development index of the residence city |
var_1 | Anonymised session feature |
is_click | 0 - no click, 1 - click |
Historical User Logs
Variable | Definition |
---|---|
DateTime | Timestamp |
user_id | Unique ID for the user |
product | Product ID |
action | view/interest (view - viewed the product page, interest - registered interest for the product) |
Evaluation Metric
The evaluation metric for this competition is the AUC-ROC score.
Public and Private Split
The test data is further randomly divided into Public (30%) and Private (70%) subsets.
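Since submissions are scored on AUC-ROC, it is worth being able to reproduce the metric locally. A minimal sketch with toy labels and scores (sklearn's roc_auc_score is the standard implementation; the arrays here are illustrative only):
In [ ]:
from sklearn.metrics import roc_auc_score
# 0 - no click, 1 - click; scores are predicted click probabilities
y_true = [0, 0, 1, 0, 1]
y_score = [0.1, 0.4, 0.35, 0.2, 0.8]
# AUC is the probability that a random click outranks a random non-click:
# here 5 of the 6 (click, non-click) pairs are ordered correctly, so AUC = 5/6 ~= 0.833
print(roc_auc_score(y_true, y_score))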
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
import pandas as pd
import warnings, gc, os, numpy as np
from tqdm import tqdm_notebook
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn  # silence all warnings emitted via warnings.warn
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)
In [ ]:
PATH = '../input'  # competition data directory
In [ ]:
%%time
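# Load the train/test sessions and the historical user logs, parsing timestamps on read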
df_raw = pd.read_csv(f'{PATH}/train.csv', low_memory=False, parse_dates=['DateTime'])
df_test = pd.read_csv(f'{PATH}/test.csv', low_memory=False, parse_dates=['DateTime'])
df_historical = pd.read_csv(f'{PATH}/historical_user_logs.csv', parse_dates=['DateTime'])
In [ ]:
def reduce_mem_usage(df):
""" iterate through all the columns of a dataframe and modify the data type
to reduce memory usage.
"""
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object and str(col_type)[:4] != 'date':
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
##else: df[col] = df[col].astype('category')
end_mem = df.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
df_raw = reduce_mem_usage(df_raw)
df_test = reduce_mem_usage(df_test)
df_historical = reduce_mem_usage(df_historical)
In [ ]:
%%time
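# Per-user and per-(user, product) aggregates from the historical logs: total actions,
# distinct products touched, how often this exact product was seen, and whether the
# user registered interest in it before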
df_raw = df_raw.merge(df_historical.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'count_past_product_interaction'}),\
on=['user_id'], how='left')
df_test = df_test.merge(df_historical.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'count_past_product_interaction'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(df_historical[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
df_test = df_test.merge(df_historical[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(df_historical[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
df_test = df_test.merge(df_historical[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
#######################################################
df_raw = df_raw.merge(df_historical[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'had_interest_before'}),
on=['user_id', 'product'], how='left')
df_test = df_test.merge(df_historical[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'had_interest_before'}),
on=['user_id', 'product'], how='left')
# nunique == 2 means both 'view' and 'interest' actions occurred for this user/product pair
df_raw['seen_and_interested_in_this_product_in_past'] = df_raw['had_interest_before'].apply(lambda x: int(x > 1))
df_test['seen_and_interested_in_this_product_in_past'] = df_test['had_interest_before'].apply(lambda x: int(x > 1))
df_raw = df_raw.merge(df_raw.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_test = df_test.merge(df_test.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_raw.shape, df_test.shape
In [ ]:
#add time features first
import re
def add_datepart(df, fldname, drop=True):
"""
Parameters:
-----------
df: A pandas data frame. df gain several new columns.
fldname: A string that is the name of the date column you wish to expand.
If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
drop: If true then the original date column will be removed.
"""
fld = df[fldname]
fld_dtype = fld.dtype
if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
fld_dtype = np.datetime64
if not np.issubdtype(fld_dtype, np.datetime64):
df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
targ_pre = re.sub('[Dd]ate$', '', fldname)
attr = ['Month', 'Week', 'Day', 'Dayofweek', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',\
'Hour']
for n in attr:
df[targ_pre + '_' +n] = getattr(fld.dt, n.lower())
if drop:
df.drop(fldname, axis=1, inplace=True)
add_datepart(df_raw, 'DateTime', False)
add_datepart(df_test, 'DateTime', False)
add_datepart(df_historical, 'DateTime', False)
df_raw.shape, df_test.shape
In [ ]:
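# Slice the historical logs into weekend, Saturday-only and Sunday-only views so the
# same aggregates can be recomputed on each slice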
all_sat_sun = reduce_mem_usage(df_historical[df_historical['DateTime_Dayofweek'] >=5])
all_sat = reduce_mem_usage(df_historical[df_historical['DateTime_Dayofweek'] == 5])
all_sun = reduce_mem_usage(df_historical[df_historical['DateTime_Dayofweek'] == 6])
del df_historical
In [ ]:
df_raw = reduce_mem_usage(df_raw)
df_test = reduce_mem_usage(df_test)
df_raw.shape, df_test.shape
In [ ]:
%%time
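# Weekend (Saturday + Sunday) versions of the historical aggregates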
df_raw = df_raw.merge(all_sat_sun.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'sat_sun_count_past_product_interaction'}),\
on=['user_id'], how='left')
df_test = df_test.merge(all_sat_sun.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'sat_sun_count_past_product_interaction'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(all_sat_sun[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'sat_sun_count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
df_test = df_test.merge(all_sat_sun[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'sat_sun_count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(all_sat_sun[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'sat_sun_visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
df_test = df_test.merge(all_sat_sun[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'sat_sun_visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
#######################################################
df_raw = df_raw.merge(all_sat_sun[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'sat_sun_had_interest_before'}),
on=['user_id', 'product'], how='left')
df_test = df_test.merge(all_sat_sun[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'sat_sun_had_interest_before'}),
on=['user_id', 'product'], how='left')
df_raw['sat_sun_seen_and_interested_in_this_product_in_past'] = df_raw['sat_sun_had_interest_before'].apply(lambda x: int(x > 1))
df_test['sat_sun_seen_and_interested_in_this_product_in_past'] = df_test['sat_sun_had_interest_before'].apply(lambda x: int(x > 1))
# Note: the historical logs carry no campaign_id/webpage_id, so this is computed from the
# train/test frames themselves and effectively duplicates nunique_campaign_in_total under a new name
df_raw = df_raw.merge(df_raw.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'sat_sun_nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_test = df_test.merge(df_test.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'sat_sun_nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_raw.shape, df_test.shape
In [ ]:
%%time
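# Saturday-only versions of the historical aggregates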
df_raw = df_raw.merge(all_sat.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'sat_count_past_product_interaction'}),\
on=['user_id'], how='left')
df_test = df_test.merge(all_sat.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'sat_count_past_product_interaction'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(all_sat[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'sat_count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
df_test = df_test.merge(all_sat[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'sat_count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(all_sat[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'sat_visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
df_test = df_test.merge(all_sat[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'sat_visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
#######################################################
df_raw = df_raw.merge(all_sat[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'sat_had_interest_before'}),
on=['user_id', 'product'], how='left')
df_test = df_test.merge(all_sat[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'sat_had_interest_before'}),
on=['user_id', 'product'], how='left')
df_raw['sat_seen_and_interested_in_this_product_in_past'] = df_raw['sat_had_interest_before'].apply(lambda x: int(x > 1))
df_test['sat_seen_and_interested_in_this_product_in_past'] = df_test['sat_had_interest_before'].apply(lambda x: int(x > 1))
df_raw = df_raw.merge(df_raw.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'sat_nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_test = df_test.merge(df_test.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'sat_nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_raw.shape, df_test.shape
In [ ]:
%%time
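# Sunday-only versions of the historical aggregates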
df_raw = df_raw.merge(all_sun.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'sun_count_past_product_interaction'}),\
on=['user_id'], how='left')
df_test = df_test.merge(all_sun.groupby(['user_id'])['action'].count().reset_index().rename(columns = {'action':'sun_count_past_product_interaction'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(all_sun[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'sun_count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
df_test = df_test.merge(all_sun[['user_id','product']].groupby(['user_id'])['product'].nunique().reset_index().rename(columns={'product':'sun_count_past_product_interaction_nunique'}),\
on=['user_id'], how='left')
#######################################################
df_raw = df_raw.merge(all_sun[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'sun_visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
df_test = df_test.merge(all_sun[['user_id','product', 'DateTime']].groupby(['user_id', 'product'])['DateTime'].count().reset_index().rename(columns={'DateTime':'sun_visited_this_product_before_count'}),\
on=['user_id', 'product'], how='left')
#######################################################
df_raw = df_raw.merge(all_sun[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'sun_had_interest_before'}),
on=['user_id', 'product'], how='left')
df_test = df_test.merge(all_sun[['user_id','product','action']].groupby(['user_id', 'product'])['action'].nunique()\
.reset_index().rename(columns={'action':'sun_had_interest_before'}),
on=['user_id', 'product'], how='left')
df_raw['sun_seen_and_interested_in_this_product_in_past'] = df_raw['sun_had_interest_before'].apply(lambda x: int(x > 1))
df_test['sun_seen_and_interested_in_this_product_in_past'] = df_test['sun_had_interest_before'].apply(lambda x: int(x > 1))
df_raw = df_raw.merge(df_raw.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'sun_nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_test = df_test.merge(df_test.groupby(['user_id', 'webpage_id'])['campaign_id'].nunique().reset_index().rename(columns={'campaign_id':'sun_nunique_campaign_in_total'}),\
on=['user_id', 'webpage_id'], how='left')
df_raw.shape, df_test.shape
In [ ]:
df_raw = reduce_mem_usage(df_raw)
df_test = reduce_mem_usage(df_test)
In [ ]:
del all_sat_sun, all_sat, all_sun
In [ ]:
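# Coarse one-hot day parts from the hour, plus a cyclic hour encoding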
pi = np.pi
df_raw['night'] = df_raw['DateTime_Hour'].apply(lambda hour: int(hour >= 22 or hour <= 4))
df_raw['morning'] = df_raw['DateTime_Hour'].apply(lambda hour: int(5 <= hour <= 11))
df_raw['midday'] = df_raw['DateTime_Hour'].apply(lambda hour: int(12 <= hour <= 18))
df_raw['evening'] = df_raw['DateTime_Hour'].apply(lambda hour: int(19 <= hour <= 21))
df_test['night'] = df_test['DateTime_Hour'].apply(lambda hour: int(hour >= 22 or hour <= 4))
df_test['morning'] = df_test['DateTime_Hour'].apply(lambda hour: int(5 <= hour <= 11))
df_test['midday'] = df_test['DateTime_Hour'].apply(lambda hour: int(12 <= hour <= 18))
df_test['evening'] = df_test['DateTime_Hour'].apply(lambda hour: int(19 <= hour <= 21))
# cyclic coordinates: map the hour onto the unit circle so hour 23 and hour 0 end up adjacent
df_raw['hour_sin_x'] = df_raw['DateTime_Hour'].apply(lambda ts: np.sin(2 * pi * ts / 24.))
df_raw['hour_cos_x'] = df_raw['DateTime_Hour'].apply(lambda ts: np.cos(2 * pi * ts / 24.))
df_test['hour_sin_x'] = df_test['DateTime_Hour'].apply(lambda ts: np.sin(2 * pi * ts / 24.))
df_test['hour_cos_x'] = df_test['DateTime_Hour'].apply(lambda ts: np.cos(2 * pi * ts / 24.))
df_raw.shape, df_test.shape
In [ ]:
# Explicit sentinels for missing demographics; -1 for missing aggregate features
df_raw['gender'].fillna('NA', inplace=True)
df_test['gender'].fillna('NA', inplace=True)
df_raw['age_level'].fillna(3, inplace=True)
df_test['age_level'].fillna(3, inplace=True)
df_raw['user_depth'].fillna(3, inplace=True)
df_test['user_depth'].fillna(3, inplace=True)
fill_minus_one = ['user_group_id', 'city_development_index']
fill_minus_one += [prefix + col for prefix in ('', 'sat_', 'sun_', 'sat_sun_')
                   for col in ('visited_this_product_before_count',
                               'count_past_product_interaction',
                               'count_past_product_interaction_nunique')]
for col in fill_minus_one:
    df_raw[col].fillna(-1, inplace=True)
    df_test[col].fillna(-1, inplace=True)
df_raw.shape, df_test.shape
In [ ]:
# Cross gender with product (the commented lines cross user_group_id with product instead)
#df_raw['product__user_group_id'] = df_raw['product'] + '_' + df_raw['user_group_id'].astype(str)
df_raw['product__gender'] = df_raw['gender'].astype(str) + '_' + df_raw['product']
#df_test['product__user_group_id'] = df_test['product'] + '_' + df_test['user_group_id'].astype(str)
df_test['product__gender'] = df_test['gender'].astype(str) + '_' + df_test['product']
In [ ]:
# Per-user summary statistics (max/min/sum/count) over selected session features
agg_cols = ['var_1', 'user_depth', 'age_level', 'DateTime_Hour', 'webpage_id',
            'night', 'morning', 'evening', 'DateTime_Dayofweek', 'nunique_campaign_in_total']
df_raw_agg = df_raw[['user_id'] + agg_cols].groupby('user_id').agg(['max', 'min', 'sum', 'count'])
df_test_agg = df_test[['user_id'] + agg_cols].groupby('user_id').agg(['max', 'min', 'sum', 'count'])
# Flatten the MultiIndex columns so each name matches its (variable, stat) pair
df_raw_agg.columns = ['stat_%s_%s' % (var, stat) for var, stat in df_raw_agg.columns]
df_test_agg.columns = ['stat_%s_%s' % (var, stat) for var, stat in df_test_agg.columns]
# Merge the per-user aggregates back onto the session-level frames
df_raw = df_raw.merge(df_raw_agg, left_on='user_id', right_index=True, how='left')
df_test = df_test.merge(df_test_agg, left_on='user_id', right_index=True, how='left')
df_raw.shape, df_test.shape
In [ ]:
df_raw = reduce_mem_usage(df_raw)
df_test = reduce_mem_usage(df_test)
In [ ]:
df_raw['webpage_id'] = df_raw['webpage_id'].astype(object)
df_test['webpage_id'] = df_test['webpage_id'].astype(object)
df_raw['product_category_1']= df_raw['product_category_1'].astype(object)
df_test['product_category_1'] = df_test['product_category_1'].astype(object)
df_raw['had_interest_before_cat'] = df_raw['had_interest_before'].astype(object)
df_test['had_interest_before_cat'] = df_test['had_interest_before'].astype(object)
df_raw['product_category_1_cat']= df_raw['product_category_1'].astype(object)
df_test['product_category_1_cat'] = df_test['product_category_1'].astype(object)
df_raw['age_level']= df_raw['age_level'].astype(object)
df_test['age_level'] = df_test['age_level'].astype(object)
df_raw['user_depth']= df_raw['user_depth'].astype(object)
df_test['user_depth'] = df_test['user_depth'].astype(object)
In [ ]:
# One-hot encode the categorical columns and aggregate them per user: the 'sum' of a
# dummy column is an occurrence count, its 'mean' a normalized count
def add_categorical_counts(df, group_var='user_id'):
    categorical = pd.get_dummies(df.select_dtypes('object'))
    categorical[group_var] = df[group_var]
    categorical_grouped = categorical.groupby(group_var).agg(['sum', 'mean'])
    # Flatten the MultiIndex so each column name matches its (variable, stat) pair
    categorical_grouped.columns = ['%s_%s' % (var, 'count' if stat == 'sum' else 'count_norm')
                                   for var, stat in categorical_grouped.columns]
    return df.merge(categorical_grouped, left_on=group_var, right_index=True, how='left')

df_raw = add_categorical_counts(df_raw)
df_test = add_categorical_counts(df_test)
In [ ]:
df_raw = reduce_mem_usage(df_raw)
df_test = reduce_mem_usage(df_test)
df_raw.shape, df_test.shape
In [ ]:
target = df_raw['is_click'].values
# Dummy columns present in train but absent from test (besides is_click) would flag category mismatches
set(pd.get_dummies(df_raw).columns) - set(pd.get_dummies(df_test).columns)
In [ ]:
# Split features by dtype: object columns are categorical, the rest numeric
categorical_features = [col for col in df_raw.columns if df_raw[col].dtype == object]
numeric_features = [col for col in df_raw.columns if df_raw[col].dtype != object]
len(categorical_features), len(numeric_features)
In [ ]:
from fastai.imports import *
from fastai.structured import *
In [ ]:
# fastai 0.7: train_cats turns string columns into pandas categoricals in place;
# apply_cats maps the test set onto the category codes learned from train
train_cats(df_raw)
apply_cats(df_test, df_raw)
In [ ]:
df_raw.drop(['is_click','session_id', 'DateTime'], axis=1, inplace=True)
df_test.drop(['session_id', 'DateTime'], axis=1, inplace=True)
In [ ]:
X_train_num = df_raw.drop(categorical_features, axis=1) #numeric ones
X_test_num = df_test.drop(categorical_features, axis=1) #numeric ones
X_train_od = df_raw[categorical_features] #cat ones
X_test_od = df_test[categorical_features] #cat ones
In [ ]:
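# One-hot encode the categorical block; apply_cats above aligned the test set's
# categories with train's, so the resulting dummy columns should match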
X_train_od = pd.get_dummies(X_train_od)
X_test_od = pd.get_dummies(X_test_od)
X_train_num.shape, X_train_od.shape
In [ ]:
X_train, X_test = np.hstack([X_train_num, X_train_od]), np.hstack([X_test_num, X_test_od])
In [ ]:
X_train.shape, X_test.shape
In [ ]:
np.bincount(target)
In [ ]:
del df_raw, df_test, X_train_od, X_test_od, X_train_num, X_test_num
In [ ]:
gc.collect()
In [ ]:
params = {}
params['tree_method'] = 'hist'               # histogram-based split finding (fast on large data)
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'                # matches the competition metric
params['eta'] = 0.05                         # 0.03 was also tried
params['subsample'] = .7                     # .85 was tried before
params['silent'] = 0
params['max_depth'] = 9
params['seed'] = 1
params['max_delta_step'] = 5
params['scale_pos_weight'] = 431960 / 31331  # neg/pos class ratio, cf. np.bincount(target) above
params['gamma'] = 20                         # strong regularisation on split gain
params['colsample_bytree'] = 0.7
params['nrounds'] = 1000                     # read by train_kfold below, not by XGBoost itself
params['missing'] = -1                       # the fillna sentinel; note xgb.train ignores this key (missing is a DMatrix argument)
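The hard-coded scale_pos_weight corresponds to the negative/positive class counts; a small sketch that derives the same ratio from the label array built above, rather than embedding the constants:
In [ ]:
# Derive scale_pos_weight from the class counts (cf. the np.bincount(target) check above)
neg, pos = np.bincount(target)
params['scale_pos_weight'] = neg / pos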
In [ ]:
X_train.shape, X_test.shape
In [ ]:
import os
def get_importances(model, features):
    """Write a temporary fmap file and read back per-feature fscores, sorted descending."""
    for feature in features:
        assert '\n' not in feature and '\t' not in feature, "\\n and \\t cannot be in feature names"
    with open('mlcrate_xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    importance = model.get_fscore(fmap='mlcrate_xgb.fmap')
    importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    os.remove('mlcrate_xgb.fmap')
    return importance
def train_kfold(params, x_train, y_train, x_test=None, folds=5, stratify=None, random_state=1337, skip_checks=False, print_imp='final'):
from sklearn.model_selection import KFold, StratifiedKFold # Optional dependencies
from collections import defaultdict
import numpy as np
import xgboost as xgb
assert print_imp in ['every', 'final', None]
# If it's a dataframe, we can take column names, otherwise just use column indices (eg. for printing importances).
if hasattr(x_train, 'columns'):
columns = x_train.columns.values
columns_exists = True
else:
columns = ['f{}'.format(i) for i in np.arange(x_train.shape[1])]
columns_exists = False
x_train = np.asarray(x_train)
y_train = np.array(y_train)
if x_test is not None:
if columns_exists and not skip_checks:
try:
x_test = x_test[columns]
except Exception as e:
print('Could not coerce x_test columns to match x_train columns. Set skip_checks=True to run anyway.')
raise e
x_test = np.asarray(x_test)
d_test = xgb.DMatrix(x_test)
if not skip_checks and x_test is not None:
assert x_train.shape[1] == x_test.shape[1], "x_train and x_test have different numbers of features."
print('Training {} {}XGBoost models on training set {} {}'.format(folds, 'stratified ' if stratify is not None else '',
x_train.shape, 'with test set {}'.format(x_test.shape) if x_test is not None else 'without a test set'))
if stratify is not None:
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
splits = kf.split(x_train, stratify)
else:
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
splits = kf.split(x_train)
p_train = np.zeros_like(y_train, dtype=np.float32)
ps_test = []
models = []
scores = []
imps = defaultdict(int)
fold_i = 0
for train_kf, valid_kf in splits:
print('Running fold {}, {} train samples, {} validation samples'.format(fold_i, len(train_kf), len(valid_kf)))
d_train = xgb.DMatrix(x_train[train_kf], label=y_train[train_kf])
d_valid = xgb.DMatrix(x_train[valid_kf], label=y_train[valid_kf])
# Metrics to print
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
mdl = xgb.train(params, d_train, params.get('nrounds', 100000), watchlist,
early_stopping_rounds=params.get('early_stopping_rounds', 50), verbose_eval=params.get('verbose_eval', 1), feval=params.get('feval'))
scores.append(mdl.best_score)
print('Finished training fold {} - running score {}'.format(fold_i, np.mean(scores)))
# Get importances for this model and add to global importance
imp = get_importances(mdl, columns)
if print_imp == 'every':
print('Fold {} importances:'.format(fold_i), imp)
for f, i in imp:
imps[f] += i
# Get predictions from the model
p_valid = mdl.predict(d_valid, ntree_limit=mdl.best_ntree_limit)
if x_test is not None:
p_test = mdl.predict(d_test, ntree_limit=mdl.best_ntree_limit)
ps_test.append(p_test)
p_train[valid_kf] = p_valid
models.append(mdl)
fold_i += 1
if x_test is not None:
p_test = np.mean(ps_test, axis=0)
print('Finished training {} XGBoost models'.format(folds))
if print_imp in ['every', 'final']:
print('Overall feature importances:', sorted(imps.items(), key=lambda x: x[1], reverse=True))
if x_test is None:
p_test = None
return models, p_train, p_test, imps
In [ ]:
models, p_train, p_test, imps = train_kfold(params, X_train, target, X_test, folds=4, stratify=target, random_state=1337)
In [ ]:
import joblib
In [ ]:
# Persist each fold's model
for i, mdl in enumerate(models):
    joblib.dump(mdl, 'model_{}.joblib.dat'.format(i))
In [ ]:
np.save('preds_hist.npy', p_test)
In [ ]:
import xgboost as xgb
xgb.plot_importance(models[1], max_num_features=20, importance_type='gain')
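Finally, a minimal sketch of writing the averaged test predictions out as a submission, assuming the competition expects (session_id, is_click) pairs; test.csv is re-read for session_id because df_test was deleted above to free memory:
In [ ]:
sub = pd.read_csv(f'{PATH}/test.csv', usecols=['session_id'])
sub['is_click'] = p_test  # fold-averaged click probabilities, in test.csv row order
sub.to_csv('submission.csv', index=False)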